diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index 873aea2..fa78e62 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -23,14 +23,14 @@ jobs: - uses: actions/checkout@v4 - uses: actions-rust-lang/setup-rust-toolchain@v1 - run: rustup target add aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu i686-unknown-linux-gnu powerpc-unknown-linux-gnu riscv64gc-unknown-linux-gnu - - run: RUSTFLAGS="-C target-feature=+neon,-fp16" cargo build --target aarch64-unknown-linux-gnu --features half - - run: RUSTFLAGS="-C target-feature=+neon,+fp16" cargo build --target aarch64-unknown-linux-gnu --features half + - run: RUSTFLAGS="-C target-feature=+neon,-fp16" cargo build --target aarch64-unknown-linux-gnu + - run: RUSTFLAGS="-C target-feature=+neon,+fp16" cargo +nightly build --target aarch64-unknown-linux-gnu --features nightly_f16 - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu - run: cargo build --target powerpc-unknown-linux-gnu - run: cargo build --target riscv64gc-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target x86_64-unknown-linux-gnu - - run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu - - run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu + - run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo +nightly build --features nightly_f16 --target x86_64-unknown-linux-gnu + - run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo +nightly build --features nightly_f16 --target x86_64-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu clippy: @@ -42,7 +42,18 @@ jobs: steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - - run: cargo clippy + - run: cargo clippy -- -D warnings + + clippy_nightly: + name: Clippy Nightly + strategy: + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - run: cargo clippy --all-features -- -D warnings tests: name: Testing @@ -71,6 +82,18 @@ jobs: - run: cargo fuzz run resize_plane -- -max_total_time=30 - run: cargo fuzz run colorspaces -- -max_total_time=10 + fuzz_rgba_8bit_no_rdm: + name: Fuzzing 8bit wo RDM + runs-on: macos-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + - run: cargo install cargo-fuzz + - run: cargo fuzz run resize_rgba --no-default-features -- -max_total_time=30 + - run: cargo fuzz run resize_rgb --no-default-features -- -max_total_time=30 + - run: cargo fuzz run resize_cbcr8 --no-default-features -- -max_total_time=30 + - run: cargo fuzz run resize_plane --no-default-features -- -max_total_time=30 + fuzz_rgba_high_bit: name: Fuzzing High bit-depth strategy: @@ -82,6 +105,7 @@ jobs: - uses: dtolnay/rust-toolchain@nightly - run: cargo install cargo-fuzz - run: cargo fuzz run resize_rgba_u16 -- -max_total_time=30 + - run: cargo fuzz run resize_rgba_f16 -- -max_total_time=30 - run: cargo fuzz run resize_rgb_u16 -- -max_total_time=30 - run: cargo fuzz run resize_cbcr16 -- -max_total_time=30 - run: cargo fuzz run resize_plane_u16 -- -max_total_time=30 diff --git a/Cargo.lock b/Cargo.lock index 29d65f6..192a101 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -469,7 +469,6 @@ checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ "cfg-if 
1.0.0", "crunchy", - "num-traits", ] [[package]] @@ -632,12 +631,6 @@ dependencies = [ "cc", ] -[[package]] -name = "libm" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" - [[package]] name = "litrs" version = "0.4.1" @@ -767,7 +760,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -790,10 +782,9 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale" -version = "0.4.2" +version = "0.5.0" dependencies = [ "colorutils-rs", - "half", "libc", "num-traits", "rayon", diff --git a/Cargo.toml b/Cargo.toml index 8e76123..4c1bf4b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app", "wasm", "fuzz", "app/accelerate"], exclude = ["p [package] name = "pic-scale" -version = "0.4.2" +version = "0.5.0" edition = "2021" description = "High performance image scaling" readme = "README.md" @@ -18,7 +18,6 @@ rust-version = "1.82.0" [dependencies] colorutils-rs = {version = "0.7.4", optional = true} -half = { version = "2.4.1", optional = true, features = ["alloc", "std", "num-traits"] } num-traits = { version = "0.2.19", features = ["std"] } rayon = "1.10.0" @@ -26,7 +25,9 @@ rayon = "1.10.0" libc = "0.2.158" [features] -default = ["colorspaces"] +default = ["colorspaces", "rdm"] colorspaces = ["dep:colorutils-rs"] nightly_avx512 = [] -nightly_avx512fp16 = ["nightly_avx512"] \ No newline at end of file +nightly_avx512fp16 = ["nightly_avx512"] +nightly_f16 = [] +rdm = [] \ No newline at end of file diff --git a/app/Cargo.toml b/app/Cargo.toml index ba9a88c..182eb4b 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -5,8 +5,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } -#image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half"], default-features = true } +pic-scale = { path = "..", features = ["nightly_f16"], default-features = false } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } accelerate = {path = "accelerate/"} @@ -16,7 +15,7 @@ libc = "0.2.169" criterion = "0.5.1" #image = { version = "0.25.2", features = ["default"] } fast_image_resize = { version = "5.0.0", features = [] } -pic-scale = { path = "..", features = ["half"] } +pic-scale = { path = "..", features = ["nightly_f16", "rdm"], default-features = false } [[bench]] name = "resize_rgb" diff --git a/app/accelerate/src/lib.rs b/app/accelerate/src/lib.rs index e7a46b5..138a9ef 100644 --- a/app/accelerate/src/lib.rs +++ b/app/accelerate/src/lib.rs @@ -42,6 +42,15 @@ mod accelerate { flags: libc::c_uint, ) -> libc::c_int; + #[allow(non_camel_case_types)] + #[allow(non_snake_case)] + pub fn vImageScale_XRGB2101010W( + src: *const vImage_Buffer, + dest: *mut vImage_Buffer, + temp_buffer: *mut libc::c_void, + flags: libc::c_uint, + ) -> libc::c_int; + #[allow(non_camel_case_types)] #[allow(non_snake_case)] pub fn vImageScale_ARGBFFFF( diff --git a/app/benches/resize_rgb/main.rs b/app/benches/resize_rgb/main.rs index 7853693..a7ca580 100644 --- a/app/benches/resize_rgb/main.rs +++ b/app/benches/resize_rgb/main.rs @@ -5,7 +5,7 @@ use fast_image_resize::{CpuExtensions, PixelType, ResizeAlg, ResizeOptions, 
Resi use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ScalingU16, - ThreadingPolicy, + ThreadingPolicy, WorkloadStrategy, }; pub fn criterion_benchmark(c: &mut Criterion) { @@ -25,6 +25,22 @@ pub fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferSpeed); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + scaler.resize_rgb(&store, &mut target).unwrap(); + }) + }); + + c.bench_function("Pic scale RGB: Lanczos 3/Quality", |b| { + let copied: Vec = Vec::from(src_bytes); + let store = + ImageStore::::from_slice(&copied, dimensions.0 as usize, dimensions.1 as usize) + .unwrap(); + b.iter(|| { + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferQuality); let mut target = ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); scaler.resize_rgb(&store, &mut target).unwrap(); diff --git a/app/benches/resize_rgba/main.rs b/app/benches/resize_rgba/main.rs index 69fbed7..cc681b1 100644 --- a/app/benches/resize_rgba/main.rs +++ b/app/benches/resize_rgba/main.rs @@ -1,11 +1,12 @@ +#![feature(f16)] use criterion::{criterion_group, criterion_main, Criterion}; use fast_image_resize::images::Image; use fast_image_resize::FilterType::Lanczos3; use fast_image_resize::{CpuExtensions, PixelType, ResizeAlg, ResizeOptions, Resizer}; use image::{GenericImageView, ImageReader}; use pic_scale::{ - ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ScalingU16, - ThreadingPolicy, + Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, + ScalingF32, ScalingU16, ThreadingPolicy, WorkloadStrategy, }; pub fn criterion_benchmark(c: &mut Criterion) { @@ -16,7 +17,7 @@ pub fn criterion_benchmark(c: &mut Criterion) { let dimensions = img.dimensions(); let src_bytes = img.as_bytes(); - c.bench_function("Pic scale RGBA with alpha: Lanczos 3", |b| { + /*c.bench_function("Pic scale RGBA with alpha: Lanczos 3", |b| { let copied: Vec = Vec::from(src_bytes); b.iter(|| { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); @@ -98,8 +99,26 @@ pub fn criterion_benchmark(c: &mut Criterion) { }) }); + c.bench_function("Pic scale RGBA without alpha: Lanczos 3/Quality", |b| { + let copied: Vec = Vec::from(src_bytes); + b.iter(|| { + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferQuality); + let store = ImageStore::::from_slice( + &copied, + dimensions.0 as usize, + dimensions.1 as usize, + ) + .unwrap(); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + _ = scaler.resize_rgba(&store, &mut target, false); + }) + }); + #[cfg(any(target_os = "macos", target_os = "ios"))] - c.bench_function("Apple Accelerate: Lanczos 3", |b| { + c.bench_function("Apple Accelerate RGBA: Lanczos 3", |b| { let copied: Vec = Vec::from(src_bytes); use accelerate::{kvImageDoNotTile, vImageScale_ARGB8888, vImage_Buffer}; b.iter(|| { @@ -331,6 +350,152 @@ pub fn criterion_benchmark(c: &mut Criterion) { } }) }); + + use core::f16; + + c.bench_function("Pic scale RGBA F16 
without alpha: Lanczos 3/Quality", |b| { + let copied: Vec = vec![0.; src_bytes.len()]; + b.iter(|| { + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferQuality); + let store = ImageStore::::from_slice( + &copied, + dimensions.0 as usize, + dimensions.1 as usize, + ) + .unwrap(); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + scaler.resize_rgba_f16(&store, &mut target, false).unwrap(); + }) + }); + + #[cfg(any(target_os = "macos", target_os = "ios"))] + c.bench_function("Apple Accelerate RGBAF16: Lanczos 3", |b| { + let copied: Vec = vec![0.; src_bytes.len()]; + use accelerate::{kvImageDoNotTile, vImageScale_ARGB16F, vImage_Buffer}; + b.iter(|| { + let mut target = ImageStoreMut::::alloc( + dimensions.0 as usize / 4, + dimensions.1 as usize / 4, + ); + + let src_buffer = vImage_Buffer { + data: copied.as_ptr() as *mut libc::c_void, + width: dimensions.0 as usize, + height: dimensions.1 as usize, + row_bytes: dimensions.0 as usize * 4 * std::mem::size_of::(), + }; + + let target_stride = target.stride(); + let target_ptr = target.buffer.borrow_mut().as_mut_ptr() as *mut libc::c_void; + + let mut dst_buffer = vImage_Buffer { + data: target_ptr, + width: target.width, + height: target.height, + row_bytes: target_stride * std::mem::size_of::(), + }; + + let result = unsafe { + vImageScale_ARGB16F( + &src_buffer, + &mut dst_buffer, + std::ptr::null_mut(), + kvImageDoNotTile, + ) + }; + if result != 0 { + panic!("Can't resize by accelerate"); + } + }) + });*/ + + c.bench_function("Pic scale RGBA1010102(N0: Lanczos 3/Speed", |b| { + let copied: Vec = Vec::from(src_bytes); + b.iter(|| { + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferSpeed); + + let mut dst_data_ar30 = + vec![1u8; (dimensions.0 as usize / 4) * (dimensions.1 as usize / 4) * 4]; + scaler + .resize_ar30( + &copied, + dimensions.0 as usize * 4, + ImageSize::new(dimensions.0 as usize, dimensions.1 as usize), + &mut dst_data_ar30, + (dimensions.0 as usize / 4) * 4, + ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4), + Ar30ByteOrder::Network, + ) + .unwrap(); + }) + }); + + c.bench_function("Pic scale RGBA1010102(N): Lanczos 3/Quality", |b| { + let copied: Vec = Vec::from(src_bytes); + b.iter(|| { + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferQuality); + + let mut dst_data_ar30 = + vec![1u8; (dimensions.0 as usize / 4) * (dimensions.1 as usize / 4) * 4]; + scaler + .resize_ar30( + &copied, + dimensions.0 as usize * 4, + ImageSize::new(dimensions.0 as usize, dimensions.1 as usize), + &mut dst_data_ar30, + (dimensions.0 as usize / 4) * 4, + ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4), + Ar30ByteOrder::Network, + ) + .unwrap(); + }) + }); + + #[cfg(any(target_os = "macos", target_os = "ios"))] + c.bench_function("Apple Accelerate RGBX1010102(N): Lanczos 3", |b| { + let copied: Vec = Vec::from(src_bytes); + use accelerate::{kvImageDoNotTile, vImageScale_XRGB2101010W, vImage_Buffer}; + b.iter(|| { + let mut target = + ImageStoreMut::::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + + let src_buffer = vImage_Buffer { + data: copied.as_ptr() as *mut libc::c_void, + 
width: dimensions.0 as usize, + height: dimensions.1 as usize, + row_bytes: dimensions.0 as usize * 4, + }; + + let target_stride = target.stride(); + let target_ptr = target.buffer.borrow_mut().as_mut_ptr() as *mut libc::c_void; + + let mut dst_buffer = vImage_Buffer { + data: target_ptr, + width: target.width, + height: target.height, + row_bytes: target_stride, + }; + + let result = unsafe { + vImageScale_XRGB2101010W( + &src_buffer, + &mut dst_buffer, + std::ptr::null_mut(), + kvImageDoNotTile, + ) + }; + if result != 0 { + panic!("Can't resize by accelerate"); + } + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/app/src/main.rs b/app/src/main.rs index de82ee5..03d7565 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -1,17 +1,20 @@ #![feature(avx512_target_feature)] +#![feature(f16)] mod merge; mod split; use std::time::Instant; +use core::f16; use fast_image_resize::images::Image; use fast_image_resize::{ CpuExtensions, FilterType, IntoImageView, PixelType, ResizeAlg, ResizeOptions, Resizer, }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - CbCr8ImageStore, CbCr8ImageStoreMut, ImageSize, ImageStore, ImageStoreMut, ImageStoreScaling, - ResamplingFunction, Scaler, Scaling, ScalingU16, ThreadingPolicy, + Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, ImageStoreScaling, ResamplingFunction, + RgbF16ImageStore, RgbF16ImageStoreMut, Rgba16ImageStoreMut, RgbaF16ImageStore, + RgbaF16ImageStoreMut, Scaler, Scaling, ScalingU16, ThreadingPolicy, WorkloadStrategy, }; fn resize_plane( @@ -48,19 +51,44 @@ fn main() { .decode() .unwrap(); let dimensions = img.dimensions(); - let transient = img.to_luma_alpha8(); + let transient = img.to_rgb8(); let mut bytes = Vec::from(transient.as_bytes()); let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferQuality); // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); // let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); + let src_width = 289; + let src_height = 257; + let dst_width = 257; + let dst_height = 511; + let src_data_ar30 = vec![1u8; src_width * src_height * 4]; + let mut dst_data_ar30 = vec![1u8; dst_width * dst_height * 4]; + scaler + .resize_ar30( + &src_data_ar30, + src_width * 4, + ImageSize::new(src_width, src_height), + &mut dst_data_ar30, + dst_width * 4, + ImageSize::new(dst_width, dst_height), + Ar30ByteOrder::Host, + ) + .unwrap(); + + let rgb_feature16 = transient + .iter() + .map(|&x| (x as f32 / 255f32) as f16) + .collect::>(); + // let store = - CbCr8ImageStore::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize).unwrap(); + RgbF16ImageStore::from_slice(&rgb_feature16, dimensions.0 as usize, dimensions.1 as usize) + .unwrap(); let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); // let mut resized_ar = vec![0u32; dst_size.width * dst_size.height]; @@ -74,7 +102,7 @@ fn main() { // ) // .unwrap(); - let mut dst_store = CbCr8ImageStoreMut::alloc_with_depth( + let mut dst_store = RgbF16ImageStoreMut::alloc_with_depth( dimensions.0 as usize / 4, dimensions.1 as usize / 4, 10, @@ -82,7 +110,7 @@ fn main() { // for i in 0..25 { let start_time = Instant::now(); - scaler.resize_cbcr8(&store, &mut dst_store).unwrap(); + scaler.resize_rgb_f16(&store, &mut dst_store).unwrap(); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -130,7 +158,11 @@ fn main() 
{ // .map(|&x| (x >> 2) as u8) // .collect(); - let dst = dst_store.as_bytes(); + let dst = dst_store + .as_bytes() + .iter() + .map(|&x| (x as f32 * 255.).round() as u8) + .collect::>(); // let dst = resized; // image::save_buffer( // "converted.png", @@ -152,11 +184,11 @@ fn main() { .unwrap(); } else { image::save_buffer( - "converted.webp", + "converted.png", &dst, dst_store.width as u32, dst_store.height as u32, - image::ColorType::La8, + image::ColorType::Rgb8, ) .unwrap(); } diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 973b015..1b25765 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -9,10 +9,11 @@ cargo-fuzz = true [dependencies] libfuzzer-sys = "0.4" -pic-scale = { path = "../" } +pic-scale = { path = "../", features = ["nightly_f16"], default-features = true } [features] nightly_avx512 = ["pic-scale/nightly_avx512"] +rdm = [] [[bin]] name = "resize_rgba" @@ -49,6 +50,13 @@ test = false doc = false bench = false +[[bin]] +name = "resize_rgba_f16" +path = "resize_rgba_f16/resize_rgba_f16.rs" +test = false +doc = false +bench = false + [[bin]] name = "resize_cbcr16" path = "resize_cbcr16/resize_cbcr16.rs" diff --git a/fuzz/resize_rgba_f16/resize_rgba_f16.rs b/fuzz/resize_rgba_f16/resize_rgba_f16.rs new file mode 100644 index 0000000..bcb24ad --- /dev/null +++ b/fuzz/resize_rgba_f16/resize_rgba_f16.rs @@ -0,0 +1,82 @@ +#![feature(f16)] +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#![no_main] + +use core::f16; +use libfuzzer_sys::fuzz_target; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, WorkloadStrategy}; + +fuzz_target!(|data: (u16, u16, u16, u16, bool, bool)| { + let strategy = if data.5 { + WorkloadStrategy::PreferQuality + } else { + WorkloadStrategy::PreferSpeed + }; + resize_rgba( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Lanczos3, + data.4, + strategy, + ) +}); + +fn resize_rgba( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, + premultiply_alpha: bool, + workload_strategy: WorkloadStrategy, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let store = ImageStore::::alloc(src_width, src_height); + let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); + + let mut scaler = Scaler::new(sampler); + scaler.set_workload_strategy(workload_strategy); + scaler + .resize_rgba_f16(&store, &mut target, premultiply_alpha) + .unwrap(); +} diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs index 571290d..e916f24 100644 --- a/fuzz/resize_rgba_u16/resize_rgba_u16.rs +++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs @@ -32,15 +32,23 @@ use libfuzzer_sys::fuzz_target; use pic_scale::{ Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingU16, + WorkloadStrategy, }; -fuzz_target!(|data: (u16, u16, u16, u16)| { +fuzz_target!(|data: (u16, u16, u16, u16, bool, bool)| { + let strategy = if data.5 { + WorkloadStrategy::PreferQuality + } else { + WorkloadStrategy::PreferSpeed + }; resize_rgba( data.0 as usize, data.1 as usize, data.2 as usize, data.3 as usize, ResamplingFunction::Lanczos3, + data.4, + strategy, ) }); @@ -50,6 +58,8 @@ fn resize_rgba( dst_width: usize, dst_height: usize, sampler: ResamplingFunction, + premultiply_alpha: bool, + workload_strategy: WorkloadStrategy, ) { if src_width == 0 || src_width > 2000 @@ -66,25 +76,27 @@ fn resize_rgba( let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); - let scaler = Scaler::new(sampler); - scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); - let store = ImageStore::::alloc(src_width, src_height); - scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); + let mut scaler = Scaler::new(sampler); + scaler.set_workload_strategy(workload_strategy); + scaler + .resize_rgba_u16(&store, &mut target, premultiply_alpha) + .unwrap(); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); let store = ImageStore::::alloc(src_width, src_height); - scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); - - let store = ImageStore::::alloc(src_width, src_height); - scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); + scaler + .resize_rgba_u16(&store, &mut target, premultiply_alpha) + .unwrap(); - let src_data_ar30 = vec![1u32; src_width * src_height]; - let mut dst_data_ar30 = vec![1u32; dst_width * dst_height]; + let src_data_ar30 = vec![1u8; src_width * src_height * 4]; + let mut dst_data_ar30 = vec![1u8; dst_width * dst_height * 4]; _ = scaler.resize_ar30( &src_data_ar30, + src_width * 4, ImageSize::new(src_width, src_height), &mut dst_data_ar30, + dst_height * 4, ImageSize::new(dst_width, dst_height), Ar30ByteOrder::Host, 
); diff --git a/picscale/Cargo.lock b/picscale/Cargo.lock index 779b028..0215a43 100644 --- a/picscale/Cargo.lock +++ b/picscale/Cargo.lock @@ -65,9 +65,9 @@ checksum = "1be3f42a67d6d345ecd59f675f3f012d6974981560836e938c22b424b85ce1be" [[package]] name = "cbindgen" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fce8dd7fcfcbf3a0a87d8f515194b49d6135acab73e18bd380d1d93bb1a15eb" +checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff" dependencies = [ "clap", "heck", @@ -220,7 +220,6 @@ checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ "cfg-if", "crunchy", - "num-traits", ] [[package]] @@ -263,12 +262,6 @@ version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" -[[package]] -name = "libm" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" - [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -294,7 +287,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -305,10 +297,9 @@ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "pic-scale" -version = "0.4.2" +version = "0.5.0" dependencies = [ "colorutils-rs", - "half", "libc", "num-traits", "rayon", @@ -319,7 +310,6 @@ name = "picscale" version = "0.1.0" dependencies = [ "cbindgen", - "half", "num-traits", "pic-scale", ] diff --git a/picscale/Cargo.toml b/picscale/Cargo.toml index 215c206..f6875ee 100644 --- a/picscale/Cargo.toml +++ b/picscale/Cargo.toml @@ -5,8 +5,7 @@ edition = "2024" build = "build.rs" [dependencies] -pic-scale = { path = "../", features = ["half"] } -half = "2.4.1" +pic-scale = { path = "../", features = ["nightly_f16"] } num-traits = "0.2.19" [features] @@ -17,7 +16,7 @@ full_support = [] crate-type = ["staticlib", "rlib"] [build-dependencies] -cbindgen = "0.27.0" +cbindgen = "0.28.0" [profile.release] strip = true diff --git a/picscale/src/lib.rs b/picscale/src/lib.rs index b8b7ffe..a835ca3 100644 --- a/picscale/src/lib.rs +++ b/picscale/src/lib.rs @@ -740,6 +740,8 @@ pub extern "C" fn pic_scale_resize_planar_f32( ) } +use core::f16; + /// Resizes an RGBAF16 image /// /// # Arguments @@ -769,12 +771,12 @@ pub extern "C" fn pic_scale_resize_rgba_f16( resizing_filter: ScalingFilter, flags: u32, ) -> usize { - pic_scale_scale_generic::( - src as *const half::f16, + pic_scale_scale_generic::( + src as *const f16, src_stride, width, height, - dst as *mut half::f16, + dst as *mut f16, dst_stride, new_width, new_height, @@ -814,12 +816,12 @@ pub extern "C" fn pic_scale_resize_rgb_f16( resizing_filter: ScalingFilter, flags: u32, ) -> usize { - pic_scale_scale_generic::( - src as *const half::f16, + pic_scale_scale_generic::( + src as *const f16, src_stride, width, height, - dst as *mut half::f16, + dst as *mut f16, dst_stride, new_width, new_height, @@ -859,12 +861,12 @@ pub extern "C" fn pic_scale_resize_cbcr_f16( resizing_filter: ScalingFilter, flags: u32, ) -> usize { - pic_scale_scale_generic::( - src as *const half::f16, + pic_scale_scale_generic::( + src as *const f16, src_stride, width, height, - dst as *mut half::f16, + dst as *mut f16, dst_stride, new_width, new_height, @@ 
-904,12 +906,12 @@ pub extern "C" fn pic_scale_resize_planar_f16( resizing_filter: ScalingFilter, flags: u32, ) -> usize { - pic_scale_scale_generic::( - src as *const half::f16, + pic_scale_scale_generic::( + src as *const f16, src_stride, width, height, - dst as *mut half::f16, + dst as *mut f16, dst_stride, new_width, new_height, diff --git a/src/alpha_handle_f16.rs b/src/alpha_handle_f16.rs index 10dfef0..835df49 100644 --- a/src/alpha_handle_f16.rs +++ b/src/alpha_handle_f16.rs @@ -36,17 +36,18 @@ use crate::neon::{neon_premultiply_alpha_rgba_f16, neon_unpremultiply_alpha_rgba use crate::neon::{neon_premultiply_alpha_rgba_f16_full, neon_unpremultiply_alpha_rgba_f16_full}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16}; +use core::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; #[inline] -pub(crate) fn unpremultiply_pixel_f16_row(in_place: &mut [half::f16]) { +pub(crate) fn unpremultiply_pixel_f16_row(in_place: &mut [f16]) { for dst in in_place.chunks_exact_mut(4) { - let mut r = dst[0].to_f32(); - let mut g = dst[1].to_f32(); - let mut b = dst[2].to_f32(); - let a = dst[3].to_f32(); + let mut r = dst[0] as f32; + let mut g = dst[1] as f32; + let mut b = dst[2] as f32; + let a = dst[3] as f32; if a != 0. { let scale_alpha = 1. / a; r *= scale_alpha; @@ -57,33 +58,33 @@ pub(crate) fn unpremultiply_pixel_f16_row(in_place: &mut [half::f16]) { g = 0.; b = 0.; } - dst[0] = half::f16::from_f32(r); - dst[1] = half::f16::from_f32(g); - dst[2] = half::f16::from_f32(b); + dst[0] = r as f16; + dst[1] = g as f16; + dst[2] = b as f16; } } #[inline] -pub(crate) fn premultiply_pixel_f16_row(dst: &mut [half::f16], src: &[half::f16]) { +pub(crate) fn premultiply_pixel_f16_row(dst: &mut [f16], src: &[f16]) { for (dst, src) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) { - let mut r = src[0].to_f32(); - let mut g = src[1].to_f32(); - let mut b = src[2].to_f32(); - let a = src[3].to_f32(); + let mut r = src[0] as f32; + let mut g = src[1] as f32; + let mut b = src[2] as f32; + let a = src[3] as f32; r *= a; g *= a; b *= a; - dst[0] = half::f16::from_f32(r); - dst[1] = half::f16::from_f32(g); - dst[2] = half::f16::from_f32(b); - dst[3] = half::f16::from_f32(a); + dst[0] = r as f16; + dst[1] = g as f16; + dst[2] = b as f16; + dst[3] = a as f16; } } fn premultiply_alpha_rgba_impl_f16( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, _: usize, @@ -107,7 +108,7 @@ fn premultiply_alpha_rgba_impl_f16( } fn unpremultiply_alpha_rgba_impl_f16( - dst: &mut [half::f16], + dst: &mut [f16], stride: usize, width: usize, _: usize, @@ -127,18 +128,19 @@ fn unpremultiply_alpha_rgba_impl_f16( } pub(crate) fn premultiply_alpha_rgba_f16( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, height: usize, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher: fn( - &mut [half::f16], + &mut [f16], usize, - &[half::f16], + &[f16], usize, usize, usize, @@ -167,13 +169,13 @@ pub(crate) fn premultiply_alpha_rgba_f16( } pub(crate) fn unpremultiply_alpha_rgba_f16( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, height: usize, pool: &Option, ) { - let mut _dispatcher: fn(&mut [half::f16], usize, usize, usize, &Option) = + let 
mut _dispatcher: fn(&mut [f16], usize, usize, usize, &Option) = unpremultiply_alpha_rgba_impl_f16; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { diff --git a/src/ar30.rs b/src/ar30.rs index b3ccf4c..f97ecec 100644 --- a/src/ar30.rs +++ b/src/ar30.rs @@ -58,11 +58,24 @@ const fn ntohl(netlong: u32) -> u32 { } impl Rgb30 { + // #[inline] + // pub(crate) const fn pack_w_a(self, r: i32, g: i32, b: i32, a: i32) -> u32 { + // let value: u32 = match self { + // Rgb30::Ar30 => (((a << 30) | (b << 20)) | ((g << 10) | r)) as u32, + // Rgb30::Ra30 => (((r << 22) | (g << 12)) | ((b << 2) | a)) as u32, + // }; + // if STORE == 0 { + // value + // } else { + // htonl(value) + // } + // } + #[inline] - pub(crate) const fn pack_w_a(self, r: i32, g: i32, b: i32, a: i32) -> u32 { + pub(crate) const fn pack_w_a(self, r: i32, g: i32, b: i32, _: i32) -> u32 { let value: u32 = match self { - Rgb30::Ar30 => (((a << 30) | (b << 20)) | ((g << 10) | r)) as u32, - Rgb30::Ra30 => (((r << 22) | (g << 12)) | ((b << 2) | a)) as u32, + Rgb30::Ar30 => (((3 << 30) | (b << 20)) | ((g << 10) | r)) as u32, + Rgb30::Ra30 => (((r << 22) | (g << 12)) | ((b << 2) | 3)) as u32, }; if STORE == 0 { value @@ -79,15 +92,15 @@ impl Rgb30 { let r10 = pixel & 0x3ff; let g10 = (pixel >> 10) & 0x3ff; let b10 = (pixel >> 20) & 0x3ff; - let a10 = pixel >> 30; - (r10, g10, b10, a10) + // let a10 = pixel >> 30; + (r10, g10, b10, 3) } Rgb30::Ra30 => { - let a2 = pixel & 0x3; + // let a2 = pixel & 0x3; let r10 = (pixel >> 22) & 0x3ff; let g10 = (pixel >> 12) & 0x3ff; let b10 = (pixel >> 2) & 0x3ff; - (r10, g10, b10, a2) + (r10, g10, b10, 3) } } } diff --git a/src/avx2/alpha_f16.rs b/src/avx2/alpha_f16.rs index d74f2e1..9cafee2 100644 --- a/src/avx2/alpha_f16.rs +++ b/src/avx2/alpha_f16.rs @@ -29,6 +29,7 @@ use crate::alpha_handle_f16::{premultiply_pixel_f16_row, unpremultiply_pixel_f16_row}; use crate::avx2::utils::{avx_combine_epi, avx_deinterleave_rgba_epi16, avx_interleave_rgba_epi16}; +use core::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; @@ -38,9 +39,9 @@ use std::arch::x86::*; use std::arch::x86_64::*; pub(crate) fn avx_premultiply_alpha_rgba_f16( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, height: usize, @@ -53,7 +54,7 @@ pub(crate) fn avx_premultiply_alpha_rgba_f16( #[target_feature(enable = "avx2", enable = "f16c")] /// This inlining is required to activate all features for runtime dispatch -unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[half::f16]) { +unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [f16], src: &[f16]) { let mut rem = dst; let mut src_rem = src; @@ -116,9 +117,9 @@ unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[ #[target_feature(enable = "avx2", enable = "f16c")] /// This inlining is required to activate all features for runtime dispatch unsafe fn avx_premultiply_alpha_rgba_f16_impl( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, _: usize, @@ -145,7 +146,7 @@ unsafe fn avx_premultiply_alpha_rgba_f16_impl( } pub(crate) fn avx_unpremultiply_alpha_rgba_f16( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, height: usize, @@ -158,7 +159,7 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_f16( #[target_feature(enable = "avx2", enable 
= "f16c")] /// This inlining is required to activate all features for runtime dispatch -unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) { +unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [f16]) { let mut rem = in_place; for dst in rem.chunks_exact_mut(16 * 4) { @@ -243,7 +244,7 @@ unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) #[target_feature(enable = "avx2", enable = "f16c")] /// This inlining is required to activate all features for runtime dispatch unsafe fn avx_unpremultiply_alpha_rgba_f16_impl( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, _: usize, diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 48aa472..72176a1 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -27,26 +27,26 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod alpha_f16; mod alpha_f32; mod alpha_u16; mod alpha_u8; mod check_alpha; mod rgb_u8; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod rgba_f16; mod rgba_f32; mod rgba_u8_lb; pub(crate) mod utils; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod vertical_f16; mod vertical_f32; mod vertical_u16_lb; mod vertical_u8; mod vertical_u8_lp; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16}; pub(crate) use alpha_f32::avx_premultiply_alpha_rgba_f32; pub(crate) use alpha_f32::avx_unpremultiply_alpha_rgba_f32; @@ -57,7 +57,7 @@ pub(crate) use check_alpha::{ avx_has_non_constant_cap_alpha_rgba16, avx_has_non_constant_cap_alpha_rgba8, }; pub(crate) use rgb_u8::{convolve_horizontal_rgb_avx_row_one, convolve_horizontal_rgb_avx_rows_4}; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgba_f16::{ convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16, }; @@ -67,7 +67,7 @@ pub(crate) use rgba_f32::{ pub(crate) use rgba_u8_lb::{ convolve_horizontal_rgba_avx_rows_4_lb, convolve_horizontal_rgba_avx_rows_one_lb, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use vertical_f16::convolve_vertical_avx_row_f16; pub(crate) use vertical_f32::convolve_vertical_avx_row_f32; pub(crate) use vertical_u16_lb::convolve_column_lb_avx2_u16; diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs index b2bc4d5..67a8cd1 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -32,7 +32,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use half::f16; +use core::f16; use crate::avx2::utils::{_mm256_fma_ps, avx_combine_ps}; use crate::filter_weights::FilterWeights; diff --git a/src/avx2/vertical_f16.rs b/src/avx2/vertical_f16.rs index 67f3f0d..3c2e710 100644 --- a/src/avx2/vertical_f16.rs +++ b/src/avx2/vertical_f16.rs @@ -28,6 +28,7 @@ */ use crate::avx2::utils::{_mm256_fma_ps, avx_combine_epi}; use crate::filter_weights::FilterBounds; +use core::f16; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -37,9 +38,9 @@ use std::arch::x86_64::*; unsafe fn convolve_vertical_part_avx_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -76,9 +77,9 @@ unsafe fn convolve_vertical_part_avx_f16( unsafe fn convolve_vertical_part_avx_4_f16( start_y: usize, start_x: usize, - src: &[half::f16], + 
src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -109,9 +110,9 @@ unsafe fn convolve_vertical_part_avx_4_f16( unsafe fn convolve_vertical_part_avx_32_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -164,9 +165,9 @@ unsafe fn convolve_vertical_part_avx_32_f16( unsafe fn convolve_vertical_part_avx_16_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -204,8 +205,8 @@ unsafe fn convolve_vertical_part_avx_16_f16( pub(crate) fn convolve_vertical_avx_row_f16( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -223,8 +224,8 @@ pub(crate) fn convolve_vertical_avx_row_f16( unsafe fn convolve_vertical_avx_row_f16_regular( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -236,8 +237,8 @@ unsafe fn convolve_vertical_avx_row_f16_regular( unsafe fn convolve_vertical_avx_row_f16_fma( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -248,8 +249,8 @@ unsafe fn convolve_vertical_avx_row_f16_fma( unsafe fn convolve_vertical_avx_row_f16_impl( _: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { diff --git a/src/avx512/alpha_u8.rs b/src/avx512/alpha_u8.rs index 3f2e223..998e056 100644 --- a/src/avx512/alpha_u8.rs +++ b/src/avx512/alpha_u8.rs @@ -94,7 +94,7 @@ impl AssociateAlpha for AssociateAlphaDefault { if !rem.is_empty() { assert!(rem.len() <= 64); assert!(src_rem.len() <= 64); - self.associate_chunk(&mut rem, &src_rem); + self.associate_chunk(rem, src_rem); } } } diff --git a/src/cbcr16.rs b/src/cbcr16.rs index fd2d13d..e882d35 100644 --- a/src/cbcr16.rs +++ b/src/cbcr16.rs @@ -26,7 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; use crate::{ImageStore, ImageStoreMut}; @@ -39,6 +39,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _: ConvolutionOptions, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); } @@ -50,7 +51,8 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ) { - convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); + convolve_vertical_dispatch_u16(self, filter_weights, destination, pool, options); } } diff --git a/src/cbcr8.rs b/src/cbcr8.rs index d1be3e8..3153506 100644 --- a/src/cbcr8.rs +++ b/src/cbcr8.rs @@ -28,14 +28,12 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp}; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8, }; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::{convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::{convolve_vertical_sse_row, convolve_vertical_sse_row_lp}; #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] @@ -50,6 +48,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.width as f32 / destination.width as f32; let mut _dispatcher_4_rows: Option< @@ -59,7 +58,11 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 2> { handle_fixed_row_u8::<2>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { + #[cfg(feature = "rdm")] + if _scale_factor < 8. + && crate::cpu_features::is_aarch_rdm_supported() + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { use crate::neon::{ convolve_horizontal_cbcr_neon_rdm_row, convolve_horizontal_cbcr_neon_rows_rdm_4_u8, @@ -70,7 +73,10 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 2> { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if std::arch::is_x86_feature_detected!("sse4.1") && _scale_factor < 8. { + if std::arch::is_x86_feature_detected!("sse4.1") + && _scale_factor < 8. 
+ && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { use crate::sse::{ convolve_horizontal_cbcr_sse_hrs_row_one, convolve_horizontal_cbcr_sse_hrs_rows_4, @@ -97,6 +103,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.height as f32 / destination.height as f32; #[allow(clippy::type_complexity)] @@ -105,34 +112,56 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 2> { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { // For more downscaling better to use more precise version - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { - _dispatcher = convolve_vertical_neon_i16_precision; - } else { - _dispatcher = convolve_vertical_neon_i32_precision; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::convolve_vertical_neon_i32_precision_d; + _dispatcher = convolve_vertical_neon_i32_precision_d; + } + crate::WorkloadStrategy::PreferSpeed => { + // For more downscaling better to use more precise version + #[cfg(feature = "rdm")] + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { + use crate::neon::convolve_vertical_neon_i16_precision; + _dispatcher = convolve_vertical_neon_i16_precision; + } else { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + #[cfg(not(feature = "rdm"))] + { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if is_x86_feature_detected!("sse4.1") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_sse_row_lp; } else { _dispatcher = convolve_vertical_sse_row; } } if is_x86_feature_detected!("avx2") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_avx_row_lp; } else { _dispatcher = convolve_vertical_avx_row; } } #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avx512bw") { - if _scale_factor < 8. { - use crate::avx512::convolve_vertical_avx512_row_lp; - _dispatcher = convolve_vertical_avx512_row_lp; - } + if std::arch::is_x86_feature_detected!("avx512bw") + && _scale_factor < 8. 
+ && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::avx512::convolve_vertical_avx512_row_lp; + _dispatcher = convolve_vertical_avx512_row_lp; } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] diff --git a/src/cbcr_f32.rs b/src/cbcr_f32.rs index cbd372e..b2c72d5 100644 --- a/src/cbcr_f32.rs +++ b/src/cbcr_f32.rs @@ -28,7 +28,7 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::convolve_vertical_avx_row_f32; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::convolve_naive_f32::{ convolve_horizontal_rgb_native_row, convolve_horizontal_rgba_4_row_f32, }; @@ -49,6 +49,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { let _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), @@ -72,6 +73,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/color_group.rs b/src/color_group.rs index 9f71eb6..d90de83 100644 --- a/src/color_group.rs +++ b/src/color_group.rs @@ -417,7 +417,8 @@ where macro_rules! load_ar30 { ($store: expr, $ar_type: expr, $ar_order: ty) => {{ let ar_type: crate::ar30::Rgb30 = $ar_type.into(); - let unpacked = ar_type.unpack::<$ar_order>($store[0]); + let read_bits = u32::from_ne_bytes([$store[0], $store[1], $store[2], $store[3]]); + let unpacked = ar_type.unpack::<$ar_order>(read_bits); ColorGroup::<4, i32> { r: unpacked.0 as i32, g: unpacked.1 as i32, @@ -432,7 +433,8 @@ pub(crate) use load_ar30; macro_rules! load_ar30_p { ($store: expr, $ar_type: expr, $ar_order: ty) => {{ let ar_type: crate::ar30::Rgb30 = $ar_type.into(); - let unpacked = ar_type.unpack::<$ar_order>(*$store); + let read_bits = u32::from_ne_bytes([$store[0], $store[1], $store[2], $store[3]]); + let unpacked = ar_type.unpack::<$ar_order>(read_bits); ColorGroup::<4, i32> { r: unpacked.0 as i32, g: unpacked.1 as i32, @@ -447,7 +449,10 @@ pub(crate) use load_ar30_p; macro_rules! load_ar30_with_offset { ($store: expr, $ar_type: expr, $ar_order: ty, $offset: expr) => {{ let ar_type: crate::ar30::Rgb30 = $ar_type.into(); - let unpacked = ar_type.unpack::<$ar_order>($store[$offset]); + let cn = $offset * 4; + let read_bits = + u32::from_ne_bytes([$store[cn], $store[cn + 1], $store[cn + 2], $store[cn + 3]]); + let unpacked = ar_type.unpack::<$ar_order>(read_bits); ColorGroup::<4, i32> { r: unpacked.0 as i32, g: unpacked.1 as i32, diff --git a/src/convolution.rs b/src/convolution.rs index 8c67463..b8af2fe 100644 --- a/src/convolution.rs +++ b/src/convolution.rs @@ -27,33 +27,48 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use num_traits::FromPrimitive; use rayon::ThreadPool; use std::fmt::Debug; use crate::filter_weights::FilterWeights; use crate::image_store::ImageStoreMut; +use crate::scaler::WorkloadStrategy; + +#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)] +pub(crate) struct ConvolutionOptions { + pub(crate) workload_strategy: WorkloadStrategy, +} + +impl ConvolutionOptions { + pub(crate) fn new(strategy: WorkloadStrategy) -> Self { + Self { + workload_strategy: strategy, + } + } +} pub(crate) trait HorizontalConvolutionPass where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { fn convolve_horizontal( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ); } pub(crate) trait VerticalConvolutionPass where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { fn convolve_vertical( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ); } diff --git a/src/cpu_features.rs b/src/cpu_features.rs index e110a04..95400df 100644 --- a/src/cpu_features.rs +++ b/src/cpu_features.rs @@ -80,7 +80,11 @@ pub(crate) fn is_aarch_f16_supported() -> bool { /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used /// otherwise consider it is always available #[allow(clippy::too_long_first_doc_paragraph)] -#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "half"))] +#[cfg(all( + target_arch = "aarch64", + target_feature = "neon", + feature = "nightly_f16" +))] pub(crate) fn is_aarch_f16c_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { @@ -96,6 +100,7 @@ pub(crate) fn is_aarch_f16c_supported() -> bool { /// /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[cfg(feature = "rdm")] pub(crate) fn is_aarch_rdm_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { diff --git a/src/dispatch_group_ar30.rs b/src/dispatch_group_ar30.rs index 95de42b..406453a 100644 --- a/src/dispatch_group_ar30.rs +++ b/src/dispatch_group_ar30.rs @@ -27,15 +27,12 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +use crate::convolution::ConvolutionOptions; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::fixed_point_horizontal_ar30::{ convolve_row_handler_fixed_point_4_ar30, convolve_row_handler_fixed_point_ar30, }; use crate::fixed_point_vertical_ar30::column_handler_fixed_point_ar30; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::{ - neon_column_handler_fixed_point_ar30, neon_convolve_horizontal_rgba_rows_4_ar30, -}; use crate::support::PRECISION; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -43,14 +40,15 @@ use rayon::ThreadPool; #[allow(clippy::type_complexity)] pub(crate) fn convolve_horizontal_dispatch_ar30( - src: &[u32], + src: &[u8], src_stride: usize, filter_weights: FilterWeights, - dst: &mut [u32], + dst: &mut [u8], dst_stride: usize, pool: &Option, + _options: ConvolutionOptions, ) { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm"); if let Some(pool) = pool { pool.install(|| { @@ -58,10 +56,13 @@ pub(crate) fn convolve_horizontal_dispatch_ar30) = + let mut _dispatch: fn(&[u8], usize, &mut [u8], usize, &FilterWeights) = convolve_row_handler_fixed_point_4_ar30::; - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_rdm_available { + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] + if is_rdm_available + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::neon::neon_convolve_horizontal_rgba_rows_4_ar30; _dispatch = neon_convolve_horizontal_rgba_rows_4_ar30::; } @@ -85,10 +86,13 @@ pub(crate) fn convolve_horizontal_dispatch_ar30) = + let mut _dispatch: fn(&[u8], usize, &mut [u8], usize, &FilterWeights) = convolve_row_handler_fixed_point_4_ar30::; - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_rdm_available { + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] + if is_rdm_available + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::neon::neon_convolve_horizontal_rgba_rows_4_ar30; _dispatch = neon_convolve_horizontal_rgba_rows_4_ar30::; } _dispatch(src, src_stride, dst, dst_stride, &approx); @@ -107,14 +111,16 @@ pub(crate) fn convolve_horizontal_dispatch_ar30( - src: &[u32], + src: &[u8], src_stride: usize, filter_weights: FilterWeights, - dst: &mut [u32], + dst: &mut [u8], dst_stride: usize, pool: &Option, + width: usize, + _options: ConvolutionOptions, ) { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm"); if let Some(pool) = pool { pool.install(|| { @@ -125,13 +131,18 @@ pub(crate) fn convolve_vertical_dispatch_ar30; - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_rdm_available { + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] + if is_rdm_available + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::neon::neon_column_handler_fixed_point_ar30; _dispatch = neon_column_handler_fixed_point_ar30::; } + let row = &mut row[0..4 * width]; + _dispatch(&bounds, src, row, src_stride, weights); }); }); @@ -144,13 +155,18 @@ pub(crate) fn convolve_vertical_dispatch_ar30; - 
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_rdm_available { + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] + if is_rdm_available + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::neon::neon_column_handler_fixed_point_ar30; _dispatch = neon_column_handler_fixed_point_ar30::; } + let row = &mut row[0..4 * width]; + _dispatch(&bounds, src, row, src_stride, weights); }); } diff --git a/src/dispatch_group_f16.rs b/src/dispatch_group_f16.rs index ee2ab1d..e20d91a 100644 --- a/src/dispatch_group_f16.rs +++ b/src/dispatch_group_f16.rs @@ -27,25 +27,28 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{FilterBounds, FilterWeights, WeightsConverter}; use crate::image_store::ImageStoreMut; use crate::ImageStore; -use half::f16; +use core::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; #[allow(clippy::type_complexity)] -pub(crate) fn convolve_vertical_dispatch_f16( +pub(crate) fn convolve_vertical_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, - dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]), + dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[V]), + weights_converter: impl WeightsConverter, ) { let src_stride = image_store.stride(); let dst_stride = destination.stride(); + let c_weights = weights_converter.prepare_weights(&filter_weights).weights; + let dst_width = destination.width; if let Some(pool) = pool { @@ -58,7 +61,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( .for_each(|(y, row)| { let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; - let weights = &filter_weights.weights[filter_offset..]; + let weights = &c_weights[filter_offset..]; let source_buffer = image_store.buffer.as_ref(); dispatcher( dst_width, @@ -79,7 +82,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( .for_each(|(y, row)| { let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; - let weights = &filter_weights.weights[filter_offset..]; + let weights = &c_weights[filter_offset..]; let source_buffer = image_store.buffer.as_ref(); dispatcher( dst_width, @@ -94,21 +97,24 @@ pub(crate) fn convolve_vertical_dispatch_f16( } #[allow(clippy::type_complexity)] -pub(crate) fn convolve_horizontal_dispatch_f16( +pub(crate) fn convolve_horizontal_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), + fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), >, - dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]), + dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]), + weights_converter: impl WeightsConverter, ) { let src_stride = image_store.stride(); let dst_stride = destination.stride(); let dst_width = destination.width; let src_width = image_store.width; + let c_weights = weights_converter.prepare_weights(&filter_weights); + if let Some(pool) = pool { pool.install(|| { let mut processed_4 = false; @@ -126,13 +132,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( ) .for_each(|(src, dst)| { dispatcher( - dst_width, - src_width, - 
&filter_weights, - src, - src_stride, - dst, - dst_stride, + dst_width, src_width, &c_weights, src, src_stride, dst, dst_stride, ); }); processed_4 = true; @@ -161,7 +161,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( .par_chunks_exact(src_stride) .zip(left_dst_rows.par_chunks_exact_mut(dst_stride)) .for_each(|(src, dst)| { - dispatcher_row(dst_width, src_width, &filter_weights, src, dst); + dispatcher_row(dst_width, src_width, &c_weights, src, dst); }); }); } else { @@ -179,13 +179,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( ) { dispatcher( - dst_width, - src_width, - &filter_weights, - src, - src_stride, - dst, - dst_stride, + dst_width, src_width, &c_weights, src, src_stride, dst, dst_stride, ); } processed_4 = true; @@ -213,7 +207,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( .chunks_exact(src_stride) .zip(left_dst_rows.chunks_exact_mut(dst_stride)) { - dispatcher_row(dst_width, src_width, &filter_weights, src, dst); + dispatcher_row(dst_width, src_width, &c_weights, src, dst); } } } diff --git a/src/dispatch_group_u16.rs b/src/dispatch_group_u16.rs index 4eeb1b4..fb2ada7 100644 --- a/src/dispatch_group_u16.rs +++ b/src/dispatch_group_u16.rs @@ -27,6 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +use crate::convolution::ConvolutionOptions; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::cpu_features::is_aarch_f16_supported; use crate::filter_weights::{ @@ -163,6 +164,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( filter_weights: FilterWeights, destination: &mut ImageStoreMut<'_, u16, COMPONENTS>, pool: &Option, + _options: ConvolutionOptions, ) { let src_stride = image_store.stride(); let dst_stride = destination.stride(); @@ -195,8 +197,10 @@ pub(crate) fn convolve_vertical_dispatch_u16( } else { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - if is_aarch_f16_supported() { - use crate::filter_weights::WeightFloat16Converter; + if is_aarch_f16_supported() + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::filter_weights::WeightFloat16ConverterCast; execute_low_precision_row( true, image_store, @@ -207,7 +211,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( dst_width, destination_image, HighBitDepthFloat16LowerHandler::default(), - WeightFloat16Converter::default(), + WeightFloat16ConverterCast::default(), ); } else { execute_low_precision_row( @@ -262,8 +266,10 @@ pub(crate) fn convolve_vertical_dispatch_u16( } else { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - if is_aarch_f16_supported() { - use crate::filter_weights::WeightFloat16Converter; + if is_aarch_f16_supported() + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::filter_weights::WeightFloat16ConverterCast; execute_low_precision_row( false, image_store, @@ -274,7 +280,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( dst_width, destination.buffer.borrow_mut(), HighBitDepthFloat16LowerHandler::default(), - WeightFloat16Converter::default(), + WeightFloat16ConverterCast::default(), ); } else { execute_low_precision_row( diff --git a/src/f16.rs b/src/f16.rs index 00ba6e1..e43bb1e 100644 --- a/src/f16.rs +++ b/src/f16.rs @@ -34,11 +34,11 @@ use crate::avx2::{ convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16, convolve_vertical_avx_row_f16, }; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, 
VerticalConvolutionPass}; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::cpu_features::{is_aarch_f16_supported, is_aarch_f16c_supported}; use crate::dispatch_group_f16::{convolve_horizontal_dispatch_f16, convolve_vertical_dispatch_f16}; -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{FilterBounds, FilterWeights, PasshroughWeightsConverter}; use crate::floating_point_horizontal::{ convolve_row_handler_floating_point, convolve_row_handler_floating_point_4, }; @@ -63,7 +63,7 @@ use crate::sse::{ convolve_vertical_sse_row_f16, }; use crate::ImageStore; -use half::f16; +use core::{f16, f32}; use rayon::ThreadPool; fn convolve_horizontal_rgba_4_row_f16( @@ -75,14 +75,19 @@ fn convolve_horizontal_rgba_4_row_f16( dst: &mut [f16], dst_stride: usize, ) { - convolve_row_handler_floating_point_4::( - src, + let transient_src = src.iter().map(|&x| x as f32).collect::>(); + let mut transient_dst = vec![0f32; dst.len()]; + convolve_row_handler_floating_point_4::( + &transient_src, src_stride, - dst, + &mut transient_dst, dst_stride, filter_weights, 8, - ) + ); + for (dst, src) in dst.iter_mut().zip(transient_dst.iter()) { + *dst = *src as f16; + } } fn convolve_horizontal_rgb_native_row_f16( @@ -92,7 +97,17 @@ fn convolve_horizontal_rgb_native_row_f16( src: &[f16], dst: &mut [f16], ) { - convolve_row_handler_floating_point::(src, dst, filter_weights, 8) + let transient_src = src.iter().map(|&x| x as f32).collect::>(); + let mut transient_dst = vec![0f32; dst.len()]; + convolve_row_handler_floating_point::( + &transient_src, + &mut transient_dst, + filter_weights, + 8, + ); + for (dst, src) in dst.iter_mut().zip(transient_dst.iter()) { + *dst = *src as f16; + } } impl HorizontalConvolutionPass for ImageStore<'_, f16, 4> { @@ -101,6 +116,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher_4_rows: Option< @@ -114,9 +130,31 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 4> { if is_aarch_f16c_supported() { _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_f16); _dispatcher_row = convolve_horizontal_rgba_neon_row_one_f16; - if is_aarch_f16_supported() { - _dispatcher_4_rows = Some(xconvolve_horizontal_rgba_neon_rows_4_f16); - _dispatcher_row = xconvolve_horizontal_rgba_neon_row_one_f16; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() { + _dispatcher_4_rows = Some(xconvolve_horizontal_rgba_neon_rows_4_f16); + _dispatcher_row = xconvolve_horizontal_rgba_neon_row_one_f16; + } + } + crate::WorkloadStrategy::PreferQuality => { + if std::arch::is_aarch64_feature_detected!("fhm") { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::{ + convolve_horizontal_rgba_neon_row_one_f16_fhm, + convolve_horizontal_rgba_neon_rows_4_f16_fhm, + }; + return convolve_horizontal_dispatch_f16( + self, + filter_weights, + destination, + pool, + Some(convolve_horizontal_rgba_neon_rows_4_f16_fhm), + convolve_horizontal_rgba_neon_row_one_f16_fhm, + WeightFloat16Converter::default(), + ); + } + } } } } @@ -154,6 +192,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 4> { pool, _dispatcher_4_rows, _dispatcher_row, + PasshroughWeightsConverter::default(), ); } } @@ -166,7 +205,19 @@ fn convolve_vertical_rgb_native_row_f16( src_stride: usize, weight: &[f32], ) { - 
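// Editor's illustrative sketch, not part of the patch: the fallback used by the scalar
// f16 row handlers above. Half-precision samples are widened into transient f32 buffers,
// the shared f32 convolution runs on those, and the results are narrowed back to f16
// only at the end. This needs a nightly toolchain with the `f16` primitive (the crate
// gates it behind `nightly_f16`); `convolve_f32_row` below is a hypothetical stand-in
// for the crate's floating-point row handler, not its real signature.
#![feature(f16)]

fn convolve_f32_row(src: &[f32], dst: &mut [f32], weights: &[f32]) {
    // Stand-in: a plain sliding dot product per output sample.
    for (i, d) in dst.iter_mut().enumerate() {
        *d = weights
            .iter()
            .enumerate()
            .map(|(j, w)| src.get(i + j).copied().unwrap_or(0.0) * w)
            .sum();
    }
}

fn convolve_f16_row(src: &[f16], dst: &mut [f16], weights: &[f32]) {
    // Widen once into transient f32 buffers.
    let transient_src: Vec<f32> = src.iter().map(|&x| x as f32).collect();
    let mut transient_dst = vec![0f32; dst.len()];
    convolve_f32_row(&transient_src, &mut transient_dst, weights);
    // Narrow back to half precision.
    for (d, s) in dst.iter_mut().zip(transient_dst.iter()) {
        *d = *s as f16;
    }
}

fn main() {
    let src: Vec<f16> = [1.0f32, 0.5, 0.25, 0.0].iter().map(|&x| x as f16).collect();
    let mut dst = vec![0.0f32 as f16; 4];
    convolve_f16_row(&src, &mut dst, &[0.5, 0.5]);
}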
column_handler_floating_point::(bounds, src, dst, src_stride, weight, 8); + let transient_src = src.iter().map(|&x| x as f32).collect::>(); + let mut transient_dst = vec![0f32; dst.len()]; + column_handler_floating_point::( + bounds, + &transient_src, + &mut transient_dst, + src_stride, + weight, + 8, + ); + for (dst, src) in dst.iter_mut().zip(transient_dst.iter()) { + *dst = *src as f16; + } } impl VerticalConvolutionPass for ImageStore<'_, f16, 4> { @@ -175,6 +226,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -183,8 +235,26 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 4> { { if is_aarch_f16c_supported() { _dispatcher = convolve_vertical_rgb_neon_row_f16; - if is_aarch_f16_supported() { - _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm; + if std::arch::is_aarch64_feature_detected!("fhm") { + return convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + convolve_vertical_rgb_neon_row_f16_fhm, + WeightFloat16Converter {}, + ); + } + } + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() { + _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + } + } } } } @@ -210,7 +280,14 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 4> { } } } - convolve_vertical_dispatch_f16(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + _dispatcher, + PasshroughWeightsConverter {}, + ); } } @@ -220,6 +297,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher_4_rows: Option< @@ -233,9 +311,33 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 3> { if is_aarch_f16c_supported() { _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4_f16); _dispatcher_row = convolve_horizontal_rgb_neon_row_one_f16; - if is_aarch_f16_supported() { - _dispatcher_4_rows = Some(xconvolve_horizontal_rgb_neon_rows_4_f16); - _dispatcher_row = xconvolve_horizontal_rgb_neon_row_one_f16; + } + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + if std::arch::is_aarch64_feature_detected!("fhm") { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::{ + convolve_horizontal_rgb_neon_row_one_f16_fhm, + convolve_horizontal_rgb_neon_rows_4_f16_fhm, + }; + return convolve_horizontal_dispatch_f16( + self, + filter_weights, + destination, + pool, + Some(convolve_horizontal_rgb_neon_rows_4_f16_fhm), + convolve_horizontal_rgb_neon_row_one_f16_fhm, + WeightFloat16Converter::default(), + ); + } + } + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + _dispatcher_4_rows = Some(xconvolve_horizontal_rgb_neon_rows_4_f16); + _dispatcher_row = xconvolve_horizontal_rgb_neon_row_one_f16; + } } } } @@ -264,6 +366,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 3> { pool, _dispatcher_4_rows, _dispatcher_row, + 
PasshroughWeightsConverter::default(), ); } } @@ -274,6 +377,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -282,8 +386,26 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 3> { { if is_aarch_f16c_supported() { _dispatcher = convolve_vertical_rgb_neon_row_f16; - if is_aarch_f16_supported() { - _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm; + if std::arch::is_aarch64_feature_detected!("fhm") { + return convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + convolve_vertical_rgb_neon_row_f16_fhm, + WeightFloat16Converter {}, + ); + } + } + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() { + _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + } + } } } } @@ -309,7 +431,14 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 3> { } } } - convolve_vertical_dispatch_f16(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + _dispatcher, + PasshroughWeightsConverter::default(), + ); } } @@ -319,6 +448,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let _dispatcher_4_rows: Option< @@ -333,6 +463,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 1> { pool, _dispatcher_4_rows, _dispatcher_row, + PasshroughWeightsConverter::default(), ); } } @@ -343,6 +474,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -351,8 +483,26 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 1> { { if is_aarch_f16c_supported() { _dispatcher = convolve_vertical_rgb_neon_row_f16; - if is_aarch_f16_supported() { - _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm; + if std::arch::is_aarch64_feature_detected!("fhm") { + return convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + convolve_vertical_rgb_neon_row_f16_fhm, + WeightFloat16Converter {}, + ); + } + } + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() { + _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + } + } } } } @@ -377,7 +527,14 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 1> { } } } - convolve_vertical_dispatch_f16(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + _dispatcher, + PasshroughWeightsConverter::default(), + ); } } @@ -387,6 +544,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { 
#[allow(clippy::type_complexity)] let _dispatcher_4_rows: Option< @@ -401,6 +559,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 2> { pool, _dispatcher_4_rows, _dispatcher_row, + PasshroughWeightsConverter::default(), ); } } @@ -411,6 +570,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -419,8 +579,26 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 2> { { if is_aarch_f16c_supported() { _dispatcher = convolve_vertical_rgb_neon_row_f16; - if is_aarch_f16_supported() { - _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm; + if std::arch::is_aarch64_feature_detected!("fhm") { + return convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + convolve_vertical_rgb_neon_row_f16_fhm, + WeightFloat16Converter {}, + ); + } + } + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() { + _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + } + } } } } @@ -445,6 +623,13 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 2> { } } } - convolve_vertical_dispatch_f16(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + _dispatcher, + PasshroughWeightsConverter::default(), + ); } } diff --git a/src/filter_weights.rs b/src/filter_weights.rs index a665d96..be308e8 100644 --- a/src/filter_weights.rs +++ b/src/filter_weights.rs @@ -149,12 +149,23 @@ where } } +#[derive(Default)] +#[cfg(feature = "nightly_f16")] +pub(crate) struct PasshroughWeightsConverter {} + +#[cfg(feature = "nightly_f16")] +impl WeightsConverter for PasshroughWeightsConverter { + fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { + weights.clone() + } +} + #[derive(Default)] #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub(crate) struct WeightFloat16Converter {} +pub(crate) struct WeightFloat16ConverterCast {} #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -impl WeightsConverter for WeightFloat16Converter { +impl WeightsConverter for WeightFloat16ConverterCast { fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { use crate::neon::convert_weights_to_f16; let converted_weights = convert_weights_to_f16(&weights.weights); @@ -171,3 +182,32 @@ impl WeightsConverter for WeightFloat16Converter { ) } } + +#[derive(Default)] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[cfg(feature = "nightly_f16")] +pub(crate) struct WeightFloat16Converter {} + +#[cfg(feature = "nightly_f16")] +#[allow(unused)] +use core::f16; + +#[cfg(feature = "nightly_f16")] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +impl WeightsConverter for WeightFloat16Converter { + fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { + use crate::neon::convert_weights_to_f16_fhm; + let converted_weights = convert_weights_to_f16_fhm(&weights.weights); + + let new_bounds = weights.bounds.to_vec(); + + FilterWeights::new( + converted_weights, + weights.kernel_size, + weights.kernel_size, + weights.distinct_elements, + weights.coeffs_size, + new_bounds, + ) + } +} diff --git 
a/src/fixed_point_horizontal_ar30.rs b/src/fixed_point_horizontal_ar30.rs index b46a7cb..8b4b6be 100644 --- a/src/fixed_point_horizontal_ar30.rs +++ b/src/fixed_point_horizontal_ar30.rs @@ -37,31 +37,36 @@ pub(crate) fn convolve_row_handler_fixed_point_ar30< const AR30_TYPE: usize, const AR30_ORDER: usize, >( - src: &[u32], - dst: &mut [u32], + src: &[u8], + dst: &mut [u8], filter_weights: &FilterWeights, ) { - for ((chunk, &bounds), weights) in dst.iter_mut().zip(filter_weights.bounds.iter()).zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) { + for ((chunk, &bounds), weights) in dst + .chunks_exact_mut(4) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { let mut sums = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_()); let start_x = bounds.start; let bounds_size = bounds.size; - let px = start_x; + const CN: usize = 4; + let px = start_x * CN; if bounds_size == 2 { - let src_ptr0 = &src[px..(px + 2)]; + let src_ptr0 = &src[px..(px + 2 * CN)]; let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0] as i32; let weight1 = sliced_weights[1] as i32; sums += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0 + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1; } else if bounds_size == 3 { - let src_ptr0 = &src[px..(px + 3)]; + let src_ptr0 = &src[px..(px + 3 * CN)]; let sliced_weights = &weights[0..3]; let weight0 = sliced_weights[0] as i32; let weight1 = sliced_weights[1] as i32; @@ -70,7 +75,7 @@ pub(crate) fn convolve_row_handler_fixed_point_ar30< + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1 + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2; } else if bounds_size == 4 { - let src_ptr0 = &src[px..(px + 4)]; + let src_ptr0 = &src[px..(px + 4 * CN)]; let sliced_weights = &weights[0..4]; let weight0 = sliced_weights[0] as i32; let weight1 = sliced_weights[1] as i32; @@ -81,7 +86,7 @@ pub(crate) fn convolve_row_handler_fixed_point_ar30< + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2 + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 3) * weight3; } else if bounds_size == 6 { - let src_ptr0 = &src[px..(px + 6)]; + let src_ptr0 = &src[px..(px + 6 * CN)]; let sliced_weights = &weights[0..6]; let weight0 = sliced_weights[0] as i32; @@ -98,7 +103,11 @@ pub(crate) fn convolve_row_handler_fixed_point_ar30< + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 5) * weight5; } else { let src_ptr0 = &src[px..(px + bounds_size)]; - for (&k_weight, src) in weights.iter().zip(src_ptr0.iter()).take(bounds.size) { + for (&k_weight, src) in weights + .iter() + .zip(src_ptr0.chunks_exact(4)) + .take(bounds.size) + { let weight: i32 = k_weight as i32; let new_px = load_ar30_p!(src, AR30_TYPE, AR30_ORDER); sums += new_px * weight; @@ -106,7 +115,11 @@ pub(crate) fn convolve_row_handler_fixed_point_ar30< } let narrowed = sums.saturate_ar30(); - *chunk = narrowed.to_ar30::(); + let bytes0 = narrowed.to_ar30::().to_ne_bytes(); + chunk[0] = bytes0[0]; + chunk[1] = bytes0[1]; + chunk[2] = bytes0[2]; + chunk[3] = bytes0[3]; } } @@ -115,9 +128,9 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< const AR30_TYPE: usize, const AR30_ORDER: usize, >( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, ) { @@ -125,10 +138,12 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< let (row1_ref, rest) = 
rest.split_at_mut(dst_stride); let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - let iter_row0 = row0_ref.iter_mut(); - let iter_row1 = row1_ref.iter_mut(); - let iter_row2 = row2_ref.iter_mut(); - let iter_row3 = row3_ref.iter_mut(); + const CN: usize = 4; + + let iter_row0 = row0_ref.chunks_exact_mut(CN); + let iter_row1 = row1_ref.chunks_exact_mut(CN); + let iter_row2 = row2_ref.chunks_exact_mut(CN); + let iter_row3 = row3_ref.chunks_exact_mut(CN); for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 .zip(iter_row1) @@ -148,14 +163,14 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< let start_x = bounds.start; - let px = start_x; + let px = start_x * CN; let bounds_size = bounds.size; if bounds_size == 2 { - let src_ptr0 = &src[px..(px + 2)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 2)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 2)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 2)]; + let src_ptr0 = &src[px..(px + 2 * CN)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 2 * 4)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 2 * 4)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 2 * 4)]; let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0] as i32; @@ -169,10 +184,10 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< sums3 += load_ar30!(src_ptr3, AR30_TYPE, AR30_ORDER) * weight0 + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1; } else if bounds_size == 3 { - let src_ptr0 = &src[px..(px + 3)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 3)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 3)]; + let src_ptr0 = &src[px..(px + 3 * CN)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3 * 4)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 3 * 4)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 3 * 4)]; let sliced_weights = &weights[0..3]; let weight0 = sliced_weights[0] as i32; @@ -191,10 +206,10 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1 + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 2) * weight2; } else if bounds_size == 4 { - let src_ptr0 = &src[px..(px + 4)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 4)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 4)]; + let src_ptr0 = &src[px..(px + 4 * CN)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4 * 4)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 4 * 4)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 4 * 4)]; let sliced_weights = &weights[0..4]; let weight0 = sliced_weights[0] as i32; @@ -218,10 +233,10 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 2) * weight2 + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 3) * weight3; } else if bounds_size == 6 { - let src_ptr0 = &src[px..(px + 6)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 6)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 6)]; + let src_ptr0 = &src[px..(px + 6 * 
CN)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6 * 4)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 6 * 4)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 6 * 4)]; let sliced_weights = &weights[0..6]; let weight0 = sliced_weights[0] as i32; @@ -255,17 +270,17 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 4) * weight4 + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 5) * weight5; } else { - let src_ptr0 = &src[px..(px + bounds_size)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds_size)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds_size)]; + let src_ptr0 = &src[px..(px + bounds_size * CN)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size * CN)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds_size * CN)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds_size * CN)]; for ((((&k_weight, src0), src1), src2), src3) in weights .iter() - .zip(src_ptr0.iter()) - .zip(src_ptr1.iter()) - .zip(src_ptr2.iter()) - .zip(src_ptr3.iter()) + .zip(src_ptr0.chunks_exact(4)) + .zip(src_ptr1.chunks_exact(4)) + .zip(src_ptr2.chunks_exact(4)) + .zip(src_ptr3.chunks_exact(4)) .take(bounds.size) { let weight: i32 = k_weight as i32; @@ -287,9 +302,28 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< let narrowed2 = sums2.saturate_ar30(); let narrowed3 = sums3.saturate_ar30(); - *chunk0 = narrowed0.to_ar30::(); - *chunk1 = narrowed1.to_ar30::(); - *chunk2 = narrowed2.to_ar30::(); - *chunk3 = narrowed3.to_ar30::(); + let bytes0 = narrowed0.to_ar30::().to_ne_bytes(); + chunk0[0] = bytes0[0]; + chunk0[1] = bytes0[1]; + chunk0[2] = bytes0[2]; + chunk0[3] = bytes0[3]; + + let bytes1 = narrowed1.to_ar30::().to_ne_bytes(); + chunk1[0] = bytes1[0]; + chunk1[1] = bytes1[1]; + chunk1[2] = bytes1[2]; + chunk1[3] = bytes1[3]; + + let bytes2 = narrowed2.to_ar30::().to_ne_bytes(); + chunk2[0] = bytes2[0]; + chunk2[1] = bytes2[1]; + chunk2[2] = bytes2[2]; + chunk2[3] = bytes2[3]; + + let bytes3 = narrowed3.to_ar30::().to_ne_bytes(); + chunk3[0] = bytes3[0]; + chunk3[1] = bytes3[1]; + chunk3[2] = bytes3[2]; + chunk3[3] = bytes3[3]; } } diff --git a/src/fixed_point_vertical_ar30.rs b/src/fixed_point_vertical_ar30.rs index 86a84f7..26aa32c 100644 --- a/src/fixed_point_vertical_ar30.rs +++ b/src/fixed_point_vertical_ar30.rs @@ -39,9 +39,9 @@ pub(crate) fn convolve_column_handler_fip_db_ar30< const AR30_ORDER: usize, const BUFFER_SIZE: usize, >( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], filter: &[i16], bounds: &FilterBounds, x: usize, @@ -56,10 +56,10 @@ pub(crate) fn convolve_column_handler_fip_db_ar30< let py = bounds.start; let weight = filter[0] as i32; - let offset = src_stride * py + v_start_px; - let src_ptr = &src[offset..(offset + BUFFER_SIZE)]; + let offset = src_stride * py + v_start_px * 4; + let src_ptr = &src[offset..(offset + BUFFER_SIZE * 4)]; - for (dst, src) in direct_store.iter_mut().zip(src_ptr) { + for (dst, src) in direct_store.iter_mut().zip(src_ptr.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } @@ -67,18 +67,24 @@ pub(crate) fn convolve_column_handler_fip_db_ar30< // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; 
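// Editor's illustrative sketch, not part of the patch: why the AR30 paths above now take
// `&[u8]` and multiply pixel indices by CN = 4. Each 2-10-10-10 pixel occupies four bytes;
// a handler reconstructs the packed u32 with `from_ne_bytes`, extracts the 10-bit channels
// and the 2-bit alpha, and writes results back with `to_ne_bytes`. The shift positions and
// channel naming follow the Ar30 branch of the NEON unpack code in this patch and are not
// re-verified here.
fn unpack_ar30(px: &[u8]) -> (u32, u32, u32, u32) {
    let v = u32::from_ne_bytes([px[0], px[1], px[2], px[3]]);
    let r = v & 0x3ff;
    let g = (v >> 10) & 0x3ff;
    let b = (v >> 20) & 0x3ff;
    let a = v >> 30; // 2-bit alpha
    (r, g, b, a)
}

fn pack_ar30(r: u32, g: u32, b: u32, a: u32) -> [u8; 4] {
    let v = (a << 30) | ((b & 0x3ff) << 20) | ((g & 0x3ff) << 10) | (r & 0x3ff);
    v.to_ne_bytes()
}

fn main() {
    let px = pack_ar30(1023, 512, 0, 3);
    // Writing into a byte-based row: each output pixel is a 4-byte chunk.
    let mut row = vec![0u8; 4 * 2];
    row[0..4].copy_from_slice(&px);
    assert_eq!(unpack_ar30(&row[0..4]), (1023, 512, 0, 3));
}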
let weight = k_weight as i32; - let offset = src_stride * py + v_start_px; - let src_ptr = &src[offset..(offset + BUFFER_SIZE)]; + let offset = src_stride * py + v_start_px * 4; + let src_ptr = &src[offset..(offset + BUFFER_SIZE * 4)]; - for (dst, src) in direct_store.iter_mut().zip(src_ptr.iter()) { + for (dst, src) in direct_store.iter_mut().zip(src_ptr.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } } - let v_dst = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)]; - for (dst, src) in v_dst.iter_mut().zip(direct_store) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst = &mut dst[v_start_px * 4..(v_start_px * 4 + BUFFER_SIZE * 4)]; + for (dst, src) in v_dst.chunks_exact_mut(4).zip(direct_store) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } } @@ -91,9 +97,9 @@ fn convolve_column_handler_fixed_point_direct_buffer_double< const AR30_ORDER: usize, const BUFFER_SIZE: usize, >( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], filter: &[i16], bounds: &FilterBounds, x: usize, @@ -110,15 +116,15 @@ fn convolve_column_handler_fixed_point_direct_buffer_double< let py = bounds.start; let weight = filter[0] as i32; - let offset = src_stride * py + v_start_px; - let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; - let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; + let offset = src_stride * py + v_start_px * 4; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE * 4)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE * 4)..(offset + BUFFER_SIZE * 2 * 4)]; - for (dst, src) in direct_store0.iter_mut().zip(src_ptr0) { + for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store1.iter_mut().zip(src_ptr1) { + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } @@ -126,28 +132,41 @@ fn convolve_column_handler_fixed_point_direct_buffer_double< // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; let weight = k_weight as i32; - let offset = src_stride * py + v_start_px; - let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; - let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; + let offset = src_stride * py + v_start_px * 4; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE * 4)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE * 4)..(offset + BUFFER_SIZE * 2 * 4)]; - for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.iter()) { + for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.iter()) { + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } } - let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)]; - for (dst, src) in v_dst0.iter_mut().zip(direct_store0) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst0 = &mut dst[v_start_px * 4..(v_start_px * 4 + BUFFER_SIZE * 4)]; + for (dst, src) in v_dst0.chunks_exact_mut(4).zip(direct_store0) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] 
= saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } - let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE)..(v_start_px + BUFFER_SIZE * 2)]; - for (dst, src) in v_dst1.iter_mut().zip(direct_store1) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst1 = + &mut dst[(v_start_px * 4 + BUFFER_SIZE * 4)..(v_start_px * 4 + BUFFER_SIZE * 2 * 4)]; + for (dst, src) in v_dst1.chunks_exact_mut(4).zip(direct_store1) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } } @@ -160,9 +179,9 @@ fn convolve_column_handler_fixed_point_direct_buffer_four< const AR30_ORDER: usize, const BUFFER_SIZE: usize, >( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], filter: &[i16], bounds: &FilterBounds, x: usize, @@ -179,29 +198,29 @@ fn convolve_column_handler_fixed_point_direct_buffer_four< let mut direct_store3: [ColorGroup<4, i32>; BUFFER_SIZE] = [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE]; - let v_start_px = x; + let v_start_px = x * 4; let py = bounds.start; let weight = filter[0] as i32; let offset = src_stride * py + v_start_px; - let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; - let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; - let src_ptr2 = &src[(offset + BUFFER_SIZE * 2)..(offset + BUFFER_SIZE * 3)]; - let src_ptr3 = &src[(offset + BUFFER_SIZE * 3)..(offset + BUFFER_SIZE * 4)]; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE * 4)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE * 4)..(offset + BUFFER_SIZE * 2 * 4)]; + let src_ptr2 = &src[(offset + BUFFER_SIZE * 2 * 4)..(offset + BUFFER_SIZE * 3 * 4)]; + let src_ptr3 = &src[(offset + BUFFER_SIZE * 3 * 4)..(offset + BUFFER_SIZE * 4 * 4)]; - for (dst, src) in direct_store0.iter_mut().zip(src_ptr0) { + for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store1.iter_mut().zip(src_ptr1) { + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store2.iter_mut().zip(src_ptr2) { + for (dst, src) in direct_store2.iter_mut().zip(src_ptr2.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store3.iter_mut().zip(src_ptr3) { + for (dst, src) in direct_store3.iter_mut().zip(src_ptr3.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } @@ -210,60 +229,84 @@ fn convolve_column_handler_fixed_point_direct_buffer_four< let py = bounds.start + j + 1; let weight = k_weight as i32; let offset = src_stride * py + v_start_px; - let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; - let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; - let src_ptr2 = &src[(offset + BUFFER_SIZE * 2)..(offset + BUFFER_SIZE * 3)]; - let src_ptr3 = &src[(offset + BUFFER_SIZE * 3)..(offset + BUFFER_SIZE * 4)]; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE * 4)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE * 4)..(offset + BUFFER_SIZE * 2 * 4)]; + let src_ptr2 = &src[(offset + BUFFER_SIZE * 2 * 4)..(offset + BUFFER_SIZE * 3 * 4)]; + let src_ptr3 = &src[(offset + BUFFER_SIZE * 3 * 4)..(offset + BUFFER_SIZE * 4 * 4)]; - for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.iter()) { + for (dst, src) in 
direct_store0.iter_mut().zip(src_ptr0.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.iter()) { + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store2.iter_mut().zip(src_ptr2.iter()) { + for (dst, src) in direct_store2.iter_mut().zip(src_ptr2.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store3.iter_mut().zip(src_ptr3.iter()) { + for (dst, src) in direct_store3.iter_mut().zip(src_ptr3.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } } - let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)]; - for (dst, src) in v_dst0.iter_mut().zip(direct_store0) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE * 4)]; + for (dst, src) in v_dst0.chunks_exact_mut(4).zip(direct_store0) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } - let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE)..(v_start_px + BUFFER_SIZE * 2)]; - for (dst, src) in v_dst1.iter_mut().zip(direct_store1) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE * 4)..(v_start_px + BUFFER_SIZE * 2 * 4)]; + for (dst, src) in v_dst1.chunks_exact_mut(4).zip(direct_store1) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } - let v_dst2 = &mut dst[(v_start_px + BUFFER_SIZE * 2)..(v_start_px + BUFFER_SIZE * 3)]; - for (dst, src) in v_dst2.iter_mut().zip(direct_store2) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst2 = &mut dst[(v_start_px + BUFFER_SIZE * 2 * 4)..(v_start_px + BUFFER_SIZE * 3 * 4)]; + for (dst, src) in v_dst2.chunks_exact_mut(4).zip(direct_store2) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } - let v_dst3 = &mut dst[(v_start_px + BUFFER_SIZE * 3)..(v_start_px + BUFFER_SIZE * 4)]; - for (dst, src) in v_dst3.iter_mut().zip(direct_store3) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst3 = &mut dst[(v_start_px + BUFFER_SIZE * 3 * 4)..(v_start_px + BUFFER_SIZE * 4 * 4)]; + for (dst, src) in v_dst3.chunks_exact_mut(4).zip(direct_store3) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } } pub(crate) fn column_handler_fixed_point_ar30( bounds: &FilterBounds, - src: &[u32], - dst: &mut [u32], + src: &[u8], + dst: &mut [u8], src_stride: usize, weight: &[i16], ) { let mut cx = 0usize; - let total_width = dst.len(); + let total_width = dst.len() / 4; while cx + 64 < total_width { convolve_column_handler_fixed_point_direct_buffer_four::( diff --git a/src/image_store.rs b/src/image_store.rs index 17fff1a..4f9c2b5 100644 --- a/src/image_store.rs +++ b/src/image_store.rs @@ -27,14 +27,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ use crate::alpha_check::has_non_constant_cap_alpha_rgba_f32; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] use crate::alpha_handle_f16::{premultiply_alpha_rgba_f16, unpremultiply_alpha_rgba_f16}; use crate::alpha_handle_f32::{premultiply_alpha_rgba_f32, unpremultiply_alpha_rgba_f32}; use crate::alpha_handle_u16::{premultiply_alpha_rgba_u16, unpremultiply_alpha_rgba_u16}; use crate::alpha_handle_u8::{premultiply_alpha_rgba, unpremultiply_alpha_rgba}; use crate::pic_scale_error::{PicScaleBufferMismatch, PicScaleError}; use crate::ImageSize; -use num_traits::FromPrimitive; +#[cfg(feature = "nightly_f16")] +use core::f16; use rayon::ThreadPool; use std::borrow::Cow; use std::fmt::Debug; @@ -51,7 +52,7 @@ use std::fmt::Debug; #[derive(Debug, Clone)] pub struct ImageStore<'a, T, const N: usize> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { pub buffer: std::borrow::Cow<'a, [T]>, /// Channels in the image @@ -78,7 +79,7 @@ where #[derive(Debug)] pub struct ImageStoreMut<'a, T, const N: usize> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { pub buffer: BufferStore<'a, T>, /// Channels in the image @@ -123,7 +124,7 @@ impl BufferStore<'_, T> { impl<'a, T, const N: usize> ImageStore<'a, T, N> where - T: FromPrimitive + Clone + Copy + Debug + Default, + T: Clone + Copy + Debug + Default, { pub fn new( slice_ref: Vec, @@ -151,7 +152,7 @@ where } pub fn alloc(width: usize, height: usize) -> ImageStore<'a, T, N> { - let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; + let vc = vec![T::default(); width * N * height]; ImageStore:: { buffer: std::borrow::Cow::Owned(vc), channels: N, @@ -175,8 +176,8 @@ impl CheckStoreDensity for ImageStoreMut<'_, f32, N> { } } -#[cfg(feature = "half")] -impl CheckStoreDensity for ImageStoreMut<'_, half::f16, N> { +#[cfg(feature = "nightly_f16")] +impl CheckStoreDensity for ImageStoreMut<'_, f16, N> { fn should_have_bit_depth(&self) -> bool { false } @@ -190,7 +191,7 @@ impl CheckStoreDensity for ImageStoreMut<'_, u16, N> { impl ImageStoreMut<'_, T, N> where - T: FromPrimitive + Clone + Copy + Debug + Default, + T: Clone + Copy + Debug + Default, { pub(crate) fn validate(&self) -> Result<(), PicScaleError> { let expected_size = self.stride() * self.height; @@ -212,7 +213,7 @@ where impl ImageStore<'_, T, N> where - T: FromPrimitive + Clone + Copy + Debug + Default, + T: Clone + Copy + Debug + Default, { pub(crate) fn validate(&self) -> Result<(), PicScaleError> { let expected_size = self.stride() * self.height; @@ -234,7 +235,7 @@ where impl<'a, T, const N: usize> ImageStoreMut<'a, T, N> where - T: FromPrimitive + Clone + Copy + Debug + Default, + T: Clone + Copy + Debug + Default, { /// Creates new mutable storage from vectors /// @@ -268,7 +269,7 @@ where /// /// Always sets bit depth to `0` pub fn alloc(width: usize, height: usize) -> ImageStoreMut<'a, T, N> { - let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; + let vc = vec![T::default(); width * N * height]; ImageStoreMut:: { buffer: BufferStore::Owned(vc), channels: N, @@ -285,7 +286,7 @@ where height: usize, bit_depth: usize, ) -> ImageStoreMut<'a, T, N> { - let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; + let vc = vec![T::default(); width * N * height]; ImageStoreMut:: { buffer: BufferStore::Owned(vc), channels: N, @@ -299,7 +300,7 @@ where impl ImageStoreMut<'_, T, N> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { /// Returns safe stride /// @@ 
-315,7 +316,7 @@ where impl ImageStore<'_, T, N> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { /// Returns safe stride /// @@ -331,7 +332,7 @@ where impl<'a, T, const N: usize> ImageStore<'a, T, N> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { /// Returns bounded image size pub fn get_size(&self) -> ImageSize { @@ -398,7 +399,7 @@ where impl<'a, T, const N: usize> ImageStoreMut<'a, T, N> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { /// Returns bounded image size pub fn get_size(&self) -> ImageSize { @@ -460,12 +461,12 @@ where } } -pub(crate) trait AssociateAlpha { +pub(crate) trait AssociateAlpha { fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, T, N>, pool: &Option); fn is_alpha_premultiplication_needed(&self) -> bool; } -pub(crate) trait UnassociateAlpha { +pub(crate) trait UnassociateAlpha { fn unpremultiply_alpha(&mut self, pool: &Option); } @@ -615,13 +616,9 @@ impl AssociateAlpha for ImageStore<'_, f32, 4> { } } -#[cfg(feature = "half")] -impl AssociateAlpha for ImageStore<'_, half::f16, 4> { - fn premultiply_alpha( - &self, - into: &mut ImageStoreMut<'_, half::f16, 4>, - pool: &Option, - ) { +#[cfg(feature = "nightly_f16")] +impl AssociateAlpha for ImageStore<'_, f16, 4> { + fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, f16, 4>, pool: &Option) { let src_stride = self.stride(); let dst_stride = into.stride(); let dst = into.buffer.borrow_mut(); @@ -665,8 +662,8 @@ impl UnassociateAlpha for ImageStoreMut<'_, f32, 4> { } } -#[cfg(feature = "half")] -impl UnassociateAlpha for ImageStoreMut<'_, half::f16, 4> { +#[cfg(feature = "nightly_f16")] +impl UnassociateAlpha for ImageStoreMut<'_, f16, 4> { fn unpremultiply_alpha(&mut self, pool: &Option) { let stride = self.stride(); let dst = self.buffer.borrow_mut(); @@ -692,22 +689,22 @@ pub type Rgba16ImageStoreMut<'a> = ImageStoreMut<'a, u16, 4>; pub type Rgb16ImageStore<'a> = ImageStore<'a, u16, 3>; pub type Rgb16ImageStoreMut<'a> = ImageStoreMut<'a, u16, 3>; -#[cfg(feature = "half")] -pub type PlanarF16ImageStore<'a> = ImageStore<'a, half::f16, 1>; -#[cfg(feature = "half")] -pub type PlanarF16ImageStoreMut<'a> = ImageStoreMut<'a, half::f16, 1>; -#[cfg(feature = "half")] -pub type CbCrF16ImageStore<'a> = ImageStore<'a, half::f16, 2>; -#[cfg(feature = "half")] -pub type CbCrF16ImageStoreMut<'a> = ImageStoreMut<'a, half::f16, 2>; -#[cfg(feature = "half")] -pub type RgbaF16ImageStore<'a> = ImageStore<'a, half::f16, 4>; -#[cfg(feature = "half")] -pub type RgbaF16ImageStoreMut<'a> = ImageStoreMut<'a, half::f16, 4>; -#[cfg(feature = "half")] -pub type RgbF16ImageStore<'a> = ImageStore<'a, half::f16, 3>; -#[cfg(feature = "half")] -pub type RgbF16ImageStoreMut<'a> = ImageStoreMut<'a, half::f16, 3>; +#[cfg(feature = "nightly_f16")] +pub type PlanarF16ImageStore<'a> = ImageStore<'a, f16, 1>; +#[cfg(feature = "nightly_f16")] +pub type PlanarF16ImageStoreMut<'a> = ImageStoreMut<'a, f16, 1>; +#[cfg(feature = "nightly_f16")] +pub type CbCrF16ImageStore<'a> = ImageStore<'a, f16, 2>; +#[cfg(feature = "nightly_f16")] +pub type CbCrF16ImageStoreMut<'a> = ImageStoreMut<'a, f16, 2>; +#[cfg(feature = "nightly_f16")] +pub type RgbaF16ImageStore<'a> = ImageStore<'a, f16, 4>; +#[cfg(feature = "nightly_f16")] +pub type RgbaF16ImageStoreMut<'a> = ImageStoreMut<'a, f16, 4>; +#[cfg(feature = "nightly_f16")] +pub type RgbF16ImageStore<'a> = ImageStore<'a, f16, 3>; +#[cfg(feature = "nightly_f16")] +pub type RgbF16ImageStoreMut<'a> = ImageStoreMut<'a, f16, 
3>; pub type PlanarF32ImageStore<'a> = ImageStore<'a, f32, 1>; pub type PlanarF32ImageStoreMut<'a> = ImageStoreMut<'a, f32, 1>; diff --git a/src/lib.rs b/src/lib.rs index 8ba1341..1d31842 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,13 +29,38 @@ #![deny(deprecated)] // #![deny(unreachable_code, unused)] #![allow(clippy::too_many_arguments)] -#![cfg_attr(feature = "nightly_avx512", feature(cfg_version))] -#![cfg_attr(feature = "nightly_avx512", feature(avx512_target_feature))] -#![cfg_attr(feature = "nightly_avx512", feature(stdarch_x86_avx512))] -#![cfg_attr(feature = "nightly_avx512fp16", feature(stdarch_x86_avx512_f16))] +#![cfg_attr( + all( + feature = "nightly_avx512", + any(target_arch = "x86", target_arch = "x86_64") + ), + feature(cfg_version) +)] +#![cfg_attr( + all( + feature = "nightly_avx512", + any(target_arch = "x86", target_arch = "x86_64") + ), + feature(avx512_target_feature) +)] +#![cfg_attr( + all( + feature = "nightly_avx512", + any(target_arch = "x86", target_arch = "x86_64") + ), + feature(stdarch_x86_avx512) +)] +#![cfg_attr( + all( + feature = "nightly_avx512fp16", + any(target_arch = "x86", target_arch = "x86_64") + ), + feature(stdarch_x86_avx512_f16) +)] +#![cfg_attr(feature = "nightly_f16", feature(f16))] mod alpha_check; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod alpha_handle_f16; mod alpha_handle_f32; mod alpha_handle_u16; @@ -58,12 +83,12 @@ mod convolution; mod convolve_naive_f32; mod cpu_features; mod dispatch_group_ar30; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod dispatch_group_f16; mod dispatch_group_f32; mod dispatch_group_u16; mod dispatch_group_u8; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod f16; mod filter_weights; mod fixed_point_horizontal; @@ -95,7 +120,7 @@ mod rgba_u8; mod sampler; mod saturate_narrow; mod scaler; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod scaler_f16; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] mod sse; @@ -118,7 +143,7 @@ pub use image_store::{ RgbF32ImageStore, RgbF32ImageStoreMut, Rgba16ImageStore, Rgba16ImageStoreMut, Rgba8ImageStore, Rgba8ImageStoreMut, RgbaF32ImageStore, RgbaF32ImageStoreMut, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub use image_store::{ CbCrF16ImageStore, CbCrF16ImageStoreMut, PlanarF16ImageStore, PlanarF16ImageStoreMut, RgbF16ImageStore, RgbF16ImageStoreMut, RgbaF16ImageStore, RgbaF16ImageStoreMut, @@ -129,5 +154,5 @@ pub use sampler::*; pub use scaler::Scaling; pub use scaler::ScalingF32; pub use scaler::ScalingU16; -pub use scaler::{ImageStoreScaling, Scaler, ScalingOptions}; +pub use scaler::{ImageStoreScaling, Scaler, ScalingOptions, WorkloadStrategy}; pub use threading_policy::*; diff --git a/src/mixed_storage.rs b/src/mixed_storage.rs index 1cc63ba..3c14ccc 100644 --- a/src/mixed_storage.rs +++ b/src/mixed_storage.rs @@ -55,12 +55,15 @@ impl MixedStorage for f32 { } } -#[cfg(feature = "half")] -impl MixedStorage for f32 { +#[cfg(feature = "nightly_f16")] +use core::f16; + +#[cfg(feature = "nightly_f16")] +impl MixedStorage for f32 { #[inline(always)] #[allow(clippy::manual_clamp)] - fn to_mixed(self, _: u32) -> half::f16 { - half::f16::from_f32(self) + fn to_mixed(self, _: u32) -> f16 { + self as f16 } } diff --git a/src/neon/alpha_f16.rs b/src/neon/alpha_f16.rs index eac8f01..40adf23 100644 --- a/src/neon/alpha_f16.rs +++ b/src/neon/alpha_f16.rs @@ -31,11 +31,12 @@ use std::arch::aarch64::*; use crate::alpha_handle_f16::{premultiply_pixel_f16_row, unpremultiply_pixel_f16_row}; use 
crate::neon::f16_utils::*; +use core::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; -unsafe fn neon_premultiply_alpha_rgba_row_f16(dst: &mut [half::f16], src: &[half::f16]) { +unsafe fn neon_premultiply_alpha_rgba_row_f16(dst: &mut [f16], src: &[f16]) { let mut rem = dst; let mut src_rem = src; @@ -89,9 +90,9 @@ unsafe fn neon_premultiply_alpha_rgba_row_f16(dst: &mut [half::f16], src: &[half } pub(crate) fn neon_premultiply_alpha_rgba_f16( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, _: usize, @@ -114,7 +115,7 @@ pub(crate) fn neon_premultiply_alpha_rgba_f16( } } -unsafe fn neon_unpremultiply_alpha_rgba_row_f16(in_place: &mut [half::f16]) { +unsafe fn neon_unpremultiply_alpha_rgba_row_f16(in_place: &mut [f16]) { let mut rem = in_place; for dst in rem.chunks_exact_mut(8 * 4) { @@ -182,7 +183,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_row_f16(in_place: &mut [half::f16]) { } pub(crate) fn neon_unpremultiply_alpha_rgba_f16( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, _: usize, diff --git a/src/neon/alpha_f16_full.rs b/src/neon/alpha_f16_full.rs index 633f7d7..29eb0ad 100644 --- a/src/neon/alpha_f16_full.rs +++ b/src/neon/alpha_f16_full.rs @@ -31,12 +31,13 @@ use std::arch::aarch64::*; use crate::alpha_handle_f16::{premultiply_pixel_f16_row, unpremultiply_pixel_f16_row}; use crate::neon::f16_utils::*; +use core::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; #[target_feature(enable = "fp16")] -unsafe fn neon_premultiply_alpha_rgba_row_f16_full(dst: &mut [half::f16], src: &[half::f16]) { +unsafe fn neon_premultiply_alpha_rgba_row_f16_full(dst: &mut [f16], src: &[f16]) { let mut rem = dst; let mut src_rem = src; @@ -66,9 +67,9 @@ unsafe fn neon_premultiply_alpha_rgba_row_f16_full(dst: &mut [half::f16], src: & } pub(crate) fn neon_premultiply_alpha_rgba_f16_full( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, _: usize, @@ -95,7 +96,7 @@ pub(crate) fn neon_premultiply_alpha_rgba_f16_full( } #[target_feature(enable = "fp16")] -unsafe fn neon_unpremultiply_alpha_rgba_f16_row_full(in_place: &mut [half::f16]) { +unsafe fn neon_unpremultiply_alpha_rgba_f16_row_full(in_place: &mut [f16]) { let mut rem = in_place; for dst in rem.chunks_exact_mut(8 * 4) { @@ -137,7 +138,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_f16_row_full(in_place: &mut [half::f16]) } pub(crate) fn neon_unpremultiply_alpha_rgba_f16_full( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, _: usize, diff --git a/src/neon/ar30.rs b/src/neon/ar30.rs index d846e11..db16224 100644 --- a/src/neon/ar30.rs +++ b/src/neon/ar30.rs @@ -95,9 +95,9 @@ pub(crate) unsafe fn vunzips_4_ar30( +pub(crate) unsafe fn vunzip_3_ar30( v: uint32x4x2_t, -) -> int16x8x4_t { +) -> int16x8x3_t { let mask = vdupq_n_u32(0x3ff); let ar_type: Rgb30 = AR30_TYPE.into(); @@ -121,45 +121,13 @@ pub(crate) unsafe fn vunzip_4_ar30(v.0), mask)), vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v.1), mask)), ); - let va = vcombine_u16( - vmovn_u32(vshrq_n_u32::<30>(v.0)), - vmovn_u32(vshrq_n_u32::<30>(v.1)), - ); - let a = vorrq_u16( - vorrq_u16( - vorrq_u16( - vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)), - vshlq_n_u16::<4>(va), - ), - 
vshlq_n_u16::<2>(va), - ), - va, - ); - int16x8x4_t( + int16x8x3_t( vreinterpretq_s16_u16(r), vreinterpretq_s16_u16(g), vreinterpretq_s16_u16(b), - vreinterpretq_s16_u16(a), ) } Rgb30::Ra30 => { - let a_mask = vdupq_n_u32(0x3); - let va = vcombine_u16( - vmovn_u32(vandq_u32(v.0, a_mask)), - vmovn_u32(vandq_u32(v.1, a_mask)), - ); - - let a = vorrq_u16( - vorrq_u16( - vorrq_u16( - vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)), - vshlq_n_u16::<4>(va), - ), - vshlq_n_u16::<2>(va), - ), - va, - ); - let r = vcombine_u16( vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.0), mask)), vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.1), mask)), @@ -172,23 +140,110 @@ pub(crate) unsafe fn vunzip_4_ar30(v.0), mask)), vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v.1), mask)), ); - int16x8x4_t( + int16x8x3_t( vreinterpretq_s16_u16(r), vreinterpretq_s16_u16(g), vreinterpretq_s16_u16(b), - vreinterpretq_s16_u16(a), ) } } } +// #[inline(always)] +// pub(crate) unsafe fn vunzip_4_ar30( +// v: uint32x4x2_t, +// ) -> int16x8x4_t { +// let mask = vdupq_n_u32(0x3ff); +// let ar_type: Rgb30 = AR30_TYPE.into(); +// +// let v = if AR30_ORDER == 0 { +// v +// } else { +// uint32x4x2_t(vrev128_u32(v.0), vrev128_u32(v.1)) +// }; +// +// match ar_type { +// Rgb30::Ar30 => { +// let r = vcombine_u16( +// vmovn_u32(vandq_u32(v.0, mask)), +// vmovn_u32(vandq_u32(v.1, mask)), +// ); +// let g = vcombine_u16( +// vmovn_u32(vandq_u32(vshrq_n_u32::<10>(v.0), mask)), +// vmovn_u32(vandq_u32(vshrq_n_u32::<10>(v.1), mask)), +// ); +// let b = vcombine_u16( +// vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v.0), mask)), +// vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v.1), mask)), +// ); +// let va = vcombine_u16( +// vmovn_u32(vshrq_n_u32::<30>(v.0)), +// vmovn_u32(vshrq_n_u32::<30>(v.1)), +// ); +// let a = vorrq_u16( +// vorrq_u16( +// vorrq_u16( +// vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)), +// vshlq_n_u16::<4>(va), +// ), +// vshlq_n_u16::<2>(va), +// ), +// va, +// ); +// int16x8x4_t( +// vreinterpretq_s16_u16(r), +// vreinterpretq_s16_u16(g), +// vreinterpretq_s16_u16(b), +// vreinterpretq_s16_u16(a), +// ) +// } +// Rgb30::Ra30 => { +// let a_mask = vdupq_n_u32(0x3); +// let va = vcombine_u16( +// vmovn_u32(vandq_u32(v.0, a_mask)), +// vmovn_u32(vandq_u32(v.1, a_mask)), +// ); +// +// let a = vorrq_u16( +// vorrq_u16( +// vorrq_u16( +// vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)), +// vshlq_n_u16::<4>(va), +// ), +// vshlq_n_u16::<2>(va), +// ), +// va, +// ); +// +// let r = vcombine_u16( +// vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.0), mask)), +// vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.1), mask)), +// ); +// let g = vcombine_u16( +// vmovn_u32(vandq_u32(vshrq_n_u32::<12>(v.0), mask)), +// vmovn_u32(vandq_u32(vshrq_n_u32::<12>(v.1), mask)), +// ); +// let b = vcombine_u16( +// vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v.0), mask)), +// vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v.1), mask)), +// ); +// int16x8x4_t( +// vreinterpretq_s16_u16(r), +// vreinterpretq_s16_u16(g), +// vreinterpretq_s16_u16(b), +// vreinterpretq_s16_u16(a), +// ) +// } +// } +// } + #[inline(always)] -pub(crate) unsafe fn vunzip_4_ar30_separate( +pub(crate) unsafe fn vunzip_3_ar30_separate( v: uint32x4x2_t, ) -> int16x8x4_t { - let values = vunzip_4_ar30::(v); + let values = vunzip_3_ar30::(v); let a0 = vtrnq_s16(values.0, values.1); - let a1 = vtrnq_s16(values.2, values.3); + let a1 = vtrnq_s16(values.2, vdupq_n_s16(3)); let v1 = vtrnq_s32(vreinterpretq_s32_s16(a0.0), vreinterpretq_s32_s16(a1.0)); let v2 = vtrnq_s32(vreinterpretq_s32_s16(a0.1), 
vreinterpretq_s32_s16(a1.1)); let k0 = vreinterpretq_s16_s32(v1.0); @@ -219,12 +274,12 @@ pub(crate) unsafe fn vzip_4_ar30 uint32x4x2_t { let ar_type: Rgb30 = AR30_TYPE.into(); - let a_max = vdupq_n_s16(3); + // let a_max = vdupq_n_s16(3); match ar_type { Rgb30::Ar30 => { - let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max); - let mut a0 = vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3)))); - let mut a1 = vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3)))); + // let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max); + let mut a0 = vdupq_n_u32(3 << 30); //vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3)))); + let mut a1 = vdupq_n_u32(3 << 30); // vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3)))); let r0 = vshlq_n_u32::<20>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.2)))); let r1 = vshlq_n_u32::<20>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.2)))); @@ -248,9 +303,9 @@ pub(crate) unsafe fn vzip_4_ar30 { - let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max); - let mut a0 = vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3))); - let mut a1 = vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3))); + // let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max); + let mut a0 = vdupq_n_u32(3); //vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3))); + let mut a1 = vdupq_n_u32(3); //vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3))); let r0 = vshlq_n_u32::<22>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.0)))); let r1 = vshlq_n_u32::<22>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.0)))); @@ -284,9 +339,14 @@ pub(crate) unsafe fn vzip_4_ar30( - arr: &[u32], + arr: &[u8], ) -> int16x4_t { - let item = *arr.get_unchecked(0); + let item = u32::from_ne_bytes([ + *arr.get_unchecked(0), + *arr.get_unchecked(1), + *arr.get_unchecked(2), + *arr.get_unchecked(3), + ]); let ar_type: Rgb30 = AR30_TYPE.into(); let vl = ar_type.unpack::(item); let a_rep = (vl.3 as i16) << 8; diff --git a/src/neon/convolve_f16.rs b/src/neon/convolve_f16.rs index 8d0ada8..1bd57e7 100644 --- a/src/neon/convolve_f16.rs +++ b/src/neon/convolve_f16.rs @@ -31,14 +31,15 @@ use std::arch::aarch64::{vdupq_n_f32, vld1q_dup_f32}; use crate::filter_weights::FilterBounds; use crate::neon::utils::prefer_vfmaq_f32; use crate::neon::*; +use core::f16; #[inline(always)] pub(crate) unsafe fn convolve_vertical_part_neon_8_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, blend_length: usize, @@ -56,7 +57,7 @@ pub(crate) unsafe fn convolve_vertical_part_neon_8_f16 let s_ptr = src_ptr.add(px); let item_row = if USE_BLENDING { - let mut transient: [half::f16; 8] = [half::f16::from_f32(0.); 8]; + let mut transient: [f16; 8] = [0.; 8]; std::ptr::copy_nonoverlapping(s_ptr, transient.as_mut_ptr(), blend_length); xvldq_f16(transient.as_ptr()) } else { @@ -74,7 +75,7 @@ pub(crate) unsafe fn convolve_vertical_part_neon_8_f16 let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); if USE_BLENDING { - let mut transient: [half::f16; 8] = [half::f16::from_f32(0.); 8]; + let mut transient: [f16; 8] = [0.; 8]; xvstq_f16(transient.as_mut_ptr(), item); std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, blend_length); } else { diff --git a/src/neon/f16_utils.rs b/src/neon/f16_utils.rs index 8644646..de8a103 100644 --- a/src/neon/f16_utils.rs +++ b/src/neon/f16_utils.rs @@ -28,6 +28,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#[cfg(feature = "nightly_f16")] +use core::f16; use std::arch::aarch64::*; use std::arch::asm; @@ -89,22 +91,22 @@ pub(crate) struct x_float16x8x4_t( ); #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvld_f16(ptr: *const half::f16) -> x_float16x4_t { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvld_f16(ptr: *const f16) -> x_float16x4_t { let store: uint16x4_t = vld1_u16(ptr as *const _); std::mem::transmute(store) } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvldq_f16(ptr: *const half::f16) -> x_float16x8_t { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvldq_f16(ptr: *const f16) -> x_float16x8_t { let store: uint16x8_t = vld1q_u16(ptr as *const _); std::mem::transmute(store) } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvldq_f16_x2(ptr: *const f16) -> x_float16x8x2_t { let ptr_u16 = ptr as *const u16; x_float16x8x2_t( xreinterpretq_f16_u16(vld1q_u16(ptr_u16)), @@ -113,8 +115,8 @@ pub(crate) unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvldq_f16_x4(ptr: *const half::f16) -> x_float16x8x4_t { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvldq_f16_x4(ptr: *const f16) -> x_float16x8x4_t { let ptr_u16 = ptr as *const u16; x_float16x8x4_t( xreinterpretq_f16_u16(vld1q_u16(ptr_u16)), @@ -362,6 +364,330 @@ pub(super) unsafe fn xvfmla_f16( xreinterpret_f16_u16(result) } +/// Floating-point fused Multiply-Add Long to accumulator (vector). +/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_high_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_high_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x8_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.4h", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + result +} + +/// Floating-point fused Multiply-Add Long to accumulator (vector). +/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_low_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_low_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x8_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.4h", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + result +} + +/// Floating-point fused Multiply-Add Long to accumulator (vector). 
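As a sanity reference for the fmlal and fmlal2 wrappers introduced here, the widening behaviour can be modelled in scalar code: the low form consumes half-precision lanes 0..3 of each source, the high form lanes 4..7, and each product is accumulated into an f32 lane without being rounded separately first (approximated below with mul_add). The f16 inputs are shown already widened to f32, which is a simplification so the sketch compiles on stable Rust.

/// Scalar model of FMLAL (low halves); b and c are assumed pre-widened from f16.
fn fmlal_low_ref(acc: [f32; 4], b: [f32; 8], c: [f32; 8]) -> [f32; 4] {
    let mut out = acc;
    for i in 0..4 {
        // mul_add keeps the product unrounded before the addition,
        // matching the fused semantics described above.
        out[i] = b[i].mul_add(c[i], out[i]);
    }
    out
}

/// Scalar model of FMLAL2 (high halves).
fn fmlal_high_ref(acc: [f32; 4], b: [f32; 8], c: [f32; 8]) -> [f32; 4] {
    let mut out = acc;
    for i in 0..4 {
        out[i] = b[i + 4].mul_add(c[i + 4], out[i]);
    }
    out
}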
+/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_lane_low_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_lane_low_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x4_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + static_assert_uimm_bits!(LANE, 3); + let full_lane = xvcombine_f16(c, c); + if LANE == 0 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 2 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[2]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 3 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[3]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } + result +} + +/// Floating-point fused Multiply-Add Long to accumulator (vector). +/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. 
+/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_laneq_low_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_laneq_low_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x8_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + static_assert_uimm_bits!(LANE, 3); + if LANE == 0 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 2 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[2]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 3 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[3]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 4 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[4]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 5 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[5]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 6 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[6]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 7 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[7]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } + result +} + +/// Floating-point fused Multiply-Add Long to accumulator (vector). +/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. 
+/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/xvfmlalq_lane_high_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_lane_high_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x4_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + static_assert_uimm_bits!(LANE, 3); + let full_lane = xvcombine_f16(c, c); + if LANE == 0 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 2 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[2]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 3 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[3]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } + result +} + +/// Floating-point fused Multiply-Add Long to accumulator (vector). +/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/xvfmlalq_laneq_high_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_laneq_high_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x8_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + static_assert_uimm_bits!(LANE, 3); + if LANE == 0 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 2 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[2]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 3 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[3]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 4 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[4]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 5 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[5]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 6 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[6]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 7 { + asm!( + "fmlal2 {0:v}.4s, 
{1:v}.4h, {2:v}.h[7]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } + result +} + /// Floating-point fused Multiply-Add to accumulator (vector). /// This instruction multiplies corresponding floating-point values in the vectors /// in the two source SIMD&FP registers, adds the product to the corresponding @@ -665,28 +991,28 @@ pub(super) unsafe fn xvbslq_f16( } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvst_f16(ptr: *mut half::f16, x: x_float16x4_t) { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvst_f16(ptr: *mut f16, x: x_float16x4_t) { vst1_u16(ptr as *mut u16, xreinterpret_u16_f16(x)) } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvstq_f16(ptr: *mut half::f16, x: x_float16x8_t) { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvstq_f16(ptr: *mut f16, x: x_float16x8_t) { vst1q_u16(ptr as *mut u16, xreinterpretq_u16_f16(x)) } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvstq_f16_x2(ptr: *mut half::f16, x: x_float16x8x2_t) { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvstq_f16_x2(ptr: *mut f16, x: x_float16x8x2_t) { let ptr_u16 = ptr as *mut u16; vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0)); vst1q_u16(ptr_u16.add(8), xreinterpretq_u16_f16(x.1)); } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvstq_f16_x4(ptr: *const half::f16, x: x_float16x8x4_t) { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvstq_f16_x4(ptr: *const f16, x: x_float16x8x4_t) { let ptr_u16 = ptr as *mut u16; vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0)); vst1q_u16(ptr_u16.add(8), xreinterpretq_u16_f16(x.1)); @@ -705,9 +1031,9 @@ pub(crate) unsafe fn xvdup_laneq_f16(a: x_float16x8_t) -> x_float1 } #[inline] -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) unsafe fn xvld1q_lane_f16( - ptr: *const half::f16, + ptr: *const f16, src: x_float16x8_t, ) -> x_float16x8_t { xreinterpretq_f16_u16(vld1q_lane_u16::( @@ -717,11 +1043,8 @@ pub(crate) unsafe fn xvld1q_lane_f16( } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvsetq_lane_f16( - v: half::f16, - r: x_float16x8_t, -) -> x_float16x8_t { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvsetq_lane_f16(v: f16, r: x_float16x8_t) -> x_float16x8_t { xreinterpretq_f16_u16(vsetq_lane_u16::( v.to_bits(), xreinterpretq_u16_f16(r), diff --git a/src/neon/horizontal_ar30.rs b/src/neon/horizontal_ar30_rdm.rs similarity index 88% rename from src/neon/horizontal_ar30.rs rename to src/neon/horizontal_ar30_rdm.rs index ea489ba..0aa7e99 100644 --- a/src/neon/horizontal_ar30.rs +++ b/src/neon/horizontal_ar30_rdm.rs @@ -28,7 +28,7 @@ */ use crate::filter_weights::FilterWeights; use crate::neon::ar30::{ - vextract_ar30, vld1_ar30_s16, vunzip_4_ar30_separate, vunzips_4_ar30_separate, + vextract_ar30, vld1_ar30_s16, vunzip_3_ar30_separate, vunzips_4_ar30_separate, }; use std::arch::aarch64::*; @@ -39,11 +39,11 @@ unsafe fn conv_horiz_rgba_1_u8_i16< const AR_ORDER: usize, >( start_x: usize, - src: &[u32], + src: &[u8], w0: int16x4_t, store: int16x4_t, ) -> int16x4_t { - let src_ptr = src.get_unchecked(start_x..); + let src_ptr = src.get_unchecked(start_x * 4..); let ld = vld1_ar30_s16::(src_ptr); let rgba_pixel = vshl_n_s16::(ld); vqrdmlah_s16(store, rgba_pixel, w0) @@ -56,14 +56,15 @@ unsafe fn conv_horiz_rgba_8_u8_i16< const AR_ORDER: usize, >( start_x: usize, - src: &[u32], + src: &[u8], set1: (int16x4_t, int16x4_t, int16x4_t, int16x4_t), set2: (int16x4_t, int16x4_t, 
int16x4_t, int16x4_t), store: int16x4_t, ) -> int16x4_t { - let src_ptr = src.get_unchecked(start_x..); + let src_ptr = src.get_unchecked(start_x * 4..); - let rgba_pixel = vunzip_4_ar30_separate::(vld1q_u32_x2(src_ptr.as_ptr())); + let rgba_pixel = + vunzip_3_ar30_separate::(vld1q_u32_x2(src_ptr.as_ptr() as *const _)); let hi0 = vshlq_n_s16::(rgba_pixel.1); let lo0 = vshlq_n_s16::(rgba_pixel.0); @@ -88,16 +89,17 @@ unsafe fn conv_horiz_rgba_4_u8_i16< const AR_ORDER: usize, >( start_x: usize, - src: &[u32], + src: &[u8], w0: int16x4_t, w1: int16x4_t, w2: int16x4_t, w3: int16x4_t, store: int16x4_t, ) -> int16x4_t { - let src_ptr = src.get_unchecked(start_x..); + let src_ptr = src.get_unchecked(start_x * 4..); - let rgba_pixel = vunzips_4_ar30_separate::(vld1q_u32(src_ptr.as_ptr())); + let rgba_pixel = + vunzips_4_ar30_separate::(vld1q_u32(src_ptr.as_ptr() as *const _)); let hi = vshlq_n_s16::(rgba_pixel.1); let lo = vshlq_n_s16::(rgba_pixel.0); @@ -115,9 +117,9 @@ pub(crate) fn neon_convolve_horizontal_rgba_rows_4_ar30< const AR_TYPE: usize, const AR_ORDER: usize, >( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, ) { @@ -134,9 +136,9 @@ pub(crate) fn neon_convolve_horizontal_rgba_rows_4_ar30< #[target_feature(enable = "rdm")] unsafe fn neon_convolve_horizontal_rgba_rows_4_impl( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, ) { @@ -153,10 +155,10 @@ unsafe fn neon_convolve_horizontal_rgba_rows_4_impl(store_16_0); - *chunk0 = packed0; - let packed1 = vextract_ar30::(store_16_1); - *chunk1 = packed1; - let packed2 = vextract_ar30::(store_16_2); - *chunk2 = packed2; - let packed3 = vextract_ar30::(store_16_3); - *chunk3 = packed3; + let packed0 = vextract_ar30::(store_16_0).to_ne_bytes(); + chunk0[0] = packed0[0]; + chunk0[1] = packed0[1]; + chunk0[2] = packed0[2]; + chunk0[3] = packed0[3]; + let packed1 = vextract_ar30::(store_16_1).to_ne_bytes(); + chunk1[0] = packed1[0]; + chunk1[1] = packed1[1]; + chunk1[2] = packed1[2]; + chunk1[3] = packed1[3]; + let packed2 = vextract_ar30::(store_16_2).to_ne_bytes(); + chunk2[0] = packed2[0]; + chunk2[1] = packed2[1]; + chunk2[2] = packed2[2]; + chunk2[3] = packed2[3]; + let packed3 = vextract_ar30::(store_16_3).to_ne_bytes(); + chunk3[0] = packed3[0]; + chunk3[1] = packed3[1]; + chunk3[2] = packed3[2]; + chunk3[3] = packed3[3]; } } } diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 4ccabc4..ca5f2bb 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -26,53 +26,69 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
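Because the AR30 rows are now passed as byte slices rather than u32 slices, every pixel access goes through an explicit native-endian reassembly, which is also why the convolution helpers index with start_x * 4. The scalar equivalent of the load and store pattern used above is roughly:

/// Assemble one packed AR30 pixel from a byte row (x is a pixel index).
fn load_ar30(row: &[u8], x: usize) -> u32 {
    let p = &row[x * 4..x * 4 + 4];
    u32::from_ne_bytes([p[0], p[1], p[2], p[3]])
}

/// Scatter one packed AR30 pixel back into a byte row.
fn store_ar30(row: &mut [u8], x: usize, px: u32) {
    row[x * 4..x * 4 + 4].copy_from_slice(&px.to_ne_bytes());
}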
*/ -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod alpha_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod alpha_f16_full; mod alpha_f32; mod alpha_u16; mod alpha_u8; +#[cfg(feature = "rdm")] mod ar30; +#[cfg(feature = "rdm")] mod cbcr8_rdm; mod check_alpha; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod convolve_f16; mod f16_utils; -mod horizontal_ar30; +#[cfg(feature = "rdm")] +mod horizontal_ar30_rdm; mod plane_f32; mod plane_u8; +#[cfg(feature = "rdm")] mod plane_u8_rdm; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod rgb_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +mod rgb_f16_fhm; +#[cfg(feature = "nightly_f16")] mod rgb_f16_full; mod rgb_f32; mod rgb_u8; +#[cfg(feature = "rdm")] mod rgb_u8_sqrdml; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod rgba_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +mod rgba_f16_fhm; +#[cfg(feature = "nightly_f16")] mod rgba_f16_full; mod rgba_f32; mod rgba_u16_lb; mod rgba_u8; +#[cfg(feature = "rdm")] +mod rgba_u8_rdm; mod utils; -mod vertical_ar30; -#[cfg(feature = "half")] +#[cfg(feature = "rdm")] +mod vertical_ar30_rdm; +#[cfg(feature = "nightly_f16")] mod vertical_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +mod vertical_f16_fhm; +#[cfg(feature = "nightly_f16")] mod vertical_f16_full; mod vertical_f32; mod vertical_u16; mod vertical_u16_lb; mod vertical_u16_lb_f16; mod vertical_u8; +#[cfg(feature = "rdm")] +mod vertical_u8_rdm; mod weights; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use alpha_f16::{neon_premultiply_alpha_rgba_f16, neon_unpremultiply_alpha_rgba_f16}; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use alpha_f16_full::{ neon_premultiply_alpha_rgba_f16_full, neon_unpremultiply_alpha_rgba_f16_full, }; @@ -81,6 +97,7 @@ pub(crate) use alpha_f32::neon_unpremultiply_alpha_rgba_f32; pub(crate) use alpha_u16::{neon_premultiply_alpha_rgba_u16, neon_unpremultiply_alpha_rgba_u16}; pub(crate) use alpha_u8::neon_premultiply_alpha_rgba; pub(crate) use alpha_u8::neon_unpremultiply_alpha_rgba; +#[cfg(feature = "rdm")] pub(crate) use cbcr8_rdm::{ convolve_horizontal_cbcr_neon_rdm_row, convolve_horizontal_cbcr_neon_rows_rdm_4_u8, }; @@ -88,18 +105,27 @@ pub(crate) use check_alpha::{ neon_has_non_constant_cap_alpha_rgba16, neon_has_non_constant_cap_alpha_rgba8, }; pub(crate) use f16_utils::*; -pub(crate) use horizontal_ar30::neon_convolve_horizontal_rgba_rows_4_ar30; +#[cfg(feature = "rdm")] +pub(crate) use horizontal_ar30_rdm::neon_convolve_horizontal_rgba_rows_4_ar30; pub(crate) use plane_f32::convolve_horizontal_plane_neon_row_one; pub(crate) use plane_f32::convolve_horizontal_plane_neon_rows_4; -pub use plane_u8::{convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_rows_4_u8}; +pub use plane_u8::{ + convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_row_q, + convolve_horizontal_plane_neon_rows_4_u8, convolve_horizontal_plane_neon_rows_4_u8_q, +}; +#[cfg(feature = "rdm")] pub(crate) use plane_u8_rdm::{ convolve_horizontal_plane_neon_rdm_row, convolve_horizontal_plane_neon_rows_rdm_4_u8, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgb_f16::{ convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +pub(crate) use rgb_f16_fhm::{ + convolve_horizontal_rgb_neon_row_one_f16_fhm, 
convolve_horizontal_rgb_neon_rows_4_f16_fhm, +}; +#[cfg(feature = "nightly_f16")] pub(crate) use rgb_f16_full::{ xconvolve_horizontal_rgb_neon_row_one_f16, xconvolve_horizontal_rgb_neon_rows_4_f16, }; @@ -107,16 +133,22 @@ pub(crate) use rgb_f32::{ convolve_horizontal_rgb_neon_row_one_f32, convolve_horizontal_rgb_neon_rows_4_f32, }; pub(crate) use rgb_u8::{ - convolve_horizontal_rgb_neon_row_one, convolve_horizontal_rgb_neon_rows_4, + convolve_horizontal_rgb_neon_row_one, convolve_horizontal_rgb_neon_row_one_q, + convolve_horizontal_rgb_neon_rows_4, convolve_horizontal_rgb_neon_rows_4_q, }; +#[cfg(feature = "rdm")] pub(crate) use rgb_u8_sqrdml::{ convolve_horizontal_rgb_neon_rdm_row_one, convolve_horizontal_rgb_neon_rdm_rows_4, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgba_f16::convolve_horizontal_rgba_neon_row_one_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgba_f16::convolve_horizontal_rgba_neon_rows_4_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +pub(crate) use rgba_f16_fhm::{ + convolve_horizontal_rgba_neon_row_one_f16_fhm, convolve_horizontal_rgba_neon_rows_4_f16_fhm, +}; +#[cfg(feature = "nightly_f16")] pub(crate) use rgba_f16_full::{ xconvolve_horizontal_rgba_neon_row_one_f16, xconvolve_horizontal_rgba_neon_rows_4_f16, }; @@ -127,19 +159,30 @@ pub(crate) use rgba_u16_lb::{ convolve_horizontal_rgba_neon_rows_4_lb_u16, convolve_horizontal_rgba_neon_u16_lb_row, }; pub(crate) use rgba_u8::{ - convolve_horizontal_rgba_neon_row, convolve_horizontal_rgba_neon_row_i16, - convolve_horizontal_rgba_neon_rows_4_u8, convolve_horizontal_rgba_neon_rows_4_u8_i16, + convolve_horizontal_rgba_neon_row, convolve_horizontal_rgba_neon_row_q, + convolve_horizontal_rgba_neon_rows_4_u8, convolve_horizontal_rgba_neon_rows_4_u8_q, +}; +#[cfg(feature = "rdm")] +pub(crate) use rgba_u8_rdm::{ + convolve_horizontal_rgba_neon_row_i16, convolve_horizontal_rgba_neon_rows_4_u8_i16, }; -pub(crate) use vertical_ar30::neon_column_handler_fixed_point_ar30; -#[cfg(feature = "half")] +#[cfg(feature = "rdm")] +pub(crate) use vertical_ar30_rdm::neon_column_handler_fixed_point_ar30; +#[cfg(feature = "nightly_f16")] pub(crate) use vertical_f16::convolve_vertical_rgb_neon_row_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +pub(crate) use vertical_f16_fhm::convolve_vertical_rgb_neon_row_f16_fhm; +#[cfg(feature = "nightly_f16")] pub(crate) use vertical_f16_full::xconvolve_vertical_rgb_neon_row_f16; pub(crate) use vertical_f32::convolve_vertical_rgb_neon_row_f32; pub(crate) use vertical_u16::convolve_column_u16; pub(crate) use vertical_u16_lb::convolve_column_lb_u16; pub(crate) use vertical_u16_lb_f16::convolve_column_lb_u16_f16; pub(crate) use vertical_u8::{ - convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision, + convolve_vertical_neon_i32_precision, convolve_vertical_neon_i32_precision_d, }; +#[cfg(feature = "rdm")] +pub(crate) use vertical_u8_rdm::convolve_vertical_neon_i16_precision; pub(crate) use weights::convert_weights_to_f16; +#[cfg(feature = "nightly_f16")] +pub(crate) use weights::convert_weights_to_f16_fhm; diff --git a/src/neon/plane_u8.rs b/src/neon/plane_u8.rs index 9dcde44..ccccb3a 100644 --- a/src/neon/plane_u8.rs +++ b/src/neon/plane_u8.rs @@ -27,55 +27,102 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
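The reorganised module list gates the RDM and f16 kernels behind the rdm and nightly_f16 cargo features; a dispatcher still has to confirm at runtime that the CPU exposes the matching extension before calling into a #[target_feature] function. A minimal sketch of such a check follows; the helper names are illustrative, not part of the crate's API.

#[cfg(all(target_arch = "aarch64", feature = "rdm"))]
fn cpu_supports_rdm() -> bool {
    // SQRDMLAH/SQRDMLSH availability (the ARMv8.1 RDM extension).
    std::arch::is_aarch64_feature_detected!("rdm")
}

#[cfg(all(target_arch = "aarch64", feature = "nightly_f16"))]
fn cpu_supports_fhm() -> bool {
    // FMLAL/FMLAL2 availability (FEAT_FHM), used by the *_fhm kernels.
    std::arch::is_aarch64_feature_detected!("fhm")
}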
*/ use crate::filter_weights::FilterWeights; -use crate::neon::utils::xvld1q_s16_x2; -use crate::support::{PRECISION, ROUNDING_CONST}; +use crate::neon::utils::{vxmlal_high_s16, vxmlal_s16, xvld1q_s16_x2}; +use crate::support::PRECISION; use std::arch::aarch64::*; -macro_rules! accumulate_16_horiz { - ($store: expr, $ptr: expr, $weights: expr) => {{ - let pixel_colors = vld1q_u8($ptr); - let px_high_16 = vreinterpretq_s16_u16(vmovl_high_u8(pixel_colors)); - let px_low_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixel_colors))); - - $store = vmlal_high_s16($store, px_high_16, $weights.1); - $store = vmlal_s16($store, vget_low_s16(px_high_16), vget_low_s16($weights.1)); - - $store = vmlal_high_s16($store, px_low_16, $weights.0); - $store = vmlal_s16($store, vget_low_s16(px_low_16), vget_low_s16($weights.0)); - }}; +#[must_use] +#[inline(always)] +unsafe fn accumulate_16_horiz( + store: int32x4_t, + ptr: &[u8], + weights: int16x8x2_t, +) -> int32x4_t { + let pixel_colors = vld1q_u8(ptr.as_ptr()); + let px_high_16 = vreinterpretq_s16_u16(vmovl_high_u8(pixel_colors)); + let px_low_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixel_colors))); + + let mut store = vxmlal_high_s16::(store, px_high_16, weights.1); + store = vxmlal_s16::(store, vget_low_s16(px_high_16), vget_low_s16(weights.1)); + + store = vxmlal_high_s16::(store, px_low_16, weights.0); + store = vxmlal_s16::(store, vget_low_s16(px_low_16), vget_low_s16(weights.0)); + store } -macro_rules! accumulate_8_horiz { - ($store: expr, $ptr: expr, $weights: expr) => {{ - let pixel_colors = vld1_u8($ptr); - let px_16 = vreinterpretq_s16_u16(vmovl_u8(pixel_colors)); +#[must_use] +#[inline(always)] +unsafe fn accumulate_8_horiz( + store: int32x4_t, + ptr: &[u8], + weight: int16x8_t, +) -> int32x4_t { + let pixel_colors = vld1_u8(ptr.as_ptr()); + let px_16 = vreinterpretq_s16_u16(vmovl_u8(pixel_colors)); + + let mut store = vxmlal_high_s16::(store, px_16, weight); + store = vxmlal_s16::(store, vget_low_s16(px_16), vget_low_s16(weight)); + store +} - $store = vmlal_high_s16($store, px_16, $weights); - $store = vmlal_s16($store, vget_low_s16(px_16), vget_low_s16($weights)); - }}; +#[inline(always)] +unsafe fn accumulate_4_horiz( + store: int32x4_t, + ptr: &[u8], + weight: int16x4_t, +) -> int32x4_t { + let pixel_colors = vmovl_u8(vreinterpret_u8_u32(vld1_lane_u32::<0>( + ptr.as_ptr() as *const u32, + vdup_n_u32(0), + ))); + let px_16 = vreinterpret_s16_u16(vget_low_u16(pixel_colors)); + vxmlal_s16::(store, px_16, weight) } -macro_rules! accumulate_4_horiz { - ($store: expr, $ptr: expr, $weights: expr) => {{ - let pixel_colors = vmovl_u8(vreinterpret_u8_u32(vld1_lane_u32::<0>( - $ptr as *const u32, - vdup_n_u32(0), - ))); - let px_16 = vreinterpret_s16_u16(vget_low_u16(pixel_colors)); +#[inline(always)] +unsafe fn accumulate_1_horiz( + store: int32x4_t, + ptr: &[u8], + weight: int16x4_t, +) -> int32x4_t { + let pixel_colors = vmovl_u8(vld1_lane_u8::<0>(ptr.as_ptr(), vdup_n_u8(0))); + let px_16 = vreinterpret_s16_u16(vget_low_u16(pixel_colors)); + vxmlal_s16::(store, px_16, weight) +} - $store = vmlal_s16($store, px_16, $weights); - }}; +pub fn convolve_horizontal_plane_neon_rows_4_u8( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + convolve_horizontal_plane_neon_rows_4_u8_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } -macro_rules! 
accumulate_1_horiz { - ($store: expr, $ptr: expr, $weight: expr) => {{ - let pixel_colors = vld1_u16([$ptr.read_unaligned() as u16, 0u16, 0u16, 0u16].as_ptr()); - let px_16 = vreinterpret_s16_u16(pixel_colors); - $store = vmlal_s16($store, px_16, $weight); - }}; +pub fn convolve_horizontal_plane_neon_rows_4_u8_q( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + convolve_horizontal_plane_neon_rows_4_u8_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } -pub fn convolve_horizontal_plane_neon_rows_4_u8( +fn convolve_horizontal_plane_neon_rows_4_u8_impl( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -92,9 +139,11 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( let iter_row2 = row2_ref.iter_mut(); let iter_row3 = row3_ref.iter_mut(); + let rnd_const = (1 << (PRECISION - 1)) - 1; + let base_val = { let j = vdupq_n_s32(0); - vsetq_lane_s32::<0>(ROUNDING_CONST, j) + vsetq_lane_s32::<0>(rnd_const, j) }; for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 @@ -125,16 +174,16 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( let bounds_start = bounds.start + jx; let src_ptr = src0.get_unchecked(bounds_start..); - accumulate_16_horiz!(store0, src_ptr.as_ptr(), weights); + store0 = accumulate_16_horiz::(store0, src_ptr, weights); let src_ptr1 = src1.get_unchecked(bounds_start..); - accumulate_16_horiz!(store1, src_ptr1.as_ptr(), weights); + store1 = accumulate_16_horiz::(store1, src_ptr1, weights); let src_ptr2 = src2.get_unchecked(bounds_start..); - accumulate_16_horiz!(store2, src_ptr2.as_ptr(), weights); + store2 = accumulate_16_horiz::(store2, src_ptr2, weights); let src_ptr3 = src3.get_unchecked(bounds_start..); - accumulate_16_horiz!(store3, src_ptr3.as_ptr(), weights); + store3 = accumulate_16_horiz::(store3, src_ptr3, weights); jx += 16; } @@ -145,16 +194,16 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( let bounds_start = bounds.start + jx; let src_ptr = src0.get_unchecked(bounds_start..); - accumulate_8_horiz!(store0, src_ptr.as_ptr(), weights); + store0 = accumulate_8_horiz::(store0, src_ptr, weights); let src_ptr1 = src1.get_unchecked(bounds_start..); - accumulate_8_horiz!(store1, src_ptr1.as_ptr(), weights); + store1 = accumulate_8_horiz::(store1, src_ptr1, weights); let src_ptr2 = src2.get_unchecked(bounds_start..); - accumulate_8_horiz!(store2, src_ptr2.as_ptr(), weights); + store2 = accumulate_8_horiz::(store2, src_ptr2, weights); let src_ptr3 = src3.get_unchecked(bounds_start..); - accumulate_8_horiz!(store3, src_ptr3.as_ptr(), weights); + store3 = accumulate_8_horiz::(store3, src_ptr3, weights); jx += 8; } @@ -165,16 +214,16 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( let bounds_start = bounds.start + jx; let src_ptr = src0.get_unchecked(bounds_start..); - accumulate_4_horiz!(store0, src_ptr.as_ptr(), weights); + accumulate_4_horiz::(store0, src_ptr, weights); let src_ptr1 = src1.get_unchecked(bounds_start..); - accumulate_4_horiz!(store1, src_ptr1.as_ptr(), weights); + accumulate_4_horiz::(store1, src_ptr1, weights); let src_ptr2 = src2.get_unchecked(bounds_start..); - accumulate_4_horiz!(store2, src_ptr2.as_ptr(), weights); + accumulate_4_horiz::(store2, src_ptr2, weights); let src_ptr3 = src3.get_unchecked(bounds_start..); - accumulate_4_horiz!(store3, src_ptr3.as_ptr(), weights); + accumulate_4_horiz::(store3, src_ptr3, weights); jx += 4; } @@ -185,16 +234,16 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( let bounds_start = bounds.start + jx; 
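The macro-to-function rewrite above also threads a Q flag through the accumulate helpers and derives the rounding seed as (1 << (PRECISION - 1)) - 1 instead of using ROUNDING_CONST. Ignoring the Q distinction, the arithmetic behind one output sample can be sketched in scalar form; PRECISION stands for the crate's fixed-point weight precision, and 15 is assumed here purely for illustration.

const PRECISION: u32 = 15; // assumed value, see the note above

/// Scalar sketch of one horizontal plane output: a weighted sum of u8 taps
/// with fixed-point weights, rounded and narrowed back to u8.
fn convolve_plane_pixel(src: &[u8], start: usize, weights: &[i16]) -> u8 {
    // The SIMD code seeds one accumulator lane with this rounding constant.
    let mut acc: i32 = (1 << (PRECISION - 1)) - 1;
    for (k, &w) in weights.iter().enumerate() {
        acc += src[start + k] as i32 * w as i32;
    }
    // Shift out of fixed point and saturate, in the spirit of vqshrun/vqmovn.
    (acc >> PRECISION).clamp(0, u8::MAX as i32) as u8
}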
let src_ptr = src0.get_unchecked(bounds_start..); - accumulate_1_horiz!(store0, src_ptr.as_ptr(), weight); + accumulate_1_horiz::(store0, src_ptr, weight); let src_ptr1 = src1.get_unchecked(bounds_start..); - accumulate_1_horiz!(store1, src_ptr1.as_ptr(), weight); + accumulate_1_horiz::(store1, src_ptr1, weight); let src_ptr2 = src2.get_unchecked(bounds_start..); - accumulate_1_horiz!(store2, src_ptr2.as_ptr(), weight); + accumulate_1_horiz::(store2, src_ptr2, weight); let src_ptr3 = src3.get_unchecked(bounds_start..); - accumulate_1_horiz!(store3, src_ptr3.as_ptr(), weight); + accumulate_1_horiz::(store3, src_ptr3, weight); jx += 1; } @@ -226,11 +275,28 @@ pub fn convolve_horizontal_plane_neon_row( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, +) { + convolve_horizontal_plane_neon_row_impl::(src, dst, filter_weights); +} + +pub fn convolve_horizontal_plane_neon_row_q( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + convolve_horizontal_plane_neon_row_impl::(src, dst, filter_weights); +} + +fn convolve_horizontal_plane_neon_row_impl( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, ) { unsafe { + let rnd_const = (1 << (PRECISION - 1)) - 1; let base_val = { let j = vdupq_n_s32(0); - vsetq_lane_s32::<0>(ROUNDING_CONST, j) + vsetq_lane_s32::<0>(rnd_const, j) }; for ((dst, bounds), weights) in dst.iter_mut().zip(filter_weights.bounds.iter()).zip( @@ -248,8 +314,8 @@ pub fn convolve_horizontal_plane_neon_row( let weights = xvld1q_s16_x2(w_ptr.as_ptr()); let bounds_start = bounds.start + jx; - let src_ptr = src.get_unchecked(bounds_start..).as_ptr(); - accumulate_16_horiz!(store, src_ptr, weights); + let src_ptr = src.get_unchecked(bounds_start..); + store = accumulate_16_horiz::(store, src_ptr, weights); jx += 16; } @@ -259,8 +325,8 @@ pub fn convolve_horizontal_plane_neon_row( let weights = vld1q_s16(w_ptr.as_ptr()); let bounds_start = bounds.start + jx; - let src_ptr = src.get_unchecked(bounds_start..).as_ptr(); - accumulate_8_horiz!(store, src_ptr, weights); + let src_ptr = src.get_unchecked(bounds_start..); + store = accumulate_8_horiz::(store, src_ptr, weights); jx += 8; } @@ -270,8 +336,8 @@ pub fn convolve_horizontal_plane_neon_row( let weights = vld1_s16(w_ptr.as_ptr()); let bounds_start = bounds.start + jx; - let src_ptr = src.get_unchecked(bounds_start..).as_ptr(); - accumulate_4_horiz!(store, src_ptr, weights); + let src_ptr = src.get_unchecked(bounds_start..); + accumulate_4_horiz::(store, src_ptr, weights); jx += 4; } @@ -280,8 +346,8 @@ pub fn convolve_horizontal_plane_neon_row( let w_ptr = weights.get_unchecked(jx..(jx + 1)); let weight = vld1_lane_s16::<0>(w_ptr.as_ptr(), vdup_n_s16(0)); let bounds_start = bounds.start + jx; - let src_ptr = src.get_unchecked(bounds_start..).as_ptr(); - accumulate_1_horiz!(store, src_ptr, weight); + let src_ptr = src.get_unchecked(bounds_start..); + accumulate_1_horiz::(store, src_ptr, weight); jx += 1; } diff --git a/src/neon/rgb_f16.rs b/src/neon/rgb_f16.rs index 3078cbf..a2622c0 100644 --- a/src/neon/rgb_f16.rs +++ b/src/neon/rgb_f16.rs @@ -27,105 +27,115 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use std::arch::aarch64::*; - use crate::filter_weights::FilterWeights; use crate::neon::utils::{prefer_vfmaq_f32, prefer_vfmaq_lane_f32, prefer_vfmaq_laneq_f32}; use crate::neon::*; +use core::f16; +use std::arch::aarch64::*; -macro_rules! 
write_rgb_f16 { - ($store: expr, $dest_ptr: expr) => {{ - let cvt = xreinterpret_u16_f16(xvcvt_f16_f32($store)); - let l1 = vget_lane_u32::<0>(vreinterpret_u32_u16(cvt)); - let l3 = vget_lane_u16::<2>(cvt); - ($dest_ptr as *mut u32).write_unaligned(l1); - ($dest_ptr as *mut u16).add(2).write_unaligned(l3); - }}; +#[inline(always)] +unsafe fn write_rgb_f16(store: float32x4_t, dest_ptr: &mut [f16]) { + let cvt = xreinterpret_u16_f16(xvcvt_f16_f32(store)); + let l1 = vget_lane_u32::<0>(vreinterpret_u32_u16(cvt)); + let l3 = vget_lane_u16::<2>(cvt); + (dest_ptr.as_mut_ptr() as *mut u32).write_unaligned(l1); + (dest_ptr.as_mut_ptr() as *mut u16) + .add(2) + .write_unaligned(l3); } -macro_rules! conv_horiz_4_rgb_f16 { - ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); - - let rgb_pixel_s = xvldq_f16_x2(src_ptr); - let rgb_first_u = vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)); - let rgb_first = xreinterpret_f16_u16(rgb_first_u); - let rgb_second_u = vext_u16::<3>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - ); - let rgb_second = xreinterpret_f16_u16(rgb_second_u); - - let rgb_third_u = vext_u16::<2>( - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_third = xreinterpret_f16_u16(rgb_third_u); - - let rgb_fourth_u = vext_u16::<1>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); - - let acc = prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(rgb_first), $weights); - let acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(rgb_second), $weights); - let acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(rgb_third), $weights); - let acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(rgb_fourth), $weights); - acc - }}; +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_4_rgb_f16( + start_x: usize, + src: &[f16], + w: float32x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel_s = xvldq_f16(src_ptr as *const _); + let rgb_pixel_n = xvld_f16(src_ptr.add(8) as *const _); + + let rgb_first_u = vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s)); + let rgb_first = xreinterpret_f16_u16(rgb_first_u); + let rgb_second_u = vext_u16::<3>( + vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + ); + let rgb_second = xreinterpret_f16_u16(rgb_second_u); + + let rgb_third_u = vext_u16::<2>( + vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + xreinterpret_u16_f16(rgb_pixel_n), + ); + let rgb_third = xreinterpret_f16_u16(rgb_third_u); + + let rgb_fourth_u = vext_u16::<1>( + xreinterpret_u16_f16(rgb_pixel_n), + xreinterpret_u16_f16(rgb_pixel_n), + ); + let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); + + let acc = prefer_vfmaq_laneq_f32::<0>(store, xvcvt_f32_f16(rgb_first), w); + let acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(rgb_second), w); + let acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(rgb_third), w); + prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(rgb_fourth), w) } -macro_rules! 
conv_horiz_2_rgb_f16 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); - - let rgb_pixel = xvld_f16(src_ptr); - let second_px = vreinterpret_u16_u32(vld1_lane_u32::<0>( - src_ptr.add(4) as *const u32, - vdup_n_u32(0), - )); - - let rgb_first_u = xreinterpret_u16_f16(rgb_pixel); - let rgb_first = xreinterpret_f16_u16(rgb_first_u); - let rgb_second_u = vext_u16::<3>(xreinterpret_u16_f16(rgb_pixel), second_px); - let rgb_second = xreinterpret_f16_u16(rgb_second_u); - - let acc = prefer_vfmaq_lane_f32::<0>($store, xvcvt_f32_f16(rgb_first), $set); - let acc = prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(rgb_second), $set); - acc - }}; +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_2_rgb_f16( + start_x: usize, + src: &[f16], + w: float32x2_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel = xvld_f16(src_ptr); + let second_px = vreinterpret_u16_u32(vld1_lane_u32::<0>( + src_ptr.add(4) as *const u32, + vdup_n_u32(0), + )); + + let rgb_first_u = xreinterpret_u16_f16(rgb_pixel); + let rgb_first = xreinterpret_f16_u16(rgb_first_u); + let rgb_second_u = vext_u16::<3>(xreinterpret_u16_f16(rgb_pixel), second_px); + let rgb_second = xreinterpret_f16_u16(rgb_second_u); + + let acc = prefer_vfmaq_lane_f32::<0>(store, xvcvt_f32_f16(rgb_first), w); + prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(rgb_second), w) } -macro_rules! conv_horiz_1_rgb_f16 { - ($start_x: expr, $src: expr, $weight: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_1_rgb_f16( + start_x: usize, + src: &[f16], + w: float32x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); - const ZEROS_F16: half::f16 = half::f16::from_bits(0); + let mut fq = vreinterpret_u16_u32(vld1_lane_u32::<0>(src_ptr as *const _, vdup_n_u32(0))); + fq = vld1_lane_u16::<2>(src_ptr.add(2) as *const _, fq); - let transient: [half::f16; 4] = [ - src_ptr.read_unaligned(), - src_ptr.add(1).read_unaligned(), - src_ptr.add(2).read_unaligned(), - ZEROS_F16, - ]; - let rgb_pixel = xvld_f16(transient.as_ptr()); + let rgb_pixel = xreinterpret_f16_u16(fq); - let acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(rgb_pixel), $weight); - acc - }}; + prefer_vfmaq_f32(store, xvcvt_f32_f16(rgb_pixel), w) } pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( dst_width: usize, - src_width: usize, + _: usize, filter_weights: &FilterWeights, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -144,17 +154,17 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( let mut store_2 = zeros; let mut store_3 = zeros; - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + while jx + 4 < bounds.size { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store_0 = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); - let s_ptr1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr1, read_weights, store_1); - let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr 
= src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr, read_weights, store_3); + store_0 = conv_horiz_4_rgb_f16(bounds_start, src, read_weights, store_0); + let s_ptr1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_4_rgb_f16(bounds_start, s_ptr1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_4_rgb_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_4_rgb_f16(bounds_start, s_ptr, read_weights, store_3); jx += 4; } @@ -162,13 +172,13 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store_0 = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); - let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr3, read_weights, store_3); + store_0 = conv_horiz_2_rgb_f16(bounds_start, src, read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_2_rgb_f16(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_2_rgb_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_2_rgb_f16(bounds_start, s_ptr3, read_weights, store_3); jx += 2; } @@ -176,28 +186,28 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let bounds_start = bounds.start + jx; let weight0 = vld1q_dup_f32(ptr); - store_0 = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store_0); - let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_1, weight0, store_1); - let s_ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_2, weight0, store_2); - let s_ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_3, weight0, store_3); + store_0 = conv_horiz_1_rgb_f16(bounds_start, src, weight0, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_1, weight0, store_1); + let s_ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_2, weight0, store_2); + let s_ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - write_rgb_f16!(store_0, dest_ptr); + let dest_ptr = dst.get_unchecked_mut(px..); + write_rgb_f16(store_0, dest_ptr); - let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..).as_ptr(); - write_rgb_f16!(store_1, dest_ptr_1); + let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..); + write_rgb_f16(store_1, dest_ptr_1); - let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); - write_rgb_f16!(store_2, dest_ptr_2); + let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..); + write_rgb_f16(store_2, 
dest_ptr_2); - let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); - write_rgb_f16!(store_3, dest_ptr_3); + let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..); + write_rgb_f16(store_3, dest_ptr_3); filter_offset += filter_weights.aligned_size; } @@ -206,10 +216,10 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16( dst_width: usize, - src_width: usize, + _: usize, filter_weights: &FilterWeights, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], ) { unsafe { const CHANNELS: usize = 3; @@ -221,11 +231,11 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16( let mut jx = 0usize; let mut store = vdupq_n_f32(0f32); - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + while jx + 4 < bounds.size { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_4_rgb_f16(bounds_start, src, read_weights, store); jx += 4; } @@ -233,7 +243,7 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_2_rgb_f16(bounds_start, src, read_weights, store); jx += 2; } @@ -241,13 +251,13 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store); + store = conv_horiz_1_rgb_f16(bounds_start, src, weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - write_rgb_f16!(store, dest_ptr); + let dest_ptr = dst.get_unchecked_mut(px..); + write_rgb_f16(store, dest_ptr); filter_offset += filter_weights.aligned_size; } diff --git a/src/neon/rgb_f16_fhm.rs b/src/neon/rgb_f16_fhm.rs new file mode 100644 index 0000000..fc2fe0a --- /dev/null +++ b/src/neon/rgb_f16_fhm.rs @@ -0,0 +1,301 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::filter_weights::FilterWeights; +use crate::neon::*; +use core::f16; +use std::arch::aarch64::*; + +#[inline(always)] +unsafe fn write_rgb_f16(store: float32x4_t, dest_ptr: &mut [f16]) { + let cvt = xreinterpret_u16_f16(xvcvt_f16_f32(store)); + let l1 = vget_lane_u32::<0>(vreinterpret_u32_u16(cvt)); + let l3 = vget_lane_u16::<2>(cvt); + (dest_ptr.as_mut_ptr() as *mut u32).write_unaligned(l1); + (dest_ptr.as_mut_ptr() as *mut u16) + .add(2) + .write_unaligned(l3); +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_4_rgb_f16( + start_x: usize, + src: &[f16], + w: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel_s = xvldq_f16(src_ptr as *const _); + let rgb_pixel_n = xvld_f16(src_ptr.add(8) as *const _); + + let rgb_first_u = vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s)); + let rgb_first = xreinterpret_f16_u16(rgb_first_u); + let rgb_second_u = vext_u16::<3>( + vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + ); + let rgb_second = xreinterpret_f16_u16(rgb_second_u); + + let rgb_third_u = vext_u16::<2>( + vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + xreinterpret_u16_f16(rgb_pixel_n), + ); + let rgb_third = xreinterpret_f16_u16(rgb_third_u); + + let rgb_fourth_u = vext_u16::<1>( + xreinterpret_u16_f16(rgb_pixel_n), + xreinterpret_u16_f16(rgb_pixel_n), + ); + let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); + + let f0 = xvcombine_f16(rgb_first, rgb_second); + let f1 = xvcombine_f16(rgb_third, rgb_fourth); + + let acc = xvfmlalq_lane_low_f16::<0>(store, f0, w); + let acc = xvfmlalq_lane_high_f16::<1>(acc, f0, w); + let acc = xvfmlalq_lane_low_f16::<2>(acc, f1, w); + xvfmlalq_lane_high_f16::<3>(acc, f1, w) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_2_rgb_f16( + start_x: usize, + src: &[f16], + w: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel = xvld_f16(src_ptr); + let second_px = vreinterpret_u16_u32(vld1_lane_u32::<0>( + src_ptr.add(4) as *const u32, + vdup_n_u32(0), + )); + + let rgb_first_u = xreinterpret_u16_f16(rgb_pixel); + let rgb_first = xreinterpret_f16_u16(rgb_first_u); + let rgb_second_u = vext_u16::<3>(xreinterpret_u16_f16(rgb_pixel), second_px); + let rgb_second = xreinterpret_f16_u16(rgb_second_u); + + let f0 = xvcombine_f16(rgb_first, rgb_second); + + let acc = xvfmlalq_lane_low_f16::<0>(store, f0, w); + xvfmlalq_lane_high_f16::<1>(acc, f0, w) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_1_rgb_f16( + start_x: usize, + src: &[f16], + w: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let mut fq = vreinterpret_u16_u32(vld1_lane_u32::<0>(src_ptr as *const _, vdup_n_u32(0))); 
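Both the plain and the FHM variants of the RGB f16 horizontal kernels perform the same 4-tap step: deinterleave four packed RGB pixels, widen them, and accumulate each one against its weight lane. A scalar reference of that step, with f32 standing in for f16 so it compiles on stable Rust, looks like this:

/// Scalar reference for the 4-tap packed-RGB horizontal accumulation.
/// acc holds the running (r, g, b) sums for one output pixel.
fn conv_horiz_4_rgb_ref(src: &[f32], start_x: usize, w: [f32; 4], acc: [f32; 3]) -> [f32; 3] {
    let mut out = acc;
    for (k, &wk) in w.iter().enumerate() {
        let px = &src[(start_x + k) * 3..][..3]; // three components per pixel
        for (o, &p) in out.iter_mut().zip(px) {
            *o = p.mul_add(wk, *o);
        }
    }
    out
}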
+ fq = vld1_lane_u16::<2>(src_ptr.add(2) as *const _, fq); + + let rgb_pixel = xreinterpret_f16_u16(fq); + + xvfmlalq_lane_low_f16::<0>(store, xvcombine_f16(rgb_pixel, rgb_pixel), w) +} + +pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16_fhm( + dst_width: usize, + w: usize, + filter_weights: &FilterWeights, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + dst_stride: usize, +) { + unsafe { + convolve_horizontal_rgb_neon_rows_4_f16_impl( + dst_width, + w, + filter_weights, + src, + src_stride, + dst, + dst_stride, + ) + } +} + +#[target_feature(enable = "fhm")] +unsafe fn convolve_horizontal_rgb_neon_rows_4_f16_impl( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + dst_stride: usize, +) { + const CHANNELS: usize = 3; + let mut filter_offset = 0usize; + + let zeros = vdupq_n_f32(0.); + + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 4 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = xvld_f16(ptr); + store_0 = conv_horiz_4_rgb_f16(bounds_start, src, read_weights, store_0); + let s_ptr1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_4_rgb_f16(bounds_start, s_ptr1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_4_rgb_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_4_rgb_f16(bounds_start, s_ptr, read_weights, store_3); + jx += 4; + } + + while jx + 2 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = + xreinterpret_f16_u16(vreinterpret_u16_u32(vld1_dup_u32(ptr as *const _))); + store_0 = conv_horiz_2_rgb_f16(bounds_start, src, read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_2_rgb_f16(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_2_rgb_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_2_rgb_f16(bounds_start, s_ptr3, read_weights, store_3); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + let weight0 = xreinterpret_f16_u16(vld1_dup_u16(ptr as *const _)); + store_0 = conv_horiz_1_rgb_f16(bounds_start, src, weight0, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_1, weight0, store_1); + let s_ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_2, weight0, store_2); + let s_ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_3, weight0, store_3); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..); + write_rgb_f16(store_0, dest_ptr); + + let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..); + write_rgb_f16(store_1, dest_ptr_1); + + let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..); + write_rgb_f16(store_2, dest_ptr_2); + + let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..); + write_rgb_f16(store_3, 
dest_ptr_3); + + filter_offset += filter_weights.aligned_size; + } +} + +pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16_fhm( + dst_width: usize, + w: usize, + filter_weights: &FilterWeights, + src: &[f16], + dst: &mut [f16], +) { + unsafe { convolve_horizontal_rgb_neon_row_one_f16_impl(dst_width, w, filter_weights, src, dst) } +} + +#[target_feature(enable = "fhm")] +unsafe fn convolve_horizontal_rgb_neon_row_one_f16_impl( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + dst: &mut [f16], +) { + const CHANNELS: usize = 3; + let weights_ptr = filter_weights.weights.as_ptr(); + let mut filter_offset = 0usize; + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_f32(0f32); + + while jx + 4 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = xvld_f16(ptr); + store = conv_horiz_4_rgb_f16(bounds_start, src, read_weights, store); + jx += 4; + } + + while jx + 2 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = + xreinterpret_f16_u16(vreinterpret_u16_u32(vld1_dup_u32(ptr as *const _))); + store = conv_horiz_2_rgb_f16(bounds_start, src, read_weights, store); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = xreinterpret_f16_u16(vld1_dup_u16(ptr as *const _)); + let bounds_start = bounds.start + jx; + store = conv_horiz_1_rgb_f16(bounds_start, src, weight0, store); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..); + write_rgb_f16(store, dest_ptr); + + filter_offset += filter_weights.aligned_size; + } +} diff --git a/src/neon/rgb_f16_full.rs b/src/neon/rgb_f16_full.rs index 031fd92..b30d07f 100644 --- a/src/neon/rgb_f16_full.rs +++ b/src/neon/rgb_f16_full.rs @@ -29,7 +29,7 @@ use std::arch::aarch64::*; -use half::f16; +use core::f16; use crate::filter_weights::FilterWeights; use crate::neon::*; @@ -132,9 +132,7 @@ unsafe fn conv_horiz_1_rgb_f16( rgb_pixel_u = vld1_lane_u16::<2>(src_ptr as *const _, rgb_pixel_u); let rgb_pixel = xreinterpret_f16_u16(rgb_pixel_u); - - let acc = xvfmla_f16(store, rgb_pixel, set); - acc + xvfmla_f16(store, rgb_pixel, set) } pub(crate) fn xconvolve_horizontal_rgb_neon_rows_4_f16( diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index 550191c..88ba854 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -28,13 +28,13 @@ */ use crate::filter_weights::FilterWeights; -use crate::neon::utils::load_3b_as_u16x4; -use crate::support::{PRECISION, ROUNDING_CONST}; +use crate::neon::utils::{load_3b_as_u16x4, vxmlal_high_lane_s16, vxmlal_lane_s16, vxmlal_s16}; +use crate::support::PRECISION; use std::arch::aarch64::*; #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_4_u8( +unsafe fn conv_horiz_rgb_4_u8( start_x: usize, src: &[u8], weights: int16x4_t, @@ -55,15 +55,15 @@ unsafe fn conv_horiz_rgba_4_u8( let hi = vreinterpretq_s16_u16(vmovl_high_u8(rgb_pixel)); let lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgb_pixel))); - let acc = vmlal_high_lane_s16::<3>(store, hi, weights); - let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights); - let acc = vmlal_high_lane_s16::<1>(acc, lo, weights); - vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights) + let acc = vxmlal_high_lane_s16::(store, hi, weights); + let acc = vxmlal_lane_s16::(acc, vget_low_s16(hi), weights); + let acc = 
vxmlal_high_lane_s16::(acc, lo, weights); + vxmlal_lane_s16::(acc, vget_low_s16(lo), weights) } #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_2_u8( +unsafe fn conv_horiz_rgba_2_u8( start_x: usize, src: &[u8], weights: int16x4_t, @@ -81,13 +81,13 @@ unsafe fn conv_horiz_rgba_2_u8( let wide = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(rgb_pixel))); - let acc = vmlal_high_lane_s16::<1>(store, wide, weights); - vmlal_lane_s16::<0>(acc, vget_low_s16(wide), weights) + let acc = vxmlal_high_lane_s16::(store, wide, weights); + vxmlal_lane_s16::(acc, vget_low_s16(wide), weights) } #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_1_u8( +unsafe fn conv_horiz_rgba_1_u8( start_x: usize, src: &[u8], w0: int16x4_t, @@ -97,11 +97,11 @@ unsafe fn conv_horiz_rgba_1_u8( let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); let rgb_pixel = load_3b_as_u16x4(src_ptr.as_ptr()); let lo = vreinterpret_s16_u16(rgb_pixel); - vmlal_s16(store, lo, w0) + vxmlal_s16::(store, lo, w0) } #[inline(always)] -unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) { +unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) { let store_16 = vqshrun_n_s32::(store); let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); vst1_lane_u16::<0>( @@ -117,6 +117,38 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4( dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, +) { + convolve_horizontal_rgb_neon_rows_4_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); +} + +pub(crate) fn convolve_horizontal_rgb_neon_rows_4_q( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + convolve_horizontal_rgb_neon_rows_4_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); +} + +fn convolve_horizontal_rgb_neon_rows_4_impl( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, ) { unsafe { let shuf_table_1: [u8; 8] = [0, 1, 2, 255, 3, 4, 5, 255]; @@ -127,8 +159,10 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4( // (r0 g0 b0 r1) (g2 b2 r3 g3) (b3 r4 g4 b4) (r5 g5 b5 r6) + let rnd_const: i32 = (1 << (PRECISION - 1)) - 1; + const CHANNELS: usize = 3; - let init = vdupq_n_s32(ROUNDING_CONST); + let init = vdupq_n_s32(rnd_const); let (row0_ref, rest) = dst.split_at_mut(dst_stride); let (row1_ref, rest) = rest.split_at_mut(dst_stride); let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); @@ -164,10 +198,10 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, weights, store_0, shuffle); - store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, weights, store_1, shuffle); - store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, weights, store_2, shuffle); - store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, weights, store_3, shuffle); + store_0 = conv_horiz_rgb_4_u8::(bounds_start, src0, weights, store_0, shuffle); + store_1 = conv_horiz_rgb_4_u8::(bounds_start, src1, weights, store_1, shuffle); + store_2 = conv_horiz_rgb_4_u8::(bounds_start, src2, weights, store_2, shuffle); + store_3 = conv_horiz_rgb_4_u8::(bounds_start, src3, weights, store_3, shuffle); jx += 4; } @@ -176,10 +210,10 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4( let bnds = bounds.start + jx; let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); v_weight = 
vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); - store_0 = conv_horiz_rgba_2_u8(bnds, src0, v_weight, store_0, shuffle_1); - store_1 = conv_horiz_rgba_2_u8(bnds, src1, v_weight, store_1, shuffle_1); - store_2 = conv_horiz_rgba_2_u8(bnds, src2, v_weight, store_2, shuffle_1); - store_3 = conv_horiz_rgba_2_u8(bnds, src3, v_weight, store_3, shuffle_1); + store_0 = conv_horiz_rgba_2_u8::(bnds, src0, v_weight, store_0, shuffle_1); + store_1 = conv_horiz_rgba_2_u8::(bnds, src1, v_weight, store_1, shuffle_1); + store_2 = conv_horiz_rgba_2_u8::(bnds, src2, v_weight, store_2, shuffle_1); + store_3 = conv_horiz_rgba_2_u8::(bnds, src3, v_weight, store_3, shuffle_1); jx += 2; } @@ -187,17 +221,17 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4( let w_ptr = weights.get_unchecked(jx..(jx + 1)); let bnds = bounds.start + jx; let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_1_u8(bnds, src0, weight0, store_0); - store_1 = conv_horiz_rgba_1_u8(bnds, src1, weight0, store_1); - store_2 = conv_horiz_rgba_1_u8(bnds, src2, weight0, store_2); - store_3 = conv_horiz_rgba_1_u8(bnds, src3, weight0, store_3); + store_0 = conv_horiz_rgba_1_u8::(bnds, src0, weight0, store_0); + store_1 = conv_horiz_rgba_1_u8::(bnds, src1, weight0, store_1); + store_2 = conv_horiz_rgba_1_u8::(bnds, src2, weight0, store_2); + store_3 = conv_horiz_rgba_1_u8::(bnds, src3, weight0, store_3); jx += 1; } - write_accumulator_u8(store_0, chunk0); - write_accumulator_u8(store_1, chunk1); - write_accumulator_u8(store_2, chunk2); - write_accumulator_u8(store_3, chunk3); + write_accumulator_u8::(store_0, chunk0); + write_accumulator_u8::(store_1, chunk1); + write_accumulator_u8::(store_2, chunk2); + write_accumulator_u8::(store_3, chunk3); } } } @@ -206,6 +240,22 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, +) { + convolve_horizontal_rgb_neon_row_one_impl::(src, dst, filter_weights); +} + +pub(crate) fn convolve_horizontal_rgb_neon_row_one_q( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + convolve_horizontal_rgb_neon_row_one_impl::(src, dst, filter_weights); +} + +fn convolve_horizontal_rgb_neon_row_one_impl( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, ) { unsafe { const CHANNELS: usize = 3; @@ -216,6 +266,8 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one( let shuffle_2 = vld1_u8(shuf_table_2.as_ptr()); let shuffle = vcombine_u8(shuffle_1, shuffle_2); + let rnd_const: i32 = (1 << (PRECISION - 1)) - 1; + for ((dst, bounds), weights) in dst .chunks_exact_mut(CHANNELS) .zip(filter_weights.bounds.iter()) @@ -228,13 +280,13 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one( let bounds_size = bounds.size; let mut jx = 0usize; - let mut store = vdupq_n_s32(ROUNDING_CONST); + let mut store = vdupq_n_s32(rnd_const); while jx + 4 < bounds_size { let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - store = conv_horiz_rgba_4_u8(bounds_start, src, weights, store, shuffle); + store = conv_horiz_rgb_4_u8::(bounds_start, src, weights, store, shuffle); jx += 4; } @@ -242,7 +294,7 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one( let w_ptr = weights.get_unchecked(jx..(jx + 2)); let bounds_start = bounds.start + jx; let v_weight = vreinterpret_s16_s32(vld1_dup_s32(w_ptr.as_ptr() as *const _)); - store = conv_horiz_rgba_2_u8(bounds_start, src, v_weight, store, shuffle_1); + store = conv_horiz_rgba_2_u8::(bounds_start, src, 
v_weight, store, shuffle_1); jx += 2; } @@ -250,11 +302,11 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one( let w_ptr = weights.get_unchecked(jx..(jx + 1)); let weight0 = vld1_dup_s16(w_ptr.as_ptr()); let bnds = bounds.start + jx; - store = conv_horiz_rgba_1_u8(bnds, src, weight0, store); + store = conv_horiz_rgba_1_u8::(bnds, src, weight0, store); jx += 1; } - write_accumulator_u8(store, dst); + write_accumulator_u8::(store, dst); } } } diff --git a/src/neon/rgba_f16.rs b/src/neon/rgba_f16.rs index 3926a16..d69bf58 100644 --- a/src/neon/rgba_f16.rs +++ b/src/neon/rgba_f16.rs @@ -36,77 +36,90 @@ use crate::neon::{ xvcvt_f32_f16, xvget_high_f16, xvget_low_f16, xvld_f16, xvldq_f16, xvldq_f16_x2, xvldq_f16_x4, xvst_f16, }; +use core::f16; use std::arch::aarch64::*; -macro_rules! conv_horiz_rgba_8_f16 { - ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr) => {{ - const COMPONENTS: usize = 4; - let src_ptr = $src.add($start_x * COMPONENTS); +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_8_f16( + start_x: usize, + src: &[f16], + set1: float32x4_t, + set2: float32x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); - let rgb_pixel = xvldq_f16_x4(src_ptr); + let rgb_pixel = xvldq_f16_x4(src_ptr); - let mut acc = - prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1); - acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1); - acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1); - acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.1)), $set1); - acc = prefer_vfmaq_laneq_f32::<0>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.2)), $set2); - acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.2)), $set2); - acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.3)), $set2); - acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.3)), $set2); - acc - }}; + let mut acc = + prefer_vfmaq_laneq_f32::<0>(store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), set1); + acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), set1); + acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), set1); + acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.1)), set1); + acc = prefer_vfmaq_laneq_f32::<0>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.2)), set2); + acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.2)), set2); + acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.3)), set2); + prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.3)), set2) } -macro_rules! 
conv_horiz_rgba_4_f16 { - ($start_x: expr, $src: expr, $set1: expr, $store: expr) => {{ - const COMPONENTS: usize = 4; - let src_ptr = $src.add($start_x * COMPONENTS); +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_4_f16( + start_x: usize, + src: &[f16], + set1: float32x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); - let rgb_pixel = xvldq_f16_x2(src_ptr); + let rgb_pixel = xvldq_f16_x2(src_ptr); - let acc = - prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1); - let acc = - prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1); - let acc = - prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1); - let acc = - prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1); - acc - }}; + let acc = prefer_vfmaq_laneq_f32::<0>(store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), set1); + let acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), set1); + let acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), set1); + prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), set1) } -macro_rules! conv_horiz_rgba_2_f32 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 4; - let src_ptr = $src.add($start_x * COMPONENTS); +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_2_f32( + start_x: usize, + src: &[f16], + set: float32x2_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); - let rgb_pixel = xvldq_f16(src_ptr); + let rgb_pixel = xvldq_f16(src_ptr); - let mut acc = - prefer_vfmaq_lane_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel)), $set); - acc = prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel)), $set); - acc - }}; + let acc = prefer_vfmaq_lane_f32::<0>(store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel)), set); + prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel)), set) } -macro_rules! 
conv_horiz_rgba_1_f16 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 4; - let src_ptr = $src.add($start_x * COMPONENTS); - let rgb_pixel = xvld_f16(src_ptr); - let acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(rgb_pixel), $set); - acc - }}; +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_1_f16( + start_x: usize, + src: &[f16], + set: float32x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + let rgb_pixel = xvld_f16(src_ptr); + prefer_vfmaq_f32(store, xvcvt_f32_f16(rgb_pixel), set) } pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16( dst_width: usize, _: usize, filter_weights: &FilterWeights, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], ) { unsafe { const CHANNELS: usize = 4; @@ -122,7 +135,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_rgba_4_f16(bounds_start, src, read_weights, store); jx += 4; } @@ -130,7 +143,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_rgba_2_f32(bounds_start, src, read_weights, store); jx += 2; } @@ -138,7 +151,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); - store = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store); + store = conv_horiz_rgba_1_f16(bounds_start, src, weight0, store); jx += 1; } @@ -155,9 +168,9 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( dst_width: usize, _: usize, filter_weights: &FilterWeights, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -178,36 +191,36 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvld1q_f32_x2(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_8_f16!( + store_0 = conv_horiz_rgba_8_f16( bounds_start, - src.as_ptr(), + src, read_weights.0, read_weights.1, - store_0 + store_0, ); - let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_rgba_8_f16!( + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_8_f16( bounds_start, s_ptr_1, read_weights.0, read_weights.1, - store_1 + store_1, ); - let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_rgba_8_f16!( + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_8_f16( bounds_start, s_ptr2, read_weights.0, read_weights.1, - store_2 + store_2, ); - let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_rgba_8_f16!( + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_8_f16( bounds_start, s_ptr3, read_weights.0, read_weights.1, - store_3 + store_3, ); jx += 8; } @@ -216,13 +229,13 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); let 
bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store_0); - let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr3, read_weights, store_3); + store_0 = conv_horiz_rgba_4_f16(bounds_start, src, read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_4_f16(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_4_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_4_f16(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } @@ -230,13 +243,13 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store_0); - let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, read_weights, store_1); - let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, read_weights, store_2); - let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, read_weights, store_3); + store_0 = conv_horiz_rgba_2_f32(bounds_start, src, read_weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_2_f32(bounds_start, ptr_1, read_weights, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_2_f32(bounds_start, ptr_2, read_weights, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_2_f32(bounds_start, ptr_3, read_weights, store_3); jx += 2; } @@ -244,13 +257,13 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store_0); - let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_rgba_1_f16!(bounds_start, ptr_1, weight0, store_1); - let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_rgba_1_f16!(bounds_start, ptr_2, weight0, store_2); - let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_rgba_1_f16!(bounds_start, ptr_3, weight0, store_3); + store_0 = conv_horiz_rgba_1_f16(bounds_start, src, weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_1_f16(bounds_start, ptr_1, weight0, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_1_f16(bounds_start, ptr_2, weight0, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_1_f16(bounds_start, ptr_3, weight0, store_3); jx += 1; } diff --git a/src/neon/rgba_f16_fhm.rs b/src/neon/rgba_f16_fhm.rs new file mode 100644 index 0000000..33514e3 --- /dev/null +++ b/src/neon/rgba_f16_fhm.rs @@ -0,0 +1,294 @@ +/* + * Copyright (c) Radzivon Bartoshyk. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::filter_weights::FilterWeights; +use crate::neon::f16_utils::{ + xvcombine_f16, xvcvt_f16_f32, xvfmlalq_lane_high_f16, xvfmlalq_lane_low_f16, + xvfmlalq_laneq_high_f16, xvfmlalq_laneq_low_f16, +}; +use crate::neon::{ + x_float16x4_t, x_float16x8_t, xreinterpret_f16_u16, xvld_f16, xvldq_f16, xvldq_f16_x2, + xvldq_f16_x4, xvst_f16, +}; +use core::f16; +use std::arch::aarch64::*; + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_8_f16( + start_x: usize, + src: &[f16], + w: x_float16x8_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel = xvldq_f16_x4(src_ptr); + + let mut acc = xvfmlalq_laneq_low_f16::<0>(store, rgb_pixel.0, w); + acc = xvfmlalq_laneq_high_f16::<1>(acc, rgb_pixel.0, w); + acc = xvfmlalq_laneq_low_f16::<2>(acc, rgb_pixel.1, w); + acc = xvfmlalq_laneq_high_f16::<3>(acc, rgb_pixel.1, w); + acc = xvfmlalq_laneq_low_f16::<4>(acc, rgb_pixel.2, w); + acc = xvfmlalq_laneq_high_f16::<5>(acc, rgb_pixel.2, w); + acc = xvfmlalq_laneq_low_f16::<6>(acc, rgb_pixel.3, w); + xvfmlalq_laneq_high_f16::<7>(acc, rgb_pixel.3, w) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_4_f16( + start_x: usize, + src: &[f16], + set1: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel = xvldq_f16_x2(src_ptr); + + let acc = xvfmlalq_lane_low_f16::<0>(store, rgb_pixel.0, set1); + let acc = xvfmlalq_lane_high_f16::<1>(acc, rgb_pixel.0, set1); + let acc = xvfmlalq_lane_low_f16::<2>(acc, rgb_pixel.1, set1); + xvfmlalq_lane_high_f16::<3>(acc, rgb_pixel.0, set1) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_2_f32( + start_x: usize, + src: &[f16], + set: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + 
let rgb_pixel = xvldq_f16(src_ptr); + + let acc = xvfmlalq_lane_low_f16::<0>(store, rgb_pixel, set); + xvfmlalq_lane_high_f16::<1>(acc, rgb_pixel, set) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_1_f16( + start_x: usize, + src: &[f16], + set: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + let rgb_pixel = xvld_f16(src_ptr); + xvfmlalq_lane_low_f16::<0>(store, xvcombine_f16(rgb_pixel, rgb_pixel), set) +} + +pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16_fhm( + dst_width: usize, + w: usize, + filter_weights: &FilterWeights, + src: &[f16], + dst: &mut [f16], +) { + unsafe { + convolve_horizontal_rgba_neon_row_one_f16_impl(dst_width, w, filter_weights, src, dst) + } +} + +#[target_feature(enable = "fhm")] +unsafe fn convolve_horizontal_rgba_neon_row_one_f16_impl( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + dst: &mut [f16], +) { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_f32(0f32); + + while jx + 4 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = xvld_f16(ptr); + store = conv_horiz_rgba_4_f16(bounds_start, src, read_weights, store); + jx += 4; + } + + while jx + 2 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = + xreinterpret_f16_u16(vreinterpret_u16_u32(vld1_dup_u32(ptr as *const _))); + store = conv_horiz_rgba_2_f32(bounds_start, src, read_weights, store); + jx += 2; + } + + while jx < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = xreinterpret_f16_u16(vld1_dup_u16(ptr as *const _)); + store = conv_horiz_rgba_1_f16(bounds_start, src, weight0, store); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + xvst_f16(dest_ptr, xvcvt_f16_f32(store)); + + filter_offset += filter_weights.aligned_size; + } +} + +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16_fhm( + dst_width: usize, + w: usize, + filter_weights: &FilterWeights, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + dst_stride: usize, +) { + unsafe { + convolve_horizontal_rgba_neon_rows_4_f16_impl( + dst_width, + w, + filter_weights, + src, + src_stride, + dst, + dst_stride, + ) + } +} + +#[target_feature(enable = "fhm")] +unsafe fn convolve_horizontal_rgba_neon_rows_4_f16_impl( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + dst_stride: usize, +) { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let zeros = vdupq_n_f32(0f32); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 8 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = xvldq_f16(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_rgba_8_f16(bounds_start, src, read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = 
conv_horiz_rgba_8_f16(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_8_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_8_f16(bounds_start, s_ptr3, read_weights, store_3); + jx += 8; + } + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = xvld_f16(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_rgba_4_f16(bounds_start, src, read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_4_f16(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_4_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_4_f16(bounds_start, s_ptr3, read_weights, store_3); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = + xreinterpret_f16_u16(vreinterpret_u16_u32(vld1_dup_u32(ptr as *const _))); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_rgba_2_f32(bounds_start, src, read_weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_2_f32(bounds_start, ptr_1, read_weights, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_2_f32(bounds_start, ptr_2, read_weights, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_2_f32(bounds_start, ptr_3, read_weights, store_3); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = xreinterpret_f16_u16(vld1_dup_u16(ptr as *const _)); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_rgba_1_f16(bounds_start, src, weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_1_f16(bounds_start, ptr_1, weight0, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_1_f16(bounds_start, ptr_2, weight0, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_1_f16(bounds_start, ptr_3, weight0, store_3); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + xvst_f16(dest_ptr, xvcvt_f16_f32(store_0)); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); + xvst_f16(dest_ptr, xvcvt_f16_f32(store_1)); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); + xvst_f16(dest_ptr, xvcvt_f16_f32(store_2)); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); + xvst_f16(dest_ptr, xvcvt_f16_f32(store_3)); + + filter_offset += filter_weights.aligned_size; + } +} diff --git a/src/neon/rgba_f16_full.rs b/src/neon/rgba_f16_full.rs index 38fb45c..7f24625 100644 --- a/src/neon/rgba_f16_full.rs +++ b/src/neon/rgba_f16_full.rs @@ -36,7 +36,7 @@ use crate::neon::{ x_float16x4_t, x_float16x8_t, xvget_high_f16, xvget_low_f16, xvld_f16, xvldq_f16, xvldq_f16_x2, xvldq_f16_x4, xvst_f16, }; -use half::f16; +use core::f16; use std::arch::aarch64::*; #[must_use] @@ -117,8 +117,8 @@ pub(crate) fn xconvolve_horizontal_rgba_neon_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], ) { unsafe { 
xconvolve_horizontal_rgba_neon_row_one_f16_impl( @@ -136,8 +136,8 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], ) { const CHANNELS: usize = 4; let mut filter_offset = 0usize; @@ -185,9 +185,9 @@ pub(crate) fn xconvolve_horizontal_rgba_neon_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: &[half::f16], + unsafe_source_ptr_0: &[f16], src_stride: usize, - unsafe_destination_ptr_0: &mut [half::f16], + unsafe_destination_ptr_0: &mut [f16], dst_stride: usize, ) { unsafe { @@ -208,9 +208,9 @@ unsafe fn xconvolve_horizontal_rgba_neon_rows_4_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, ) { const CHANNELS: usize = 4; diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs index 04aaa03..ce6b36e 100644 --- a/src/neon/rgba_u8.rs +++ b/src/neon/rgba_u8.rs @@ -29,15 +29,14 @@ use crate::filter_weights::FilterWeights; use crate::neon::utils::{ - expand8_high_to_14, expand8_to_14, load_4b_as_u16x4, load_4b_as_u8x8, xvld1q_u8_x2, + load_4b_as_u16x4, vxmlal_high_lane_s16, vxmlal_high_laneq_s16, vxmlal_lane_s16, + vxmlal_laneq_s16, vxmlal_s16, xvld1q_u8_x2, }; -use crate::support::PRECISION; -use crate::support::ROUNDING_CONST; use std::arch::aarch64::*; #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_8_u8( +unsafe fn conv_horiz_rgba_8_u8( start_x: usize, src: &[u8], weights: int16x8_t, @@ -53,48 +52,21 @@ unsafe fn conv_horiz_rgba_8_u8( let hi1 = vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel.1)); let lo1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel.1))); - let mut acc = vmlal_high_laneq_s16::<3>(store, hi0, weights); - acc = vmlal_laneq_s16::<2>(acc, vget_low_s16(hi0), weights); - acc = vmlal_high_laneq_s16::<1>(acc, lo0, weights); - acc = vmlal_laneq_s16::<0>(acc, vget_low_s16(lo0), weights); + let mut acc = vxmlal_high_laneq_s16::(store, hi0, weights); + acc = vxmlal_laneq_s16::(acc, vget_low_s16(hi0), weights); + acc = vxmlal_high_laneq_s16::(acc, lo0, weights); + acc = vxmlal_laneq_s16::(acc, vget_low_s16(lo0), weights); - acc = vmlal_high_laneq_s16::<7>(acc, hi1, weights); - acc = vmlal_laneq_s16::<6>(acc, vget_low_s16(hi1), weights); - acc = vmlal_high_laneq_s16::<5>(acc, lo1, weights); - acc = vmlal_laneq_s16::<4>(acc, vget_low_s16(lo1), weights); + acc = vxmlal_high_laneq_s16::(acc, hi1, weights); + acc = vxmlal_laneq_s16::(acc, vget_low_s16(hi1), weights); + acc = vxmlal_high_laneq_s16::(acc, lo1, weights); + acc = vxmlal_laneq_s16::(acc, vget_low_s16(lo1), weights); acc } #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_8_u8_i16( - start_x: usize, - src: &[u8], - w0: int16x8_t, - w1: int16x8_t, - w2: int16x8_t, - w3: int16x8_t, - store: int16x8_t, -) -> int16x8_t { - const COMPONENTS: usize = 4; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - - let rgba_pixel = xvld1q_u8_x2(src_ptr.as_ptr()); - - let hi0 = expand8_high_to_14(rgba_pixel.0); - let lo0 = expand8_to_14(vget_low_u8(rgba_pixel.0)); - let hi1 = expand8_high_to_14(rgba_pixel.1); - let lo1 = expand8_to_14(vget_low_u8(rgba_pixel.1)); - - let mut p = vqrdmlahq_s16(store, lo0, w0); - p = vqrdmlahq_s16(p, hi0, w1); - p = vqrdmlahq_s16(p, lo1, w2); - vqrdmlahq_s16(p, hi1, w3) -} - -#[must_use] -#[inline(always)] -unsafe fn conv_horiz_rgba_2_u8( +unsafe fn 
conv_horiz_rgba_2_u8( start_x: usize, src: &[u8], weights: int16x4_t, @@ -106,30 +78,13 @@ unsafe fn conv_horiz_rgba_2_u8( let rgb_pixel = vld1_u8(src_ptr.as_ptr()); let wide = vreinterpretq_s16_u16(vmovl_u8(rgb_pixel)); - let acc = vmlal_high_lane_s16::<1>(store, wide, weights); - vmlal_lane_s16::<0>(acc, vget_low_s16(wide), weights) -} - -#[must_use] -#[inline(always)] -unsafe fn conv_horiz_rgba_2_u8_i16( - start_x: usize, - src: &[u8], - weights: int16x8_t, - store: int16x8_t, -) -> int16x8_t { - const COMPONENTS: usize = 4; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - - let rgb_pixel = vld1_u8(src_ptr.as_ptr()); - let wide = expand8_to_14(rgb_pixel); - - vqrdmlahq_s16(store, wide, weights) + let acc = vxmlal_high_lane_s16::(store, wide, weights); + vxmlal_lane_s16::(acc, vget_low_s16(wide), weights) } #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_4_u8( +unsafe fn conv_horiz_rgba_4_u8( start_x: usize, src: &[u8], weights: int16x4_t, @@ -143,35 +98,15 @@ unsafe fn conv_horiz_rgba_4_u8( let hi = vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel)); let lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel))); - let acc = vmlal_high_lane_s16::<3>(store, hi, weights); - let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights); - let acc = vmlal_high_lane_s16::<1>(acc, lo, weights); - vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights) -} - -#[inline(always)] -unsafe fn conv_horiz_rgba_4_u8_i16( - start_x: usize, - src: &[u8], - w0: int16x8_t, - w1: int16x8_t, - store: int16x8_t, -) -> int16x8_t { - const COMPONENTS: usize = 4; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - - let rgba_pixel = vld1q_u8(src_ptr.as_ptr()); - - let hi = expand8_high_to_14(rgba_pixel); - let lo = expand8_to_14(vget_low_u8(rgba_pixel)); - - let p = vqrdmlahq_s16(store, lo, w0); - vqrdmlahq_s16(p, hi, w1) + let acc = vxmlal_high_lane_s16::(store, hi, weights); + let acc = vxmlal_lane_s16::(acc, vget_low_s16(hi), weights); + let acc = vxmlal_high_lane_s16::(acc, lo, weights); + vxmlal_lane_s16::(acc, vget_low_s16(lo), weights) } #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_1_u8( +unsafe fn conv_horiz_rgba_1_u8( start_x: usize, src: &[u8], w0: int16x4_t, @@ -181,225 +116,42 @@ unsafe fn conv_horiz_rgba_1_u8( let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); let rgba_pixel = load_4b_as_u16x4(src_ptr.as_ptr()); let lo = vreinterpret_s16_u16(rgba_pixel); - vmlal_s16(store, lo, w0) -} - -#[must_use] -#[inline(always)] -unsafe fn conv_horiz_rgba_1_u8_i16( - start_x: usize, - src: &[u8], - w0: int16x4_t, - store: int16x4_t, -) -> int16x4_t { - const COMPONENTS: usize = 4; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let rgba_pixel = expand8_to_14(load_4b_as_u8x8(src_ptr.as_ptr())); - vqrdmlah_s16(store, vget_low_s16(rgba_pixel), w0) + vxmlal_s16::(store, lo, w0) } -/// Checking NEON `rdm` availability is required before a call. -/// -/// RDM feature has slightly lower precision and won't work really well on huge kernel which -/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. -/// -/// # Safety -/// - Check `rdm` availability before the call. 
-pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8_i16( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( src: &[u8], src_stride: usize, dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); - } + convolve_horizontal_rgba_neon_rows_4_u8_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } -/// Slightly lower precision scale option -/// -/// # Safety -/// - Check `rdm` availability before the call. -#[target_feature(enable = "rdm")] -unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8_q( src: &[u8], src_stride: usize, dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, ) { - const CHANNELS: usize = 4; - const SCALE: i32 = 6; - const ROUNDING: i16 = 1 << (SCALE - 1); - - let weights_distribute: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; - let v_w_distribute0 = vld1q_u8(weights_distribute.as_ptr()); - let weights_distribute1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7]; - let v_w_distribute1 = vld1q_u8(weights_distribute1.as_ptr()); - let weights_distribute2: [u8; 16] = [8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11]; - let v_w_distribute2 = vld1q_u8(weights_distribute2.as_ptr()); - let weights_distribute3: [u8; 16] = - [12, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15]; - let v_w_distribute3 = vld1q_u8(weights_distribute3.as_ptr()); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - let initial_val = vcombine_s16(vdup_n_s16(ROUNDING), vdup_n_s16(0)); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - - let bounds_size = bounds.size; - - let mut store_0 = initial_val; - let mut store_1 = initial_val; - let mut store_2 = initial_val; - let mut store_3 = initial_val; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let weights_set = vld1q_s16(w_ptr.as_ptr()); - - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute0, - )); - let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute1, - )); - let w2 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute2, - )); - let w3 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute3, - )); - - store_0 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src0, w0, w1, w2, w3, store_0); - store_1 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src1, w0, w1, w2, w3, store_1); - store_2 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src2, w0, w1, w2, w3, store_2); - store_3 = - 
conv_horiz_rgba_8_u8_i16::(bounds_start, src3, w0, w1, w2, w3, store_3); - jx += 8; - } - - while jx + 4 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vld1_s16(w_ptr.as_ptr()); - - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), - v_w_distribute0, - )); - let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), - v_w_distribute1, - )); - - store_0 = conv_horiz_rgba_4_u8_i16::(bounds_start, src0, w0, w1, store_0); - store_1 = conv_horiz_rgba_4_u8_i16::(bounds_start, src1, w0, w1, store_1); - store_2 = conv_horiz_rgba_4_u8_i16::(bounds_start, src2, w0, w1, store_2); - store_3 = conv_horiz_rgba_4_u8_i16::(bounds_start, src3, w0, w1, store_3); - jx += 4; - } - - while jx + 2 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(w_ptr.as_ptr() as *const i32)); - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(v_weight, vdup_n_s16(0))), - v_w_distribute0, - )); - store_0 = conv_horiz_rgba_2_u8_i16::(bounds_start, src0, w0, store_0); - store_1 = conv_horiz_rgba_2_u8_i16::(bounds_start, src1, w0, store_1); - store_2 = conv_horiz_rgba_2_u8_i16::(bounds_start, src2, w0, store_2); - store_3 = conv_horiz_rgba_2_u8_i16::(bounds_start, src3, w0, store_3); - jx += 2; - } - - let mut store_0 = vadd_s16(vget_low_s16(store_0), vget_high_s16(store_0)); - let mut store_1 = vadd_s16(vget_low_s16(store_1), vget_high_s16(store_1)); - let mut store_2 = vadd_s16(vget_low_s16(store_2), vget_high_s16(store_2)); - let mut store_3 = vadd_s16(vget_low_s16(store_3), vget_high_s16(store_3)); - - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let bounds_start = bounds.start + jx; - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_1_u8_i16::(bounds_start, src0, weight0, store_0); - store_1 = conv_horiz_rgba_1_u8_i16::(bounds_start, src1, weight0, store_1); - store_2 = conv_horiz_rgba_1_u8_i16::(bounds_start, src2, weight0, store_2); - store_3 = conv_horiz_rgba_1_u8_i16::(bounds_start, src3, weight0, store_3); - jx += 1; - } - - let store_16_0 = vshr_n_s16::(store_0); - let store_16_1 = vshr_n_s16::(store_1); - let store_16_2 = vshr_n_s16::(store_2); - let store_16_3 = vshr_n_s16::(store_3); - - let store_16_8_0 = vqmovun_s16(vcombine_s16(store_16_0, store_16_0)); - let store_16_8_1 = vqmovun_s16(vcombine_s16(store_16_1, store_16_1)); - let store_16_8_2 = vqmovun_s16(vcombine_s16(store_16_2, store_16_2)); - let store_16_8 = vqmovun_s16(vcombine_s16(store_16_3, store_16_3)); - - vst1_lane_u32::<0>( - chunk0.as_mut_ptr() as *mut u32, - vreinterpret_u32_u8(store_16_8_0), - ); - vst1_lane_u32::<0>( - chunk1.as_mut_ptr() as *mut u32, - vreinterpret_u32_u8(store_16_8_1), - ); - vst1_lane_u32::<0>( - chunk2.as_mut_ptr() as *mut u32, - vreinterpret_u32_u8(store_16_8_2), - ); - vst1_lane_u32::<0>( - chunk3.as_mut_ptr() as *mut u32, - vreinterpret_u32_u8(store_16_8), - ); - } + convolve_horizontal_rgba_neon_rows_4_u8_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } -pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( +fn convolve_horizontal_rgba_neon_rows_4_u8_impl( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -408,7 +160,8 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( ) { unsafe { const CHANNELS: usize = 4; - let init = 
vdupq_n_s32(ROUNDING_CONST); + let rnd_const: i32 = (1 << (PRECISION - 1)) - 1; + let init = vdupq_n_s32(rnd_const); let (row0_ref, rest) = dst.split_at_mut(dst_stride); let (row1_ref, rest) = rest.split_at_mut(dst_stride); @@ -447,10 +200,10 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); let weights_set = vld1q_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_8_u8(bounds_start, src0, weights_set, store_0); - store_1 = conv_horiz_rgba_8_u8(bounds_start, src1, weights_set, store_1); - store_2 = conv_horiz_rgba_8_u8(bounds_start, src2, weights_set, store_2); - store_3 = conv_horiz_rgba_8_u8(bounds_start, src3, weights_set, store_3); + store_0 = conv_horiz_rgba_8_u8::(bounds_start, src0, weights_set, store_0); + store_1 = conv_horiz_rgba_8_u8::(bounds_start, src1, weights_set, store_1); + store_2 = conv_horiz_rgba_8_u8::(bounds_start, src2, weights_set, store_2); + store_3 = conv_horiz_rgba_8_u8::(bounds_start, src3, weights_set, store_3); jx += 8; } @@ -458,10 +211,10 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, weights, store_0); - store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, weights, store_1); - store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, weights, store_2); - store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, weights, store_3); + store_0 = conv_horiz_rgba_4_u8::(bounds_start, src0, weights, store_0); + store_1 = conv_horiz_rgba_4_u8::(bounds_start, src1, weights, store_1); + store_2 = conv_horiz_rgba_4_u8::(bounds_start, src2, weights, store_2); + store_3 = conv_horiz_rgba_4_u8::(bounds_start, src3, weights, store_3); jx += 4; } @@ -470,10 +223,10 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( let bounds_start = bounds.start + jx; let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); - store_0 = conv_horiz_rgba_2_u8(bounds_start, src0, v_weight, store_0); - store_1 = conv_horiz_rgba_2_u8(bounds_start, src1, v_weight, store_1); - store_2 = conv_horiz_rgba_2_u8(bounds_start, src2, v_weight, store_2); - store_3 = conv_horiz_rgba_2_u8(bounds_start, src3, v_weight, store_3); + store_0 = conv_horiz_rgba_2_u8::(bounds_start, src0, v_weight, store_0); + store_1 = conv_horiz_rgba_2_u8::(bounds_start, src1, v_weight, store_1); + store_2 = conv_horiz_rgba_2_u8::(bounds_start, src2, v_weight, store_2); + store_3 = conv_horiz_rgba_2_u8::(bounds_start, src3, v_weight, store_3); jx += 2; } @@ -481,10 +234,10 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( let w_ptr = weights.get_unchecked(jx..(jx + 1)); let bounds_start = bounds.start + jx; let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_1_u8(bounds_start, src0, weight0, store_0); - store_1 = conv_horiz_rgba_1_u8(bounds_start, src1, weight0, store_1); - store_2 = conv_horiz_rgba_1_u8(bounds_start, src2, weight0, store_2); - store_3 = conv_horiz_rgba_1_u8(bounds_start, src3, weight0, store_3); + store_0 = conv_horiz_rgba_1_u8::(bounds_start, src0, weight0, store_0); + store_1 = conv_horiz_rgba_1_u8::(bounds_start, src1, weight0, store_1); + store_2 = conv_horiz_rgba_1_u8::(bounds_start, src2, weight0, store_2); + store_3 = conv_horiz_rgba_1_u8::(bounds_start, src3, weight0, store_3); jx += 1; } @@ -522,9 +275,26 @@ pub(crate) fn 
convolve_horizontal_rgba_neon_row( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, +) { + convolve_horizontal_rgba_neon_row_impl::(src, dst, filter_weights); +} + +pub(crate) fn convolve_horizontal_rgba_neon_row_q( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + convolve_horizontal_rgba_neon_row_impl::(src, dst, filter_weights); +} + +fn convolve_horizontal_rgba_neon_row_impl( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, ) { unsafe { const CHANNELS: usize = 4; + let rnd_const: i32 = (1 << (PRECISION - 1)) - 1; for ((dst, bounds), weights) in dst .chunks_exact_mut(CHANNELS) @@ -537,13 +307,13 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( { let bounds_size = bounds.size; let mut jx = 0usize; - let mut store = vdupq_n_s32(ROUNDING_CONST); + let mut store = vdupq_n_s32(rnd_const); while jx + 8 < bounds_size { let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); let weights_set = vld1q_s16(w_ptr.as_ptr()); - store = conv_horiz_rgba_8_u8(bounds_start, src, weights_set, store); + store = conv_horiz_rgba_8_u8::(bounds_start, src, weights_set, store); jx += 8; } @@ -551,7 +321,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_4_u8(bounds_start, src, weights, store); + store = conv_horiz_rgba_4_u8::(bounds_start, src, weights, store); jx += 4; } @@ -560,7 +330,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( let bounds_start = bounds.start + jx; let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); - store = conv_horiz_rgba_2_u8(bounds_start, src, v_weight, store); + store = conv_horiz_rgba_2_u8::(bounds_start, src, v_weight, store); jx += 2; } @@ -568,7 +338,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( let w_ptr = weights.get_unchecked(jx..(jx + 1)); let weight0 = vld1_dup_s16(w_ptr.as_ptr()); let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_1_u8(bounds_start, src, weight0, store); + store = conv_horiz_rgba_1_u8::(bounds_start, src, weight0, store); jx += 1; } @@ -582,132 +352,3 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( } } } - -/// Checking NEON `rdm` availability is required before a call. -/// -/// RDM feature has slightly lower precision and won't work really well on huge kernel which -/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. -/// -/// # Safety -/// - Check `rdm` availability before the call. 
-pub(crate) fn convolve_horizontal_rgba_neon_row_i16( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - unsafe { - convolve_horizontal_rgba_neon_row_i16_impl(src, dst, filter_weights); - } -} - -#[target_feature(enable = "rdm")] -unsafe fn convolve_horizontal_rgba_neon_row_i16_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - const SCALE: i32 = 6; - const ROUNDING: i16 = 1 << (SCALE - 1); - const CHANNELS: usize = 4; - - let weights_distribute: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; - let v_w_distribute0 = vld1q_u8(weights_distribute.as_ptr()); - let weights_distribute1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7]; - let v_w_distribute1 = vld1q_u8(weights_distribute1.as_ptr()); - let weights_distribute2: [u8; 16] = [8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11]; - let v_w_distribute2 = vld1q_u8(weights_distribute2.as_ptr()); - let weights_distribute3: [u8; 16] = - [12, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15]; - let v_w_distribute3 = vld1q_u8(weights_distribute3.as_ptr()); - - let initial_val = vcombine_s16(vdup_n_s16(ROUNDING), vdup_n_s16(0)); - - for ((dst, bounds), weights) in dst - .chunks_exact_mut(CHANNELS) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let bounds_size = bounds.size; - let mut jx = 0usize; - let mut store = initial_val; - - while jx + 8 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let weights_set = vld1q_s16(w_ptr.as_ptr()); - - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute0, - )); - let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute1, - )); - let w2 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute2, - )); - let w3 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute3, - )); - - store = conv_horiz_rgba_8_u8_i16::(bounds_start, src, w0, w1, w2, w3, store); - jx += 8; - } - - while jx + 4 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vld1_s16(w_ptr.as_ptr()); - let bounds_start = bounds.start + jx; - - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), - v_w_distribute0, - )); - let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), - v_w_distribute1, - )); - - store = conv_horiz_rgba_4_u8_i16::(bounds_start, src, w0, w1, store); - jx += 4; - } - - while jx + 2 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(w_ptr.as_ptr() as *const i32)); - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(v_weight, vdup_n_s16(0))), - v_w_distribute0, - )); - store = conv_horiz_rgba_2_u8_i16::(bounds_start, src, w0, store); - jx += 2; - } - - let mut store = vadd_s16(vget_low_s16(store), vget_high_s16(store)); - - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_1_u8_i16::(bounds_start, src, weight0, store); - jx += 1; - } - - let store_16 = vshr_n_s16::(store); - - let store_16_8 = vqmovun_s16(vcombine_s16(store_16, store_16)); - - vst1_lane_u32::<0>( 
- dst.as_mut_ptr() as *mut u32, - vreinterpret_u32_u8(store_16_8), - ); - } -} diff --git a/src/neon/rgba_u8_rdm.rs b/src/neon/rgba_u8_rdm.rs new file mode 100644 index 0000000..591dc6d --- /dev/null +++ b/src/neon/rgba_u8_rdm.rs @@ -0,0 +1,440 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +use crate::filter_weights::FilterWeights; +use crate::neon::utils::{expand8_high_to_14, expand8_to_14, load_4b_as_u8x8, xvld1q_u8_x2}; +use std::arch::aarch64::*; + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_8_u8_i16( + start_x: usize, + src: &[u8], + w0: int16x8_t, + w1: int16x8_t, + w2: int16x8_t, + w3: int16x8_t, + store: int16x8_t, +) -> int16x8_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); + + let rgba_pixel = xvld1q_u8_x2(src_ptr.as_ptr()); + + let hi0 = expand8_high_to_14(rgba_pixel.0); + let lo0 = expand8_to_14(vget_low_u8(rgba_pixel.0)); + let hi1 = expand8_high_to_14(rgba_pixel.1); + let lo1 = expand8_to_14(vget_low_u8(rgba_pixel.1)); + + let mut p = vqrdmlahq_s16(store, lo0, w0); + p = vqrdmlahq_s16(p, hi0, w1); + p = vqrdmlahq_s16(p, lo1, w2); + vqrdmlahq_s16(p, hi1, w3) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_2_u8_i16( + start_x: usize, + src: &[u8], + weights: int16x8_t, + store: int16x8_t, +) -> int16x8_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); + + let rgb_pixel = vld1_u8(src_ptr.as_ptr()); + let wide = expand8_to_14(rgb_pixel); + + vqrdmlahq_s16(store, wide, weights) +} + +#[inline(always)] +unsafe fn conv_horiz_rgba_4_u8_i16( + start_x: usize, + src: &[u8], + w0: int16x8_t, + w1: int16x8_t, + store: int16x8_t, +) -> int16x8_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); + + let rgba_pixel = vld1q_u8(src_ptr.as_ptr()); + + let hi = expand8_high_to_14(rgba_pixel); + let lo = expand8_to_14(vget_low_u8(rgba_pixel)); + + let p = vqrdmlahq_s16(store, lo, w0); + vqrdmlahq_s16(p, hi, w1) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_1_u8_i16( + start_x: usize, + src: &[u8], + w0: int16x4_t, + store: int16x4_t, +) -> int16x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); + let rgba_pixel = expand8_to_14(load_4b_as_u8x8(src_ptr.as_ptr())); + vqrdmlah_s16(store, vget_low_s16(rgba_pixel), w0) +} + +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8_i16( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + unsafe { + convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); + } +} + +/// Slightly lower precision scale option +/// +/// # Safety +/// - Check `rdm` availability before the call. 
+#[target_feature(enable = "rdm")] +unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + const CHANNELS: usize = 4; + const SCALE: i32 = 6; + const ROUNDING: i16 = 1 << (SCALE - 1); + + let weights_distribute: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; + let v_w_distribute0 = vld1q_u8(weights_distribute.as_ptr()); + let weights_distribute1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7]; + let v_w_distribute1 = vld1q_u8(weights_distribute1.as_ptr()); + let weights_distribute2: [u8; 16] = [8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11]; + let v_w_distribute2 = vld1q_u8(weights_distribute2.as_ptr()); + let weights_distribute3: [u8; 16] = + [12, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15]; + let v_w_distribute3 = vld1q_u8(weights_distribute3.as_ptr()); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + let initial_val = vcombine_s16(vdup_n_s16(ROUNDING), vdup_n_s16(0)); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + + let bounds_size = bounds.size; + + let mut store_0 = initial_val; + let mut store_1 = initial_val; + let mut store_2 = initial_val; + let mut store_3 = initial_val; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + let weights_set = vld1q_s16(w_ptr.as_ptr()); + + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute0, + )); + let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute1, + )); + let w2 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute2, + )); + let w3 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute3, + )); + + store_0 = + conv_horiz_rgba_8_u8_i16::(bounds_start, src0, w0, w1, w2, w3, store_0); + store_1 = + conv_horiz_rgba_8_u8_i16::(bounds_start, src1, w0, w1, w2, w3, store_1); + store_2 = + conv_horiz_rgba_8_u8_i16::(bounds_start, src2, w0, w1, w2, w3, store_2); + store_3 = + conv_horiz_rgba_8_u8_i16::(bounds_start, src3, w0, w1, w2, w3, store_3); + jx += 8; + } + + while jx + 4 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = vld1_s16(w_ptr.as_ptr()); + + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), + v_w_distribute0, + )); + let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), + v_w_distribute1, + )); + + store_0 = conv_horiz_rgba_4_u8_i16::(bounds_start, src0, w0, w1, store_0); + store_1 = conv_horiz_rgba_4_u8_i16::(bounds_start, src1, w0, w1, 
store_1); + store_2 = conv_horiz_rgba_4_u8_i16::(bounds_start, src2, w0, w1, store_2); + store_3 = conv_horiz_rgba_4_u8_i16::(bounds_start, src3, w0, w1, store_3); + jx += 4; + } + + while jx + 2 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(w_ptr.as_ptr() as *const i32)); + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(v_weight, vdup_n_s16(0))), + v_w_distribute0, + )); + store_0 = conv_horiz_rgba_2_u8_i16::(bounds_start, src0, w0, store_0); + store_1 = conv_horiz_rgba_2_u8_i16::(bounds_start, src1, w0, store_1); + store_2 = conv_horiz_rgba_2_u8_i16::(bounds_start, src2, w0, store_2); + store_3 = conv_horiz_rgba_2_u8_i16::(bounds_start, src3, w0, store_3); + jx += 2; + } + + let mut store_0 = vadd_s16(vget_low_s16(store_0), vget_high_s16(store_0)); + let mut store_1 = vadd_s16(vget_low_s16(store_1), vget_high_s16(store_1)); + let mut store_2 = vadd_s16(vget_low_s16(store_2), vget_high_s16(store_2)); + let mut store_3 = vadd_s16(vget_low_s16(store_3), vget_high_s16(store_3)); + + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let bounds_start = bounds.start + jx; + let weight0 = vld1_dup_s16(w_ptr.as_ptr()); + store_0 = conv_horiz_rgba_1_u8_i16::(bounds_start, src0, weight0, store_0); + store_1 = conv_horiz_rgba_1_u8_i16::(bounds_start, src1, weight0, store_1); + store_2 = conv_horiz_rgba_1_u8_i16::(bounds_start, src2, weight0, store_2); + store_3 = conv_horiz_rgba_1_u8_i16::(bounds_start, src3, weight0, store_3); + jx += 1; + } + + let store_16_0 = vshr_n_s16::(store_0); + let store_16_1 = vshr_n_s16::(store_1); + let store_16_2 = vshr_n_s16::(store_2); + let store_16_3 = vshr_n_s16::(store_3); + + let store_16_8_0 = vqmovun_s16(vcombine_s16(store_16_0, store_16_0)); + let store_16_8_1 = vqmovun_s16(vcombine_s16(store_16_1, store_16_1)); + let store_16_8_2 = vqmovun_s16(vcombine_s16(store_16_2, store_16_2)); + let store_16_8 = vqmovun_s16(vcombine_s16(store_16_3, store_16_3)); + + vst1_lane_u32::<0>( + chunk0.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_0), + ); + vst1_lane_u32::<0>( + chunk1.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_1), + ); + vst1_lane_u32::<0>( + chunk2.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_2), + ); + vst1_lane_u32::<0>( + chunk3.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); + } +} + +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. 
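
Before the single-row variant below: the safety notes in this file all assume the caller has already verified the CPU's RDM extension. A hedged sketch of the runtime gate a dispatcher higher up might use (the helper name is illustrative, not the crate's actual selection logic):

    #[cfg(target_arch = "aarch64")]
    fn use_rdm_path() -> bool {
        // Both the cargo feature and the runtime CPU flag have to agree before the
        // vqrdmlah-based rows are safe to call.
        cfg!(feature = "rdm") && std::arch::is_aarch64_feature_detected!("rdm")
    }

    // e.g.
    // if use_rdm_path() {
    //     convolve_horizontal_rgba_neon_rows_4_u8_i16(src, src_stride, dst, dst_stride, weights);
    // } else {
    //     // higher-precision i32 path
    // }
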
+pub(crate) fn convolve_horizontal_rgba_neon_row_i16( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + unsafe { + convolve_horizontal_rgba_neon_row_i16_impl(src, dst, filter_weights); + } +} + +#[target_feature(enable = "rdm")] +unsafe fn convolve_horizontal_rgba_neon_row_i16_impl( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + const SCALE: i32 = 6; + const ROUNDING: i16 = 1 << (SCALE - 1); + const CHANNELS: usize = 4; + + let weights_distribute: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; + let v_w_distribute0 = vld1q_u8(weights_distribute.as_ptr()); + let weights_distribute1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7]; + let v_w_distribute1 = vld1q_u8(weights_distribute1.as_ptr()); + let weights_distribute2: [u8; 16] = [8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11]; + let v_w_distribute2 = vld1q_u8(weights_distribute2.as_ptr()); + let weights_distribute3: [u8; 16] = + [12, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15]; + let v_w_distribute3 = vld1q_u8(weights_distribute3.as_ptr()); + + let initial_val = vcombine_s16(vdup_n_s16(ROUNDING), vdup_n_s16(0)); + + for ((dst, bounds), weights) in dst + .chunks_exact_mut(CHANNELS) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let bounds_size = bounds.size; + let mut jx = 0usize; + let mut store = initial_val; + + while jx + 8 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + let weights_set = vld1q_s16(w_ptr.as_ptr()); + + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute0, + )); + let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute1, + )); + let w2 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute2, + )); + let w3 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute3, + )); + + store = conv_horiz_rgba_8_u8_i16::(bounds_start, src, w0, w1, w2, w3, store); + jx += 8; + } + + while jx + 4 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = vld1_s16(w_ptr.as_ptr()); + let bounds_start = bounds.start + jx; + + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), + v_w_distribute0, + )); + let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), + v_w_distribute1, + )); + + store = conv_horiz_rgba_4_u8_i16::(bounds_start, src, w0, w1, store); + jx += 4; + } + + while jx + 2 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(w_ptr.as_ptr() as *const i32)); + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(v_weight, vdup_n_s16(0))), + v_w_distribute0, + )); + store = conv_horiz_rgba_2_u8_i16::(bounds_start, src, w0, store); + jx += 2; + } + + let mut store = vadd_s16(vget_low_s16(store), vget_high_s16(store)); + + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let weight0 = vld1_dup_s16(w_ptr.as_ptr()); + let bounds_start = bounds.start + jx; + store = conv_horiz_rgba_1_u8_i16::(bounds_start, src, weight0, store); + jx += 1; + } + + let store_16 = vshr_n_s16::(store); + + let store_16_8 = vqmovun_s16(vcombine_s16(store_16, store_16)); + + vst1_lane_u32::<0>( 
+ dst.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); + } +} diff --git a/src/neon/utils.rs b/src/neon/utils.rs index a755d62..67b3546 100644 --- a/src/neon/utils.rs +++ b/src/neon/utils.rs @@ -30,12 +30,14 @@ use std::arch::aarch64::*; #[inline(always)] +#[cfg(feature = "rdm")] pub(crate) unsafe fn expand8_to_14(row: uint8x8_t) -> int16x8_t { let row = vcombine_u8(row, row); vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip1q_u8(row, row)))) } #[inline(always)] +#[cfg(feature = "rdm")] pub(crate) unsafe fn expand8_high_to_14(row: uint8x16_t) -> int16x8_t { vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip2q_u8(row, row)))) } @@ -155,6 +157,7 @@ pub(crate) unsafe fn load_3b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { } #[inline(always)] +#[cfg(feature = "rdm")] pub(crate) unsafe fn load_3b_as_u8x16(src_ptr: *const u8) -> uint8x16_t { let v = vreinterpretq_u8_u16(vld1q_lane_u16::<0>(src_ptr as *const u16, vdupq_n_u16(0))); vld1q_lane_u8::<2>(src_ptr.add(2), v) @@ -167,6 +170,7 @@ pub(crate) unsafe fn load_4b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { } #[inline(always)] +#[cfg(feature = "rdm")] pub(crate) unsafe fn load_4b_as_u8x8(src_ptr: *const u8) -> uint8x8_t { vreinterpret_u8_u32(vld1_lane_u32::<0>(src_ptr as *const u32, vdup_n_u32(0))) } @@ -179,6 +183,7 @@ pub(crate) unsafe fn xvld1q_s16_x2(a: *const i16) -> int16x8x2_t { } #[inline(always)] +#[cfg(feature = "rdm")] pub(crate) unsafe fn xvld1q_s16_x4(a: *const i16) -> int16x8x4_t { let v0 = vld1q_s16(a); let v1 = vld1q_s16(a.add(8)); @@ -186,3 +191,81 @@ pub(crate) unsafe fn xvld1q_s16_x4(a: *const i16) -> int16x8x4_t { let v3 = vld1q_s16(a.add(24)); int16x8x4_t(v0, v1, v2, v3) } + +#[inline(always)] +pub(crate) unsafe fn vxmlal_high_lane_s16( + a: int32x4_t, + b: int16x8_t, + c: int16x4_t, +) -> int32x4_t { + if D { + vqdmlal_high_lane_s16::(a, b, c) + } else { + vmlal_high_lane_s16::(a, b, c) + } +} + +#[inline(always)] +pub(crate) unsafe fn vxmlal_lane_s16( + a: int32x4_t, + b: int16x4_t, + c: int16x4_t, +) -> int32x4_t { + if D { + vqdmlal_lane_s16::(a, b, c) + } else { + vmlal_lane_s16::(a, b, c) + } +} + +#[inline(always)] +pub(crate) unsafe fn vxmlal_s16( + a: int32x4_t, + b: int16x4_t, + c: int16x4_t, +) -> int32x4_t { + if D { + vqdmlal_s16(a, b, c) + } else { + vmlal_s16(a, b, c) + } +} + +#[inline(always)] +pub(crate) unsafe fn vxmlal_high_s16( + a: int32x4_t, + b: int16x8_t, + c: int16x8_t, +) -> int32x4_t { + if D { + vqdmlal_high_s16(a, b, c) + } else { + vmlal_high_s16(a, b, c) + } +} + +#[inline(always)] +pub(crate) unsafe fn vxmlal_high_laneq_s16( + a: int32x4_t, + b: int16x8_t, + c: int16x8_t, +) -> int32x4_t { + if D { + vqdmlal_high_laneq_s16::(a, b, c) + } else { + vmlal_high_laneq_s16::(a, b, c) + } +} + +#[inline(always)] +pub unsafe fn vxmlal_laneq_s16( + a: int32x4_t, + b: int16x4_t, + c: int16x8_t, +) -> int32x4_t { + if D { + vqdmlal_laneq_s16::(a, b, c) + } else { + vmlal_laneq_s16::(a, b, c) + } +} diff --git a/src/neon/vertical_ar30.rs b/src/neon/vertical_ar30_rdm.rs similarity index 77% rename from src/neon/vertical_ar30.rs rename to src/neon/vertical_ar30_rdm.rs index b5c3f2f..c843c85 100644 --- a/src/neon/vertical_ar30.rs +++ b/src/neon/vertical_ar30_rdm.rs @@ -28,7 +28,7 @@ */ use crate::filter_weights::FilterBounds; use crate::fixed_point_vertical_ar30::convolve_column_handler_fip_db_ar30; -use crate::neon::ar30::{vunzip_4_ar30, vzip_4_ar30}; +use crate::neon::ar30::{vunzip_3_ar30, vzip_4_ar30}; use std::arch::aarch64::{ int16x8x4_t, vdupq_n_s16, 
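
Stepping back to the `vxmlal_*` helpers added to `src/neon/utils.rs` above: they switch between the plain widening multiply-accumulate (`vmlal_*`) and the saturating doubling variant (`vqdmlal_*`) on a const flag (the `if D` branch; the angle-bracketed parameter lists are lost in this rendering, presumably something like `<const D: bool, const LANE: i32>`), so each caller monomorphises to exactly one instruction with no runtime branch. A scalar sketch of the same pattern, with a hypothetical name:

    // Hypothetical scalar analogue of the vxmlal_* dispatch: `D` picks the
    // "doubling" flavour at compile time.
    #[inline(always)]
    fn xmlal<const D: bool>(acc: i32, a: i16, b: i16) -> i32 {
        if D {
            // vqdmlal-style: widen, double, saturating add.
            acc.saturating_add(2 * i32::from(a) * i32::from(b))
        } else {
            // vmlal-style: plain widening multiply-accumulate.
            acc + i32::from(a) * i32::from(b)
        }
    }

    fn main() {
        assert_eq!(xmlal::<false>(0, 100, 3), 300);
        assert_eq!(xmlal::<true>(0, 100, 3), 600); // doubled product
    }
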
vld1q_u32_x2, vmaxq_s16, vminq_s16, vqrdmlahq_s16, vqrdmulhq_s16, vrshrq_n_s16, vshlq_n_s16, vst1q_u32_x2, @@ -40,8 +40,8 @@ pub(crate) fn neon_column_handler_fixed_point_ar30< const AR30_ORDER: usize, >( bounds: &FilterBounds, - src: &[u32], - dst: &mut [u32], + src: &[u8], + dst: &mut [u8], src_stride: usize, weight: &[i16], ) { @@ -58,14 +58,14 @@ unsafe fn neon_column_handler_fixed_point_ar30_impl< const AR30_ORDER: usize, >( bounds: &FilterBounds, - src: &[u32], - dst: &mut [u32], + src: &[u8], + dst: &mut [u8], src_stride: usize, weight: &[i16], ) { let mut cx = 0usize; - let total_width = dst.len(); + let total_width = dst.len() / 4; const PREC: i32 = 5; const BACK: i32 = 5; @@ -77,18 +77,18 @@ unsafe fn neon_column_handler_fixed_point_ar30_impl< let v_max = vdupq_n_s16(1023); let zeros = vdupq_n_s16(0); let filter = weight; - let v_start_px = cx; + let v_start_px = cx * 4; let py = bounds.start; let weight = vdupq_n_s16(filter[0]); let offset = src_stride * py + v_start_px; let src_ptr = src.get_unchecked(offset..(offset + 8)); - let ps = vunzip_4_ar30::(vld1q_u32_x2(src_ptr.as_ptr())); + let ps = + vunzip_3_ar30::(vld1q_u32_x2(src_ptr.as_ptr() as *const _)); let mut v0 = vqrdmulhq_s16(vshlq_n_s16::(ps.0), weight); let mut v1 = vqrdmulhq_s16(vshlq_n_s16::(ps.1), weight); let mut v2 = vqrdmulhq_s16(vshlq_n_s16::(ps.2), weight); - let mut v3 = vqrdmulhq_s16(vshlq_n_s16::(ps.3), weight); if bounds_size == 2 { let weights = filter.get_unchecked(0..2); @@ -97,11 +97,12 @@ unsafe fn neon_column_handler_fixed_point_ar30_impl< let v_weight1 = vdupq_n_s16(weights[1]); - let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + let ps1 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr1.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps1.2), v_weight1); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps1.3), v_weight1); } else if bounds_size == 3 { let weights = filter.get_unchecked(0..3); let py = bounds.start; @@ -111,16 +112,18 @@ unsafe fn neon_column_handler_fixed_point_ar30_impl< let v_weight1 = vdupq_n_s16(weights[1]); let v_weight2 = vdupq_n_s16(weights[2]); - let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + let ps1 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr1.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps1.2), v_weight1); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps1.3), v_weight1); - let ps2 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr2.as_ptr())); + let ps2 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr2.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps2.0), v_weight2); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps2.1), v_weight2); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps2.2), v_weight2); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps2.3), v_weight2); } else if bounds_size == 4 { let weights = filter.get_unchecked(0..4); let py = bounds.start; @@ -132,46 +135,54 @@ unsafe fn neon_column_handler_fixed_point_ar30_impl< let v_weight2 = vdupq_n_s16(weights[2]); let v_weight3 = vdupq_n_s16(weights[3]); - let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + let ps1 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr1.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps1.2), v_weight1); - v3 = vqrdmlahq_s16(v3, 
vshlq_n_s16::(ps1.3), v_weight1); - let ps2 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr2.as_ptr())); + let ps2 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr2.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps2.0), v_weight2); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps2.1), v_weight2); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps2.2), v_weight2); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps2.3), v_weight2); - let ps3 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr3.as_ptr())); + let ps3 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr3.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps3.0), v_weight3); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps3.1), v_weight3); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps3.2), v_weight3); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps3.3), v_weight3); } else { for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; let weight = vdupq_n_s16(k_weight); let offset = src_stride * py + v_start_px; - let src_ptr = src.get_unchecked(offset..(offset + 8)); + let src_ptr = src.get_unchecked(offset..(offset + 8 * 4)); - let ps = vunzip_4_ar30::(vld1q_u32_x2(src_ptr.as_ptr())); + let ps = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps.0), weight); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps.1), weight); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps.2), weight); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps.3), weight); } } - let v_dst = dst.get_unchecked_mut(v_start_px..(v_start_px + 8)); + let v_dst = dst.get_unchecked_mut(v_start_px..(v_start_px + 8 * 4)); - v0 = vmaxq_s16(vminq_s16(vrshrq_n_s16::(v0), v_max), zeros); - v1 = vmaxq_s16(vminq_s16(vrshrq_n_s16::(v1), v_max), zeros); - v2 = vmaxq_s16(vminq_s16(vrshrq_n_s16::(v2), v_max), zeros); - v3 = vmaxq_s16(vrshrq_n_s16::(v3), zeros); + v0 = vrshrq_n_s16::(v0); + v1 = vrshrq_n_s16::(v1); + v2 = vrshrq_n_s16::(v2); - let vals = vzip_4_ar30::(int16x8x4_t(v0, v1, v2, v3)); - vst1q_u32_x2(v_dst.as_mut_ptr(), vals); + v0 = vmaxq_s16(vminq_s16(v0, v_max), zeros); + v1 = vmaxq_s16(vminq_s16(v1, v_max), zeros); + v2 = vmaxq_s16(vminq_s16(v2, v_max), zeros); + + let vals = + vzip_4_ar30::(int16x8x4_t(v0, v1, v2, vdupq_n_s16(3))); + vst1q_u32_x2(v_dst.as_mut_ptr() as *mut _, vals); } cx += 8; diff --git a/src/neon/vertical_f16.rs b/src/neon/vertical_f16.rs index 5a3fd5b..ce4496a 100644 --- a/src/neon/vertical_f16.rs +++ b/src/neon/vertical_f16.rs @@ -32,6 +32,7 @@ use crate::filter_weights::FilterBounds; use crate::neon::convolve_f16::convolve_vertical_part_neon_8_f16; use crate::neon::utils::prefer_vfmaq_f32; use crate::neon::*; +use core::f16; macro_rules! conv_vertical_part_neon_16_f16 { ($start_y: expr, $start_x: expr, $src: expr, $src_stride: expr, $dst: expr, $filter: expr, $bounds: expr) => {{ @@ -239,8 +240,8 @@ macro_rules! conv_vertical_part_neon_48_f16 { pub(crate) fn convolve_vertical_rgb_neon_row_f16( _: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { diff --git a/src/neon/vertical_f16_fhm.rs b/src/neon/vertical_f16_fhm.rs new file mode 100644 index 0000000..482d110 --- /dev/null +++ b/src/neon/vertical_f16_fhm.rs @@ -0,0 +1,323 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. 
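
Before the body of the new `vertical_f16_fhm.rs` file below, a note on the `vertical_ar30_rdm.rs` hunks above: the column pass now takes raw bytes (`&[u8]`), unpacks only the three 10-bit colour channels (`vunzip_3_ar30` instead of `vunzip_4_ar30`, so the fourth `v3` accumulator disappears), and re-packs with a constant alpha lane of 3, i.e. fully opaque in the 2-bit alpha field. A scalar sketch of that pack, assuming the usual 2:10:10:10 layout with alpha in the top two bits (an assumption; the crate's AR30 order/type generics cover the byte-order variants):

    // Hypothetical scalar pack for one AR30 pixel: three 10-bit channels plus a
    // 2-bit alpha forced to 3 (opaque), mirroring the vdupq_n_s16(3) lane passed
    // to vzip_4_ar30 in the patch. The exact bit order is an assumption here.
    fn pack_ar30(r: u16, g: u16, b: u16) -> u32 {
        let (r, g, b) = (r.min(1023) as u32, g.min(1023) as u32, b.min(1023) as u32);
        (3u32 << 30) | (r << 20) | (g << 10) | b
    }

    fn main() {
        let px = pack_ar30(1023, 0, 512);
        assert_eq!(px >> 30, 3);              // alpha
        assert_eq!((px >> 20) & 0x3ff, 1023); // r
        assert_eq!(px & 0x3ff, 512);          // b
    }
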
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use std::arch::aarch64::*; + +use crate::filter_weights::FilterBounds; +use crate::neon::*; +use core::f16; + +#[inline(always)] +pub(crate) unsafe fn conv_vertical_part_neon_16_f16( + start_y: usize, + start_x: usize, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + filter: &[f16], + bounds: &FilterBounds, +) { + unsafe { + let mut store_0 = vdupq_n_f32(0.); + let mut store_1 = vdupq_n_f32(0.); + let mut store_2 = vdupq_n_f32(0.); + let mut store_3 = vdupq_n_f32(0.); + + let px = start_x; + + for j in 0..bounds.size { + let py = start_y + j; + let v_weight = xreinterpretq_f16_u16(vld1q_dup_u16( + filter.get_unchecked(j..).as_ptr() as *const _ + )); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); + + let s_ptr = src_ptr.add(px); + let item_row = xvldq_f16_x2(s_ptr); + + store_0 = xvfmlalq_low_f16(store_0, item_row.0, v_weight); + store_1 = xvfmlalq_high_f16(store_1, item_row.0, v_weight); + store_2 = xvfmlalq_low_f16(store_2, item_row.1, v_weight); + store_3 = xvfmlalq_high_f16(store_3, item_row.1, v_weight); + } + + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + let f_set = x_float16x8x2_t( + xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), + xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), + ); + xvstq_f16_x2(dst_ptr, f_set); + } +} + +#[inline(always)] +pub(crate) unsafe fn conv_vertical_part_neon_32_f16( + start_y: usize, + start_x: usize, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + filter: &[f16], + bounds: &FilterBounds, +) { + let mut store_0 = vdupq_n_f32(0.); + let mut store_1 = vdupq_n_f32(0.); + let mut store_2 = vdupq_n_f32(0.); + let mut store_3 = vdupq_n_f32(0.); + let mut store_4 = vdupq_n_f32(0.); + let mut store_5 = vdupq_n_f32(0.); + let mut store_6 = vdupq_n_f32(0.); + let mut store_7 = vdupq_n_f32(0.); + + let px = start_x; + + for j in 0..bounds.size { + let py = start_y + j; + let v_weight = + xreinterpretq_f16_u16(vld1q_dup_u16(filter.get_unchecked(j..).as_ptr() as *const _)); + let src_ptr = 
src.get_unchecked(src_stride * py..).as_ptr(); + + let s_ptr = src_ptr.add(px); + let item_row = xvldq_f16_x4(s_ptr); + + store_0 = xvfmlalq_low_f16(store_0, item_row.0, v_weight); + store_1 = xvfmlalq_high_f16(store_1, item_row.0, v_weight); + store_2 = xvfmlalq_low_f16(store_2, item_row.1, v_weight); + store_3 = xvfmlalq_high_f16(store_3, item_row.1, v_weight); + + store_4 = xvfmlalq_low_f16(store_4, item_row.2, v_weight); + store_5 = xvfmlalq_high_f16(store_5, item_row.2, v_weight); + store_6 = xvfmlalq_low_f16(store_6, item_row.3, v_weight); + store_7 = xvfmlalq_high_f16(store_7, item_row.3, v_weight); + } + + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + let f_set = x_float16x8x4_t( + xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), + xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), + xcombine_f16(xvcvt_f16_f32(store_4), xvcvt_f16_f32(store_5)), + xcombine_f16(xvcvt_f16_f32(store_6), xvcvt_f16_f32(store_7)), + ); + xvstq_f16_x4(dst_ptr, f_set); +} + +#[inline(always)] +pub(crate) unsafe fn conv_vertical_part_neon_48_f16( + start_y: usize, + start_x: usize, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + filter: &[f16], + bounds: &FilterBounds, +) { + unsafe { + let mut store_0 = vdupq_n_f32(0.); + let mut store_1 = vdupq_n_f32(0.); + let mut store_2 = vdupq_n_f32(0.); + let mut store_3 = vdupq_n_f32(0.); + + let mut store_4 = vdupq_n_f32(0.); + let mut store_5 = vdupq_n_f32(0.); + let mut store_6 = vdupq_n_f32(0.); + let mut store_7 = vdupq_n_f32(0.); + + let mut store_8 = vdupq_n_f32(0.); + let mut store_9 = vdupq_n_f32(0.); + let mut store_10 = vdupq_n_f32(0.); + let mut store_11 = vdupq_n_f32(0.); + + let px = start_x; + + for j in 0..bounds.size { + let py = start_y + j; + let v_weight = xreinterpretq_f16_u16(vld1q_dup_u16( + filter.get_unchecked(j..).as_ptr() as *const _ + )); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); + + let s_ptr = src_ptr.add(px); + let item_row_0 = xvldq_f16_x4(s_ptr); + let item_row_1 = xvldq_f16_x2(s_ptr.add(32)); + + store_0 = xvfmlalq_low_f16(store_0, item_row_0.0, v_weight); + store_1 = xvfmlalq_high_f16(store_1, item_row_0.0, v_weight); + store_2 = xvfmlalq_low_f16(store_2, item_row_0.1, v_weight); + store_3 = xvfmlalq_high_f16(store_3, item_row_0.1, v_weight); + + store_4 = xvfmlalq_low_f16(store_4, item_row_0.2, v_weight); + store_5 = xvfmlalq_high_f16(store_5, item_row_0.2, v_weight); + store_6 = xvfmlalq_low_f16(store_6, item_row_0.3, v_weight); + store_7 = xvfmlalq_high_f16(store_7, item_row_0.3, v_weight); + + store_8 = xvfmlalq_low_f16(store_8, item_row_1.0, v_weight); + store_9 = xvfmlalq_high_f16(store_9, item_row_1.0, v_weight); + store_10 = xvfmlalq_low_f16(store_10, item_row_1.1, v_weight); + store_11 = xvfmlalq_high_f16(store_11, item_row_1.1, v_weight); + } + + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + let f_set = x_float16x8x4_t( + xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), + xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), + xcombine_f16(xvcvt_f16_f32(store_4), xvcvt_f16_f32(store_5)), + xcombine_f16(xvcvt_f16_f32(store_6), xvcvt_f16_f32(store_7)), + ); + xvstq_f16_x4(dst_ptr, f_set); + let dst_ptr2 = dst_ptr.add(32); + + let f_set1 = x_float16x8x2_t( + xcombine_f16(xvcvt_f16_f32(store_8), xvcvt_f16_f32(store_9)), + xcombine_f16(xvcvt_f16_f32(store_10), xvcvt_f16_f32(store_11)), + ); + xvstq_f16_x2(dst_ptr2, f_set1); + } +} + +pub(crate) fn convolve_vertical_rgb_neon_row_f16_fhm( + w0: usize, + bounds: &FilterBounds, + src: 
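
A note on the accumulation used in the blocks above (the row entry point continues below): `xvfmlalq_low_f16`/`xvfmlalq_high_f16` wrap the FEAT_FHM widening half-precision multiply-accumulates (FMLAL), so products of `f16` inputs are summed in `f32` lanes and only narrowed back once per output chunk via `xvcvt_f16_f32`. A scalar model of one such step, using the unstable built-in `f16` that the `nightly_f16` feature targets:

    #![feature(f16)] // matches the crate's nightly_f16 gate

    // Model of one FMLAL lane: widen both f16 operands to f32 and fuse the
    // multiply-add, so precision is only lost at the final f32 -> f16 store.
    fn fmlal(acc: f32, a: f16, b: f16) -> f32 {
        (a as f32).mul_add(b as f32, acc)
    }

    fn main() {
        let mut acc = 0.0f32;
        for (&px, &w) in [1.0f16, 2.0, 3.0].iter().zip(&[0.25f16, 0.5, 0.25]) {
            acc = fmlal(acc, px, w);
        }
        let out = acc as f16; // narrowed once, like xvcvt_f16_f32 at the end of the column
        assert_eq!(out as f32, 2.0); // 1*0.25 + 2*0.5 + 3*0.25
    }
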
&[f16], + dst: &mut [f16], + src_stride: usize, + weight_ptr: &[f16], +) { + unsafe { convolve_vertical_rgb_neon_row_f16_impl(w0, bounds, src, dst, src_stride, weight_ptr) } +} + +#[inline(always)] +unsafe fn convolve_vertical_part_neon_8_f16_fhm( + start_y: usize, + start_x: usize, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + filter: &[f16], + bounds: &FilterBounds, + blend_length: usize, +) { + let mut store_0 = vdupq_n_f32(0f32); + let mut store_1 = vdupq_n_f32(0f32); + + let px = start_x; + + for j in 0..bounds.size { + let py = start_y + j; + let v_weight = + xreinterpretq_f16_u16(vld1q_dup_u16(filter.get_unchecked(j..).as_ptr() as *const _)); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); + + let s_ptr = src_ptr.add(px); + let item_row = if USE_BLENDING { + let mut transient: [f16; 8] = [0.; 8]; + std::ptr::copy_nonoverlapping(s_ptr, transient.as_mut_ptr(), blend_length); + xvldq_f16(transient.as_ptr()) + } else { + xvldq_f16(s_ptr) + }; + + store_0 = xvfmlalq_low_f16(store_0, item_row, v_weight); + store_1 = xvfmlalq_high_f16(store_1, item_row, v_weight); + } + + let item = xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)); + + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + if USE_BLENDING { + let mut transient: [f16; 8] = [0.; 8]; + xvstq_f16(transient.as_mut_ptr(), item); + std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, blend_length); + } else { + xvstq_f16(dst_ptr, item); + } +} + +#[target_feature(enable = "fhm")] +unsafe fn convolve_vertical_rgb_neon_row_f16_impl( + _: usize, + bounds: &FilterBounds, + src: &[f16], + dst: &mut [f16], + src_stride: usize, + weight_ptr: &[f16], +) { + let mut cx = 0usize; + let dst_width = dst.len(); + + while cx + 48 < dst_width { + conv_vertical_part_neon_48_f16(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); + + cx += 48; + } + + while cx + 32 < dst_width { + conv_vertical_part_neon_32_f16(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); + + cx += 32; + } + + while cx + 16 < dst_width { + conv_vertical_part_neon_16_f16(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); + + cx += 16; + } + + while cx + 8 < dst_width { + unsafe { + convolve_vertical_part_neon_8_f16_fhm::( + bounds.start, + cx, + src, + src_stride, + dst, + weight_ptr, + bounds, + 8, + ); + } + + cx += 8; + } + + let left = dst_width - cx; + + if left > 0 { + unsafe { + convolve_vertical_part_neon_8_f16_fhm::( + bounds.start, + cx, + src, + src_stride, + dst, + weight_ptr, + bounds, + left, + ); + } + } +} diff --git a/src/neon/vertical_f16_full.rs b/src/neon/vertical_f16_full.rs index 3109835..bd317c0 100644 --- a/src/neon/vertical_f16_full.rs +++ b/src/neon/vertical_f16_full.rs @@ -30,14 +30,15 @@ use std::arch::aarch64::*; use crate::filter_weights::FilterBounds; use crate::neon::*; +use core::f16; #[inline(always)] pub(crate) unsafe fn xconvolve_vertical_part_neon_8_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, blend_length: usize, @@ -55,7 +56,7 @@ pub(crate) unsafe fn xconvolve_vertical_part_neon_8_f16(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); - store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); - store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = 
vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vqdmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vqdmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); - store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); - store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vqdmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vqdmlal_high_lane_s16::<1>(store3, item_row11, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); let mut v_weight = vld1_dup_s16(weights.as_ptr()); @@ -100,26 +102,26 @@ pub(crate) fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr().add(8))); - store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); - store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); - store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vqdmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vqdmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); - store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); - store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vqdmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vqdmlal_high_lane_s16::<1>(store3, item_row11, v_weight); let item_row20 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); let item_row21 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr().add(8))); - store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); - store1 = vmlal_high_lane_s16::<2>(store1, item_row20, v_weight); - store2 = vmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); - store3 = vmlal_high_lane_s16::<2>(store3, item_row21, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); + store1 = vqdmlal_high_lane_s16::<2>(store1, item_row20, v_weight); + store2 = vqdmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); + store3 = vqdmlal_high_lane_s16::<2>(store3, item_row21, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); @@ -134,34 +136,34 @@ pub(crate) fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); let item_row1 = 
vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr().add(8))); - store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); - store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); - store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vqdmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vqdmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); - store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); - store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vqdmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vqdmlal_high_lane_s16::<1>(store3, item_row11, v_weight); let item_row20 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); let item_row21 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr().add(8))); - store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); - store1 = vmlal_high_lane_s16::<2>(store1, item_row20, v_weight); - store2 = vmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); - store3 = vmlal_high_lane_s16::<2>(store3, item_row21, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); + store1 = vqdmlal_high_lane_s16::<2>(store1, item_row20, v_weight); + store2 = vqdmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); + store3 = vqdmlal_high_lane_s16::<2>(store3, item_row21, v_weight); let item_row30 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr())); let item_row31 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr().add(8))); - store0 = vmlal_lane_s16::<3>(store0, vget_low_s16(item_row30), v_weight); - store1 = vmlal_high_lane_s16::<3>(store1, item_row30, v_weight); - store2 = vmlal_lane_s16::<3>(store2, vget_low_s16(item_row31), v_weight); - store3 = vmlal_high_lane_s16::<3>(store3, item_row31, v_weight); + store0 = vqdmlal_lane_s16::<3>(store0, vget_low_s16(item_row30), v_weight); + store1 = vqdmlal_high_lane_s16::<3>(store1, item_row30, v_weight); + store2 = vqdmlal_lane_s16::<3>(store2, vget_low_s16(item_row31), v_weight); + store3 = vqdmlal_high_lane_s16::<3>(store3, item_row31, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -179,20 +181,13 @@ pub(crate) fn convolve_column_lb_u16( } } - let item0 = vminq_u16( - vcombine_u16( - vqshrun_n_s32::(store0), - vqshrun_n_s32::(store1), - ), - v_max_colors, - ); - let item1 = vminq_u16( - vcombine_u16( - vqshrun_n_s32::(store2), - vqshrun_n_s32::(store3), - ), - v_max_colors, - ); + let store0 = vqshrun_n_s32::(store0); + let store1 = vqshrun_n_s32::(store1); + let store2 = vqshrun_n_s32::(store2); + let store3 = vqshrun_n_s32::(store3); + + let item0 = vminq_u16(vcombine_u16(store0, store1), v_max_colors); + let item1 = vminq_u16(vcombine_u16(store2, store3), v_max_colors); vst1q_u16(dst.as_mut_ptr(), 
item0); vst1q_u16(dst.as_mut_ptr().add(8), item1); @@ -222,13 +217,13 @@ pub(crate) fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row1, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); let mut v_weight = vld1_dup_s16(weights.as_ptr()); @@ -242,18 +237,18 @@ pub(crate) fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row1, v_weight); let item_row2 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); - store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); - store1 = vmlal_high_lane_s16::<2>(store1, item_row2, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); + store1 = vqdmlal_high_lane_s16::<2>(store1, item_row2, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); let v_weight = vld1_s16(weights.as_ptr()); @@ -266,23 +261,23 @@ pub(crate) fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row1, v_weight); let item_row2 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); - store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); - store1 = vmlal_high_lane_s16::<2>(store1, item_row2, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); + store1 = vqdmlal_high_lane_s16::<2>(store1, item_row2, v_weight); let item_row3 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr())); - store0 = vmlal_lane_s16::<3>(store0, vget_low_s16(item_row3), v_weight); - store1 = vmlal_high_lane_s16::<3>(store1, item_row3, v_weight); + store0 = 
vqdmlal_lane_s16::<3>(store0, vget_low_s16(item_row3), v_weight); + store1 = vqdmlal_high_lane_s16::<3>(store1, item_row3, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -329,10 +324,10 @@ pub(crate) fn convolve_column_lb_u16( let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, item_row1, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); let mut v_weight = vld1_dup_s16(weights.as_ptr()); @@ -345,13 +340,13 @@ pub(crate) fn convolve_column_lb_u16( let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, item_row1, v_weight); let item_row2 = vreinterpret_s16_u16(vld1_u16(src_ptr2.as_ptr())); - store0 = vmlal_lane_s16::<2>(store0, item_row2, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, item_row2, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); let v_weight = vld1_s16(weights.as_ptr()); @@ -363,16 +358,16 @@ pub(crate) fn convolve_column_lb_u16( let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, item_row1, v_weight); let item_row2 = vreinterpret_s16_u16(vld1_u16(src_ptr2.as_ptr())); - store0 = vmlal_lane_s16::<2>(store0, item_row2, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, item_row2, v_weight); let item_row3 = vreinterpret_s16_u16(vld1_u16(src_ptr3.as_ptr())); - store0 = vmlal_lane_s16::<3>(store0, item_row3, v_weight); + store0 = vqdmlal_lane_s16::<3>(store0, item_row3, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -415,8 +410,8 @@ pub(crate) fn convolve_column_lb_u16( let offset1 = src_stride * (py + 1) + v_px; let src_ptr1 = src.get_unchecked(offset1..(offset1 + 1)); - store0 += src_ptr0[0] as i32 * weight0 as i32; - store0 += src_ptr1[0] as i32 * weight1 as i32; + store0 += 2 * src_ptr0[0] as i32 * weight0 as i32; + store0 += 2 * src_ptr1[0] as i32 * weight1 as i32; } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); let weight0 = weights[0]; @@ -431,9 +426,9 @@ pub(crate) fn convolve_column_lb_u16( let offset2 = src_stride * (py + 2) + v_px; let src_ptr2 = src.get_unchecked(offset2..(offset2 + 1)); - store0 += src_ptr0[0] as i32 * weight0 as i32; - store0 += src_ptr1[0] as i32 * weight1 as i32; - store0 += src_ptr2[0] as i32 * weight2 as i32; + store0 += 2 * src_ptr0[0] as i32 * weight0 as i32; + store0 += 2 * src_ptr1[0] as i32 * weight1 as i32; 
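
On the `vmlal_*` to `vqdmlal_*` swap in this file and the matching `2 *` factors in the scalar tail around this hunk: SQDMLAL doubles every product it accumulates, so the plain-integer fallback is doubled too, keeping both paths on the same fixed-point scale ahead of the final shift and narrow. A small illustration (the weight scale and shift are illustrative, not the crate's constants):

    // vqdmlal computes acc + 2*a*b widened to i32 (with saturation); the scalar
    // tail mirrors that with an explicit `2 *` so one common shift applies at the end.
    const SHIFT: u32 = 16; // illustrative: weight scale plus the extra doubling bit

    fn vector_like(acc: i32, px: u16, w: i16) -> i32 {
        acc.saturating_add(2 * i32::from(px) * i32::from(w)) // vqdmlal-style
    }

    fn scalar_tail(acc: i32, px: u16, w: i16) -> i32 {
        acc + 2 * i32::from(px) * i32::from(w) // the `store0 += 2 * ...` lines
    }

    fn main() {
        let px: u16 = 1000;
        let w: i16 = 1 << 14; // 0.5 under a hypothetical Q15 weight scale
        assert_eq!(vector_like(0, px, w) >> SHIFT, scalar_tail(0, px, w) >> SHIFT);
        assert_eq!(scalar_tail(0, px, w) >> SHIFT, 500); // 1000 * 0.5
    }
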
+ store0 += 2 * src_ptr2[0] as i32 * weight2 as i32; } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); let weight0 = weights[0]; @@ -451,17 +446,17 @@ pub(crate) fn convolve_column_lb_u16( let offset3 = src_stride * (py + 3) + v_px; let src_ptr3 = src.get_unchecked(offset3..(offset3 + 1)); - store0 += src_ptr0[0] as i32 * weight0 as i32; - store0 += src_ptr1[0] as i32 * weight1 as i32; - store0 += src_ptr2[0] as i32 * weight2 as i32; - store0 += src_ptr3[0] as i32 * weight3 as i32; + store0 += 2 * src_ptr0[0] as i32 * weight0 as i32; + store0 += 2 * src_ptr1[0] as i32 * weight1 as i32; + store0 += 2 * src_ptr2[0] as i32 * weight2 as i32; + store0 += 2 * src_ptr3[0] as i32 * weight3 as i32; } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; let offset = src_stride * py + v_px; let src_ptr = src.get_unchecked(offset..(offset + 1)); - store0 += src_ptr[0] as i32 * k_weight as i32; + store0 += 2 * src_ptr[0] as i32 * k_weight as i32; } } diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index a803116..ce3cc35 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -27,56 +27,71 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::filter_weights::FilterBounds; -use crate::neon::utils::{expand8_to_14, xvld1q_u8_x2, xvld1q_u8_x4, xvst1q_u8_x2, xvst1q_u8_x4}; -use crate::support::{PRECISION, ROUNDING_CONST}; +use crate::neon::utils::{ + vxmlal_high_lane_s16, vxmlal_high_s16, vxmlal_lane_s16, vxmlal_s16, xvld1q_u8_x2, xvld1q_u8_x4, + xvst1q_u8_x2, xvst1q_u8_x4, +}; use std::arch::aarch64::*; -macro_rules! pack_weights { - ($store_0: expr, $store_1: expr, $store_2: expr, $store_3: expr) => {{ - let low_u16 = vcombine_u16( - vqshrun_n_s32::($store_0), - vqshrun_n_s32::($store_1), - ); - let high_u16 = vcombine_u16( - vqshrun_n_s32::($store_2), - vqshrun_n_s32::($store_3), - ); - vcombine_u8(vqmovn_u16(low_u16), vqmovn_u16(high_u16)) - }}; +#[inline(always)] +unsafe fn pack_weights( + store_0: int32x4_t, + store_1: int32x4_t, + store_2: int32x4_t, + store_3: int32x4_t, +) -> uint8x16_t { + let low_u16 = vcombine_u16( + vqshrun_n_s32::(store_0), + vqshrun_n_s32::(store_1), + ); + let high_u16 = vcombine_u16( + vqshrun_n_s32::(store_2), + vqshrun_n_s32::(store_3), + ); + vcombine_u8(vqmovn_u16(low_u16), vqmovn_u16(high_u16)) } -macro_rules! 
accumulate_4_into { - ($item: expr,$store_0: expr, $store_1: expr, $store_2: expr, $store_3: expr, $weight: expr) => {{ - let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8($item))); - let high = vreinterpretq_s16_u16(vmovl_high_u8($item)); - - $store_0 = vmlal_s16($store_0, vget_low_s16(low), vget_low_s16($weight)); - $store_1 = vmlal_high_s16($store_1, low, $weight); - $store_2 = vmlal_s16($store_2, vget_low_s16(high), vget_low_s16($weight)); - $store_3 = vmlal_high_s16($store_3, high, $weight); - }}; +#[must_use] +#[inline(always)] +unsafe fn accumulate_4_into( + item: uint8x16_t, + store_0: int32x4_t, + store_1: int32x4_t, + store_2: int32x4_t, + store_3: int32x4_t, + weight: int16x8_t, +) -> (int32x4_t, int32x4_t, int32x4_t, int32x4_t) { + let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(item))); + let high = vreinterpretq_s16_u16(vmovl_high_u8(item)); + + let store_0 = vxmlal_s16::(store_0, vget_low_s16(low), vget_low_s16(weight)); + let store_1 = vxmlal_high_s16::(store_1, low, weight); + let store_2 = vxmlal_s16::(store_2, vget_low_s16(high), vget_low_s16(weight)); + let store_3 = vxmlal_high_s16::(store_3, high, weight); + (store_0, store_1, store_2, store_3) } -macro_rules! accumulate_4_into_lane { - ($item: expr,$store_0: expr, $store_1: expr, $store_2: expr, $store_3: expr, $weight: expr, $weight_pos: expr) => {{ - let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8($item))); - let high = vreinterpretq_s16_u16(vmovl_high_u8($item)); - - $store_0 = vmlal_lane_s16::<$weight_pos>($store_0, vget_low_s16(low), $weight); - $store_1 = vmlal_high_lane_s16::<$weight_pos>($store_1, low, $weight); - $store_2 = vmlal_lane_s16::<$weight_pos>($store_2, vget_low_s16(high), $weight); - $store_3 = vmlal_high_lane_s16::<$weight_pos>($store_3, high, $weight); - }}; +#[must_use] +#[inline(always)] +unsafe fn accumulate_4_into_lane( + item: uint8x16_t, + store_0: int32x4_t, + store_1: int32x4_t, + store_2: int32x4_t, + store_3: int32x4_t, + weight: int16x4_t, +) -> (int32x4_t, int32x4_t, int32x4_t, int32x4_t) { + let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(item))); + let high = vreinterpretq_s16_u16(vmovl_high_u8(item)); + + let store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low), weight); + let store_1 = vxmlal_high_lane_s16::(store_1, low, weight); + let store_2 = vxmlal_lane_s16::(store_2, vget_low_s16(high), weight); + let store_3 = vxmlal_high_lane_s16::(store_3, high, weight); + (store_0, store_1, store_2, store_3) } -/// Checking NEON `rdm` availability is required before a call. -/// -/// RDM feature has slightly lower precision and won't work really well on huge kernel which -/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. -/// -/// # Safety -/// - Check `rdm` availability before the call. 
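
Before the removed `rdm` row variant below, a note on the `pack_weights` helper introduced above: the four i32 accumulators are brought back to bytes with a saturating shift-right-and-narrow into u16 (`vqshrun_n_s32`) followed by a saturating narrow to u8 (`vqmovn_u16`); the macro it replaces did the same computation. A scalar model of the two-stage clamp (the shift amount below is illustrative; the const parameter the real helper receives is not visible in this rendering):

    // Scalar model of one lane of pack_weights: shift out the fractional bits,
    // clamp into u16 (vqshrun_n_s32), then clamp into u8 (vqmovn_u16).
    fn pack_one<const PRECISION: i32>(acc: i32) -> u8 {
        let shifted = acc >> PRECISION;
        let as_u16 = shifted.clamp(0, u16::MAX as i32) as u16;
        as_u16.min(u8::MAX as u16) as u8
    }

    fn main() {
        assert_eq!(pack_one::<15>(200 << 15), 200);
        assert_eq!(pack_one::<15>(-5), 0);          // negative ringing clamps to 0
        assert_eq!(pack_one::<15>(300 << 15), 255); // overshoot clamps to 255
    }
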
-pub(crate) fn convolve_vertical_neon_i16_precision( +pub(crate) fn convolve_vertical_neon_i32_precision( width: usize, bounds: &FilterBounds, src: &[u8], @@ -84,12 +99,12 @@ pub(crate) fn convolve_vertical_neon_i16_precision( src_stride: usize, weight: &[i16], ) { - unsafe { - convolve_vertical_neon_row_upper(width, bounds, src, dst, src_stride, weight); - } + convolve_vertical_neon_row_full::( + width, bounds, src, dst, src_stride, weight, + ); } -pub(crate) fn convolve_vertical_neon_i32_precision( +pub(crate) fn convolve_vertical_neon_i32_precision_d( width: usize, bounds: &FilterBounds, src: &[u8], @@ -97,553 +112,10 @@ pub(crate) fn convolve_vertical_neon_i32_precision( src_stride: usize, weight: &[i16], ) { - convolve_vertical_neon_row_full(width, bounds, src, dst, src_stride, weight); + convolve_vertical_neon_row_full::(width, bounds, src, dst, src_stride, weight); } -#[must_use] -#[inline(always)] -unsafe fn vdot( - store0: int16x8_t, - store1: int16x8_t, - row: uint8x16_t, - weight: int16x8_t, -) -> (int16x8_t, int16x8_t) { - let lo0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip1q_u8(row, row)))); - let store0 = vqrdmlahq_s16(store0, lo0, weight); - let hi0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip2q_u8(row, row)))); - let store1 = vqrdmlahq_s16(store1, hi0, weight); - (store0, store1) -} - -#[must_use] -#[inline(always)] -unsafe fn vdot_lane( - store0: int16x8_t, - store1: int16x8_t, - row: uint8x16_t, - weight: int16x4_t, -) -> (int16x8_t, int16x8_t) { - let lo0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip1q_u8(row, row)))); - let store0 = vqrdmlahq_lane_s16::(store0, lo0, weight); - let hi0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip2q_u8(row, row)))); - let store1 = vqrdmlahq_lane_s16::(store1, hi0, weight); - (store0, store1) -} - -#[target_feature(enable = "rdm")] -unsafe fn convolve_vertical_neon_row_upper( - _: usize, - bounds: &FilterBounds, - src: &[u8], - dst: &mut [u8], - src_stride: usize, - weight: &[i16], -) { - let mut cx = 0usize; - - let iter_64 = dst.chunks_exact_mut(64); - - let bounds_size = bounds.size; - const SCALE: i32 = 6; - const R_SHR_SCALE: i32 = SCALE; - const ROUNDING: i16 = 1 << (SCALE - 1); - - for dst in iter_64 { - let vld = vdupq_n_s16(ROUNDING); - - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let mut store_4 = vld; - let mut store_5 = vld; - let mut store_6 = vld; - let mut store_7 = vld; - - let px = cx; - - if bounds_size == 2 { - let py = bounds.start; - let weight = weight.get_unchecked(0..2); - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - - let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); - } else 
if bounds_size == 3 { - let py = bounds.start; - let weight = weight.get_unchecked(0..3); - let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - - let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); - - let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); - } else if bounds_size == 4 { - let py = bounds.start; - let weight = weight.get_unchecked(0..4); - let v_weight = vld1_s16(weight.as_ptr()); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - - let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - - let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); - - let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); - - let items3 = xvld1q_u8_x4(src_ptr3.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items3.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items3.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items3.3, v_weight); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..); - let v_weight = vld1q_dup_s16(weight.as_ptr()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = xvld1q_u8_x4(src_ptr.as_ptr()); 
- - (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); - (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); - (store_4, store_5) = vdot::(store_4, store_5, items.2, v_weight); - (store_6, store_7) = vdot::(store_6, store_7, items.3, v_weight); - } - } - - let item00 = vqshrun_n_s16::(store_0); - let item01 = vqshrun_n_s16::(store_1); - let item10 = vqshrun_n_s16::(store_2); - let item11 = vqshrun_n_s16::(store_3); - let item20 = vqshrun_n_s16::(store_4); - let item21 = vqshrun_n_s16::(store_5); - let item30 = vqshrun_n_s16::(store_6); - let item31 = vqshrun_n_s16::(store_7); - let item0 = vcombine_u8(item00, item01); - let item1 = vcombine_u8(item10, item11); - let item2 = vcombine_u8(item20, item21); - let item3 = vcombine_u8(item30, item31); - - let dst_items = uint8x16x4_t(item0, item1, item2, item3); - xvst1q_u8_x4(dst.as_mut_ptr(), dst_items); - - cx += 64; - } - - let mut rem = dst.chunks_exact_mut(64).into_remainder(); - let iter_32 = rem.chunks_exact_mut(32); - - for dst in iter_32 { - let vld = vdupq_n_s16(ROUNDING); - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let px = cx; - - if bounds_size == 2 { - let py = bounds.start; - let weight = weight.get_unchecked(0..2); - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - - let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - } else if bounds_size == 3 { - let py = bounds.start; - let weight = weight.get_unchecked(0..3); - let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - - let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - - let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); - } else if bounds_size == 4 { - let py = bounds.start; - let weight = weight.get_unchecked(0..4); - let v_weight = vld1_s16(weight.as_ptr()); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - - let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, 
v_weight); - - let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - - let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); - - let items3 = xvld1q_u8_x2(src_ptr3.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items3.1, v_weight); - } else { - for j in 0..bounds.size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..); - let v_weight = vld1q_dup_s16(weight.as_ptr()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = xvld1q_u8_x2(src_ptr.as_ptr()); - - (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); - (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); - } - } - - let item00 = vqshrun_n_s16::(store_0); - let item01 = vqshrun_n_s16::(store_1); - let item10 = vqshrun_n_s16::(store_2); - let item11 = vqshrun_n_s16::(store_3); - let item0 = vcombine_u8(item00, item01); - let item1 = vcombine_u8(item10, item11); - - let dst_items = uint8x16x2_t(item0, item1); - xvst1q_u8_x2(dst.as_mut_ptr(), dst_items); - - cx += 32; - } - - rem = rem.chunks_exact_mut(32).into_remainder(); - let iter_16 = rem.chunks_exact_mut(16); - - for dst in iter_16 { - let vld = vdupq_n_s16(ROUNDING); - let mut store_0 = vld; - let mut store_1 = vld; - - let px = cx; - - if bounds_size == 2 { - let py = bounds.start; - let weight = weight.get_unchecked(0..2); - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let item0 = vld1q_u8(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); - - let item1 = vld1q_u8(src_ptr1.as_ptr()); - (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); - } else if bounds_size == 3 { - let py = bounds.start; - let weight = weight.get_unchecked(0..3); - let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let item0 = vld1q_u8(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); - - let item1 = vld1q_u8(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); - - let item2 = vld1q_u8(src_ptr2.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); - } else if bounds_size == 4 { - let py = bounds.start; - let weight = weight.get_unchecked(0..4); - let v_weight = vld1_s16(weight.as_ptr()); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - - let item0 = vld1q_u8(src_ptr0.as_ptr()); - (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); - - let item1 = vld1q_u8(src_ptr1.as_ptr()); - (store_0, store_1) = 
vdot_lane::(store_0, store_1, item1, v_weight); - - let item2 = vld1q_u8(src_ptr2.as_ptr()); - (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); - - let item3 = vld1q_u8(src_ptr3.as_ptr()); - (store_0, store_1) = vdot_lane::(store_0, store_1, item3, v_weight); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..); - let v_weight = vld1q_dup_s16(weight.as_ptr()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row = vld1q_u8(src_ptr.as_ptr()); - - (store_0, store_1) = vdot::(store_0, store_1, item_row, v_weight); - } - } - - let item0 = vqshrun_n_s16::(store_0); - let item1 = vqshrun_n_s16::(store_1); - - vst1q_u8(dst.as_mut_ptr(), vcombine_u8(item0, item1)); - - cx += 16; - } - - rem = rem.chunks_exact_mut(16).into_remainder(); - let iter_8 = rem.chunks_exact_mut(8); - - for dst in iter_8 { - let vld = vdupq_n_s16(ROUNDING); - let mut store_0 = vld; - - let px = cx; - - if bounds_size == 2 { - let py = bounds.start; - let weight = weight.get_unchecked(0..2); - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let item0 = vld1_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(item0); - store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); - - let item1 = vld1_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(item1); - store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); - } else if bounds_size == 3 { - let py = bounds.start; - let weight = weight.get_unchecked(0..3); - let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let item0 = vld1_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(item0); - store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); - - let item1 = vld1_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(item1); - store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); - - let item2 = vld1_u8(src_ptr2.as_ptr()); - let low2 = expand8_to_14(item2); - store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); - } else if bounds_size == 4 { - let py = bounds.start; - let weight = weight.get_unchecked(0..4); - let v_weight = vld1_s16(weight.as_ptr()); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - - let item0 = vld1_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(item0); - store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); - - let item1 = vld1_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(item1); - store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); - - let item2 = vld1_u8(src_ptr2.as_ptr()); - let low2 = expand8_to_14(item2); - store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); - - let item3 = vld1_u8(src_ptr3.as_ptr()); - let low3 = expand8_to_14(item3); - store_0 = vqrdmlahq_lane_s16::<3>(store_0, low3, v_weight); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..); - let v_weight = 
vld1q_dup_s16(weight.as_ptr()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row = vld1_u8(src_ptr.as_ptr()); - - let low = expand8_to_14(item_row); - store_0 = vqrdmlahq_s16(store_0, low, v_weight); - } - } - - let item = vqshrun_n_s16::(store_0); - vst1_u8(dst.as_mut_ptr(), item); - - cx += 8; - } - - rem = rem.chunks_exact_mut(8).into_remainder(); - let iter_1 = rem.iter_mut(); - - for dst in iter_1 { - let vld = vdupq_n_s16(ROUNDING); - let mut store = vld; - - let px = cx; - - if bounds_size == 2 { - let py = bounds.start; - let weight = weight.get_unchecked(0..2); - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let items0 = vld1_dup_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(items0); - store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); - - let items1 = vld1_dup_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(items1); - store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); - } else if bounds_size == 3 { - let py = bounds.start; - let weight = weight.get_unchecked(0..3); - let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let items0 = vld1_dup_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(items0); - store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); - - let items1 = vld1_dup_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(items1); - store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); - - let items2 = vld1_dup_u8(src_ptr2.as_ptr()); - let low2 = expand8_to_14(items2); - store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); - } else if bounds_size == 4 { - let py = bounds.start; - let weight = weight.get_unchecked(0..4); - let v_weight = vld1_s16(weight.as_ptr()); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - - let items0 = vld1_dup_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(items0); - store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); - - let items1 = vld1_dup_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(items1); - store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); - - let items2 = vld1_dup_u8(src_ptr2.as_ptr()); - let low2 = expand8_to_14(items2); - store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); - - let items3 = vld1_dup_u8(src_ptr3.as_ptr()); - let low3 = expand8_to_14(items3); - store = vqrdmlahq_lane_s16::<3>(store, low3, v_weight); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..); - let v_weight = vld1q_dup_s16(weight.as_ptr()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row = vld1_dup_u8(src_ptr.as_ptr()); - - let low = expand8_to_14(item_row); - store = vqrdmlahq_s16(store, low, v_weight); - } - } - - let shrinked_store = vqshrun_n_s16::(store); - let value = vget_lane_u8::<0>(shrinked_store); - *dst = value; - cx += 1; - } -} - -fn convolve_vertical_neon_row_full( +fn convolve_vertical_neon_row_full( _: usize, bounds: 
&FilterBounds, src: &[u8], @@ -652,6 +124,7 @@ fn convolve_vertical_neon_row_full( weight: &[i16], ) { let mut cx = 0usize; + let rnd_const: i32 = (1 << (PRECISION - 1)) - 1; unsafe { let iter_64 = dst.chunks_exact_mut(64); @@ -659,7 +132,7 @@ fn convolve_vertical_neon_row_full( let bounds_size = bounds.size; for dst in iter_64 { - let vld = vdupq_n_s32(ROUNDING_CONST); + let vld = vdupq_n_s32(rnd_const); let mut store_0 = vld; let mut store_1 = vld; let mut store_2 = vld; @@ -691,24 +164,32 @@ fn convolve_vertical_neon_row_full( let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - accumulate_4_into_lane!( - items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, ); - accumulate_4_into_lane!( - items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items0.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items0.3, store_12, store_13, store_14, store_15, v_weight, ); let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); - accumulate_4_into_lane!( - items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, ); - accumulate_4_into_lane!( - items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items1.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items1.3, store_12, store_13, store_14, store_15, v_weight, ); } else if bounds_size == 3 { let py = bounds.start; @@ -722,35 +203,47 @@ fn convolve_vertical_neon_row_full( let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - accumulate_4_into_lane!( - items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, ); - accumulate_4_into_lane!( - items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items0.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items0.3, store_12, store_13, store_14, store_15, v_weight, ); let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 
1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); - accumulate_4_into_lane!( - items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, ); - accumulate_4_into_lane!( - items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items1.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items1.3, store_12, store_13, store_14, store_15, v_weight, ); let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); - accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); - accumulate_4_into_lane!( - items2.2, store_8, store_9, store_10, store_11, v_weight, 2 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items2.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items2.1, store_4, store_5, store_6, store_7, v_weight, ); - accumulate_4_into_lane!( - items2.3, store_12, store_13, store_14, store_15, v_weight, 2 + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items2.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items2.3, store_12, store_13, store_14, store_15, v_weight, ); } else if bounds_size == 4 { let py = bounds.start; @@ -763,46 +256,62 @@ fn convolve_vertical_neon_row_full( let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - accumulate_4_into_lane!( - items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items0.2, store_8, store_9, store_10, store_11, v_weight, ); - accumulate_4_into_lane!( - items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items0.3, store_12, store_13, store_14, store_15, v_weight, ); let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); - accumulate_4_into_lane!( - items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items1.2, store_8, store_9, store_10, store_11, v_weight, ); - accumulate_4_into_lane!( - items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + 
(store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items1.3, store_12, store_13, store_14, store_15, v_weight, ); let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); - accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); - accumulate_4_into_lane!( - items2.2, store_8, store_9, store_10, store_11, v_weight, 2 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items2.0, store_0, store_1, store_2, store_3, v_weight, ); - accumulate_4_into_lane!( - items2.3, store_12, store_13, store_14, store_15, v_weight, 2 + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items2.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items2.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items2.3, store_12, store_13, store_14, store_15, v_weight, ); let items3 = xvld1q_u8_x4(src_ptr3.as_ptr()); - accumulate_4_into_lane!(items3.0, store_0, store_1, store_2, store_3, v_weight, 3); - accumulate_4_into_lane!(items3.1, store_4, store_5, store_6, store_7, v_weight, 3); - accumulate_4_into_lane!( - items3.2, store_8, store_9, store_10, store_11, v_weight, 3 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items3.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items3.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items3.2, store_8, store_9, store_10, store_11, v_weight, ); - accumulate_4_into_lane!( - items3.3, store_12, store_13, store_14, store_15, v_weight, 3 + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items3.3, store_12, store_13, store_14, store_15, v_weight, ); } else { for j in 0..bounds_size { @@ -812,17 +321,25 @@ fn convolve_vertical_neon_row_full( let src_ptr = src.get_unchecked((src_stride * py + px)..); let items = xvld1q_u8_x4(src_ptr.as_ptr()); - accumulate_4_into!(items.0, store_0, store_1, store_2, store_3, v_weight); - accumulate_4_into!(items.1, store_4, store_5, store_6, store_7, v_weight); - accumulate_4_into!(items.2, store_8, store_9, store_10, store_11, v_weight); - accumulate_4_into!(items.3, store_12, store_13, store_14, store_15, v_weight); + (store_0, store_1, store_2, store_3) = accumulate_4_into::( + items.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into::( + items.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into::( + items.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into::( + items.3, store_12, store_13, store_14, store_15, v_weight, + ); } } - let item_0 = pack_weights!(store_0, store_1, store_2, store_3); - let item_1 = pack_weights!(store_4, store_5, store_6, store_7); - let item_2 = pack_weights!(store_8, store_9, store_10, store_11); - let item_3 = pack_weights!(store_12, store_13, store_14, store_15); + let item_0 = pack_weights::(store_0, store_1, store_2, store_3); + let item_1 = pack_weights::(store_4, store_5, store_6, store_7); + let item_2 = pack_weights::(store_8, store_9, store_10, store_11); + let item_3 = pack_weights::(store_12, store_13, store_14, store_15); 
let dst_items = uint8x16x4_t(item_0, item_1, item_2, item_3); xvst1q_u8_x4(dst.as_mut_ptr(), dst_items); @@ -834,7 +351,7 @@ fn convolve_vertical_neon_row_full( let iter_32 = rem.chunks_exact_mut(32); for dst in iter_32 { - let vld = vdupq_n_s32(ROUNDING_CONST); + let vld = vdupq_n_s32(rnd_const); let mut store_0 = vld; let mut store_1 = vld; let mut store_2 = vld; @@ -854,13 +371,21 @@ fn convolve_vertical_neon_row_full( let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, + ); let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, + ); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); @@ -872,18 +397,30 @@ fn convolve_vertical_neon_row_full( let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, + ); let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, + ); let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); - accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items2.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items2.1, store_4, store_5, store_6, store_7, v_weight, + ); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); @@ -894,23 +431,39 @@ fn convolve_vertical_neon_row_full( let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + (store_0, store_1, 
store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, + ); let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, + ); let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); - accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items2.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items2.1, store_4, store_5, store_6, store_7, v_weight, + ); let items3 = xvld1q_u8_x2(src_ptr3.as_ptr()); - accumulate_4_into_lane!(items3.0, store_0, store_1, store_2, store_3, v_weight, 3); - accumulate_4_into_lane!(items3.1, store_4, store_5, store_6, store_7, v_weight, 3); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items3.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items3.1, store_4, store_5, store_6, store_7, v_weight, + ); } else { for j in 0..bounds.size { let py = bounds.start + j; @@ -919,13 +472,17 @@ fn convolve_vertical_neon_row_full( let src_ptr = src.get_unchecked((src_stride * py + px)..); let items = xvld1q_u8_x2(src_ptr.as_ptr()); - accumulate_4_into!(items.0, store_0, store_1, store_2, store_3, v_weight); - accumulate_4_into!(items.1, store_4, store_5, store_6, store_7, v_weight); + (store_0, store_1, store_2, store_3) = accumulate_4_into::( + items.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into::( + items.1, store_4, store_5, store_6, store_7, v_weight, + ); } } - let item_0 = pack_weights!(store_0, store_1, store_2, store_3); - let item_1 = pack_weights!(store_4, store_5, store_6, store_7); + let item_0 = pack_weights::(store_0, store_1, store_2, store_3); + let item_1 = pack_weights::(store_4, store_5, store_6, store_7); let dst_items = uint8x16x2_t(item_0, item_1); xvst1q_u8_x2(dst.as_mut_ptr(), dst_items); @@ -937,7 +494,7 @@ fn convolve_vertical_neon_row_full( let iter_16 = rem.chunks_exact_mut(16); for dst in iter_16 { - let vld = vdupq_n_s32(ROUNDING_CONST); + let vld = vdupq_n_s32(rnd_const); let mut store_0 = vld; let mut store_1 = vld; let mut store_2 = vld; @@ -953,8 +510,12 @@ fn convolve_vertical_neon_row_full( let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = vld1q_u8(src_ptr0.as_ptr()); let item_row1 = vld1q_u8(src_ptr1.as_ptr()); - accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row1, store_0, store_1, store_2, store_3, 
v_weight, + ); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); @@ -967,9 +528,15 @@ fn convolve_vertical_neon_row_full( let item_row0 = vld1q_u8(src_ptr0.as_ptr()); let item_row1 = vld1q_u8(src_ptr1.as_ptr()); let item_row2 = vld1q_u8(src_ptr2.as_ptr()); - accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(item_row2, store_0, store_1, store_2, store_3, v_weight, 2); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row1, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row2, store_0, store_1, store_2, store_3, v_weight, + ); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); @@ -982,10 +549,18 @@ fn convolve_vertical_neon_row_full( let item_row1 = vld1q_u8(src_ptr1.as_ptr()); let item_row2 = vld1q_u8(src_ptr2.as_ptr()); let item_row3 = vld1q_u8(src_ptr3.as_ptr()); - accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(item_row2, store_0, store_1, store_2, store_3, v_weight, 2); - accumulate_4_into_lane!(item_row3, store_0, store_1, store_2, store_3, v_weight, 3); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row1, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row2, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row3, store_0, store_1, store_2, store_3, v_weight, + ); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -993,11 +568,13 @@ fn convolve_vertical_neon_row_full( let v_weight = vld1q_dup_s16(weight.as_ptr()); let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row = vld1q_u8(src_ptr.as_ptr()); - accumulate_4_into!(item_row, store_0, store_1, store_2, store_3, v_weight); + (store_0, store_1, store_2, store_3) = accumulate_4_into::( + item_row, store_0, store_1, store_2, store_3, v_weight, + ); } } - let item = pack_weights!(store_0, store_1, store_2, store_3); + let item = pack_weights::(store_0, store_1, store_2, store_3); vst1q_u8(dst.as_mut_ptr(), item); @@ -1008,7 +585,7 @@ fn convolve_vertical_neon_row_full( let iter_8 = rem.chunks_exact_mut(8); for dst in iter_8 { - let vld = vdupq_n_s32(ROUNDING_CONST); + let vld = vdupq_n_s32(rnd_const); let mut store_0 = vld; let mut store_1 = vld; @@ -1025,10 +602,10 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); - store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight); - store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight); - store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight); - store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low0), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low0, 
v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low1), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); @@ -1045,12 +622,12 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); - store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight); - store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight); - store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight); - store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight); - store_0 = vmlal_lane_s16::<2>(store_0, vget_low_s16(low2), v_weight); - store_1 = vmlal_high_lane_s16::<3>(store_1, low2, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low0), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low0, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low1), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low1, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low2), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low2, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); @@ -1068,14 +645,14 @@ fn convolve_vertical_neon_row_full( let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); let low3 = vreinterpretq_s16_u16(vmovl_u8(item_row3)); - store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight); - store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight); - store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight); - store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight); - store_0 = vmlal_lane_s16::<2>(store_0, vget_low_s16(low2), v_weight); - store_1 = vmlal_high_lane_s16::<2>(store_1, low2, v_weight); - store_0 = vmlal_lane_s16::<3>(store_0, vget_low_s16(low3), v_weight); - store_1 = vmlal_high_lane_s16::<3>(store_1, low3, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low0), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low0, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low1), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low1, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low2), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low2, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low3), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -1085,8 +662,8 @@ fn convolve_vertical_neon_row_full( let item_row = vld1_u8(src_ptr.as_ptr()); let low = vreinterpretq_s16_u16(vmovl_u8(item_row)); - store_0 = vmlal_s16(store_0, vget_low_s16(low), vget_low_s16(v_weight)); - store_1 = vmlal_high_s16(store_1, low, v_weight); + store_0 = vxmlal_s16::(store_0, vget_low_s16(low), vget_low_s16(v_weight)); + store_1 = vxmlal_high_s16::(store_1, low, v_weight); } } @@ -1106,7 +683,7 @@ fn convolve_vertical_neon_row_full( let iter_1 = rem.iter_mut(); for dst in iter_1 { - let vld = vdupq_n_s32(ROUNDING_CONST); + let vld = vdupq_n_s32(rnd_const); let mut store = vld; let px = cx; @@ -1122,8 +699,8 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); - store = vmlal_lane_s16::<0>(store, 
vget_low_s16(low0), v_weight); - store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low0), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low1), v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); @@ -1140,9 +717,9 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); - store = vmlal_lane_s16::<0>(store, vget_low_s16(low0), v_weight); - store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight); - store = vmlal_lane_s16::<2>(store, vget_low_s16(low2), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low0), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low1), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low2), v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); @@ -1160,10 +737,10 @@ fn convolve_vertical_neon_row_full( let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); let low3 = vreinterpretq_s16_u16(vmovl_u8(item_row3)); - store = vmlal_lane_s16::<0>(store, vget_low_s16(low0), v_weight); - store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight); - store = vmlal_lane_s16::<2>(store, vget_low_s16(low2), v_weight); - store = vmlal_lane_s16::<3>(store, vget_low_s16(low3), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low0), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low1), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low2), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low3), v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -1173,7 +750,7 @@ fn convolve_vertical_neon_row_full( let item_row = vld1_dup_u8(src_ptr.as_ptr()); let low = vreinterpretq_s16_u16(vmovl_u8(item_row)); - store = vmlal_s16(store, vget_low_s16(low), vget_low_s16(v_weight)); + store = vxmlal_s16::(store, vget_low_s16(low), vget_low_s16(v_weight)); } } diff --git a/src/neon/vertical_u8_rdm.rs b/src/neon/vertical_u8_rdm.rs new file mode 100644 index 0000000..aeabdc3 --- /dev/null +++ b/src/neon/vertical_u8_rdm.rs @@ -0,0 +1,594 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::filter_weights::FilterBounds; +use crate::neon::utils::{expand8_to_14, xvld1q_u8_x2, xvld1q_u8_x4, xvst1q_u8_x2, xvst1q_u8_x4}; +use std::arch::aarch64::*; + +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. +pub(crate) fn convolve_vertical_neon_i16_precision( + width: usize, + bounds: &FilterBounds, + src: &[u8], + dst: &mut [u8], + src_stride: usize, + weight: &[i16], +) { + unsafe { + convolve_vertical_neon_row_upper(width, bounds, src, dst, src_stride, weight); + } +} + +#[must_use] +#[inline(always)] +unsafe fn vdot( + store0: int16x8_t, + store1: int16x8_t, + row: uint8x16_t, + weight: int16x8_t, +) -> (int16x8_t, int16x8_t) { + let lo0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip1q_u8(row, row)))); + let store0 = vqrdmlahq_s16(store0, lo0, weight); + let hi0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip2q_u8(row, row)))); + let store1 = vqrdmlahq_s16(store1, hi0, weight); + (store0, store1) +} + +#[must_use] +#[inline(always)] +unsafe fn vdot_lane( + store0: int16x8_t, + store1: int16x8_t, + row: uint8x16_t, + weight: int16x4_t, +) -> (int16x8_t, int16x8_t) { + let lo0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip1q_u8(row, row)))); + let store0 = vqrdmlahq_lane_s16::(store0, lo0, weight); + let hi0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip2q_u8(row, row)))); + let store1 = vqrdmlahq_lane_s16::(store1, hi0, weight); + (store0, store1) +} + +#[target_feature(enable = "rdm")] +unsafe fn convolve_vertical_neon_row_upper( + _: usize, + bounds: &FilterBounds, + src: &[u8], + dst: &mut [u8], + src_stride: usize, + weight: &[i16], +) { + let mut cx = 0usize; + + let iter_64 = dst.chunks_exact_mut(64); + + let bounds_size = bounds.size; + const SCALE: i32 = 6; + const R_SHR_SCALE: i32 = SCALE; + const ROUNDING: i16 = 1 << (SCALE - 1); + + for dst in iter_64 { + let vld = vdupq_n_s16(ROUNDING); + + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let mut store_4 = vld; + let mut store_5 = vld; + let mut store_6 = vld; + let mut store_7 = vld; + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weight = weight.get_unchecked(0..2); + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, 
v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); + } else if bounds_size == 3 { + let py = bounds.start; + let weight = weight.get_unchecked(0..3); + let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); + + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); + } else if bounds_size == 4 { + let py = bounds.start; + let weight = weight.get_unchecked(0..4); + let v_weight = vld1_s16(weight.as_ptr()); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); + + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); + + let items3 = xvld1q_u8_x4(src_ptr3.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, 
store_3, items3.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items3.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items3.3, v_weight); + } else { + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..); + let v_weight = vld1q_dup_s16(weight.as_ptr()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let items = xvld1q_u8_x4(src_ptr.as_ptr()); + + (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); + (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); + (store_4, store_5) = vdot::(store_4, store_5, items.2, v_weight); + (store_6, store_7) = vdot::(store_6, store_7, items.3, v_weight); + } + } + + let item00 = vqshrun_n_s16::(store_0); + let item01 = vqshrun_n_s16::(store_1); + let item10 = vqshrun_n_s16::(store_2); + let item11 = vqshrun_n_s16::(store_3); + let item20 = vqshrun_n_s16::(store_4); + let item21 = vqshrun_n_s16::(store_5); + let item30 = vqshrun_n_s16::(store_6); + let item31 = vqshrun_n_s16::(store_7); + let item0 = vcombine_u8(item00, item01); + let item1 = vcombine_u8(item10, item11); + let item2 = vcombine_u8(item20, item21); + let item3 = vcombine_u8(item30, item31); + + let dst_items = uint8x16x4_t(item0, item1, item2, item3); + xvst1q_u8_x4(dst.as_mut_ptr(), dst_items); + + cx += 64; + } + + let mut rem = dst.chunks_exact_mut(64).into_remainder(); + let iter_32 = rem.chunks_exact_mut(32); + + for dst in iter_32 { + let vld = vdupq_n_s16(ROUNDING); + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weight = weight.get_unchecked(0..2); + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + } else if bounds_size == 3 { + let py = bounds.start; + let weight = weight.get_unchecked(0..3); + let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + } else if bounds_size == 4 { + let py = bounds.start; + let weight = weight.get_unchecked(0..4); + let v_weight = vld1_s16(weight.as_ptr()); + let src_ptr0 = 
src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + + let items3 = xvld1q_u8_x2(src_ptr3.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items3.1, v_weight); + } else { + for j in 0..bounds.size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..); + let v_weight = vld1q_dup_s16(weight.as_ptr()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let items = xvld1q_u8_x2(src_ptr.as_ptr()); + + (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); + (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); + } + } + + let item00 = vqshrun_n_s16::(store_0); + let item01 = vqshrun_n_s16::(store_1); + let item10 = vqshrun_n_s16::(store_2); + let item11 = vqshrun_n_s16::(store_3); + let item0 = vcombine_u8(item00, item01); + let item1 = vcombine_u8(item10, item11); + + let dst_items = uint8x16x2_t(item0, item1); + xvst1q_u8_x2(dst.as_mut_ptr(), dst_items); + + cx += 32; + } + + rem = rem.chunks_exact_mut(32).into_remainder(); + let iter_16 = rem.chunks_exact_mut(16); + + for dst in iter_16 { + let vld = vdupq_n_s16(ROUNDING); + let mut store_0 = vld; + let mut store_1 = vld; + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weight = weight.get_unchecked(0..2); + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + + let item0 = vld1q_u8(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); + + let item1 = vld1q_u8(src_ptr1.as_ptr()); + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); + } else if bounds_size == 3 { + let py = bounds.start; + let weight = weight.get_unchecked(0..3); + let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + + let item0 = vld1q_u8(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); + + let item1 = vld1q_u8(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); + + let item2 = vld1q_u8(src_ptr2.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); + } else if bounds_size == 4 { + let py = bounds.start; + let weight = weight.get_unchecked(0..4); + let v_weight = vld1_s16(weight.as_ptr()); + let src_ptr0 = 
src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + + let item0 = vld1q_u8(src_ptr0.as_ptr()); + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); + + let item1 = vld1q_u8(src_ptr1.as_ptr()); + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); + + let item2 = vld1q_u8(src_ptr2.as_ptr()); + (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); + + let item3 = vld1q_u8(src_ptr3.as_ptr()); + (store_0, store_1) = vdot_lane::(store_0, store_1, item3, v_weight); + } else { + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..); + let v_weight = vld1q_dup_s16(weight.as_ptr()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let item_row = vld1q_u8(src_ptr.as_ptr()); + + (store_0, store_1) = vdot::(store_0, store_1, item_row, v_weight); + } + } + + let item0 = vqshrun_n_s16::(store_0); + let item1 = vqshrun_n_s16::(store_1); + + vst1q_u8(dst.as_mut_ptr(), vcombine_u8(item0, item1)); + + cx += 16; + } + + rem = rem.chunks_exact_mut(16).into_remainder(); + let iter_8 = rem.chunks_exact_mut(8); + + for dst in iter_8 { + let vld = vdupq_n_s16(ROUNDING); + let mut store_0 = vld; + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weight = weight.get_unchecked(0..2); + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + + let item0 = vld1_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(item0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); + + let item1 = vld1_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(item1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); + } else if bounds_size == 3 { + let py = bounds.start; + let weight = weight.get_unchecked(0..3); + let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + + let item0 = vld1_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(item0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); + + let item1 = vld1_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(item1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); + + let item2 = vld1_u8(src_ptr2.as_ptr()); + let low2 = expand8_to_14(item2); + store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); + } else if bounds_size == 4 { + let py = bounds.start; + let weight = weight.get_unchecked(0..4); + let v_weight = vld1_s16(weight.as_ptr()); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + + let item0 = vld1_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(item0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); + + let item1 = vld1_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(item1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, 
v_weight); + + let item2 = vld1_u8(src_ptr2.as_ptr()); + let low2 = expand8_to_14(item2); + store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); + + let item3 = vld1_u8(src_ptr3.as_ptr()); + let low3 = expand8_to_14(item3); + store_0 = vqrdmlahq_lane_s16::<3>(store_0, low3, v_weight); + } else { + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..); + let v_weight = vld1q_dup_s16(weight.as_ptr()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let item_row = vld1_u8(src_ptr.as_ptr()); + + let low = expand8_to_14(item_row); + store_0 = vqrdmlahq_s16(store_0, low, v_weight); + } + } + + let item = vqshrun_n_s16::(store_0); + vst1_u8(dst.as_mut_ptr(), item); + + cx += 8; + } + + rem = rem.chunks_exact_mut(8).into_remainder(); + let iter_1 = rem.iter_mut(); + + for dst in iter_1 { + let vld = vdupq_n_s16(ROUNDING); + let mut store = vld; + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weight = weight.get_unchecked(0..2); + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + + let items0 = vld1_dup_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(items0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); + + let items1 = vld1_dup_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(items1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); + } else if bounds_size == 3 { + let py = bounds.start; + let weight = weight.get_unchecked(0..3); + let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + + let items0 = vld1_dup_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(items0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); + + let items1 = vld1_dup_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(items1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); + + let items2 = vld1_dup_u8(src_ptr2.as_ptr()); + let low2 = expand8_to_14(items2); + store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); + } else if bounds_size == 4 { + let py = bounds.start; + let weight = weight.get_unchecked(0..4); + let v_weight = vld1_s16(weight.as_ptr()); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + + let items0 = vld1_dup_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(items0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); + + let items1 = vld1_dup_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(items1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); + + let items2 = vld1_dup_u8(src_ptr2.as_ptr()); + let low2 = expand8_to_14(items2); + store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); + + let items3 = vld1_dup_u8(src_ptr3.as_ptr()); + let low3 = expand8_to_14(items3); + store = vqrdmlahq_lane_s16::<3>(store, low3, v_weight); + } else { + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..); + let v_weight = vld1q_dup_s16(weight.as_ptr()); + let 
src_ptr = src.get_unchecked((src_stride * py + px)..); + let item_row = vld1_dup_u8(src_ptr.as_ptr()); + + let low = expand8_to_14(item_row); + store = vqrdmlahq_s16(store, low, v_weight); + } + } + + let shrinked_store = vqshrun_n_s16::(store); + let value = vget_lane_u8::<0>(shrinked_store); + *dst = value; + cx += 1; + } +} diff --git a/src/neon/weights.rs b/src/neon/weights.rs index 2f7e390..dbae834 100644 --- a/src/neon/weights.rs +++ b/src/neon/weights.rs @@ -32,12 +32,12 @@ use crate::neon::{xreinterpret_u16_f16, xreinterpretq_u16_f16}; use std::arch::aarch64::*; pub(crate) fn convert_weights_to_f16(weights: &[f32]) -> Vec { - unsafe { convert_weights_to_f16_impl(weights) } + unsafe { convert_weights_to_f16_impl::(weights) } } #[target_feature(enable = "fp16")] -unsafe fn convert_weights_to_f16_impl(weights: &[f32]) -> Vec { - let mut new_weights = vec![0i16; weights.len()]; +unsafe fn convert_weights_to_f16_impl(weights: &[f32]) -> Vec { + let mut new_weights = vec![J::default(); weights.len()]; for (dst, src) in new_weights.chunks_exact_mut(8).zip(weights.chunks_exact(8)) { let j = xvld1q_f32_x2(src.as_ptr()); @@ -68,3 +68,11 @@ unsafe fn convert_weights_to_f16_impl(weights: &[f32]) -> Vec { new_weights } + +#[cfg(feature = "nightly_f16")] +use core::f16; + +#[cfg(feature = "nightly_f16")] +pub(crate) fn convert_weights_to_f16_fhm(weights: &[f32]) -> Vec { + unsafe { convert_weights_to_f16_impl(weights) } +} diff --git a/src/plane_f32.rs b/src/plane_f32.rs index d9cad85..49e1e93 100644 --- a/src/plane_f32.rs +++ b/src/plane_f32.rs @@ -28,7 +28,7 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::convolve_vertical_avx_row_f32; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::convolve_naive_f32::{ convolve_horizontal_rgb_native_row, convolve_horizontal_rgba_4_row_f32, }; @@ -57,6 +57,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), @@ -96,6 +97,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/plane_u16.rs b/src/plane_u16.rs index bb86038..6cc8e7c 100644 --- a/src/plane_u16.rs +++ b/src/plane_u16.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
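The `vqrdmlahq_lane_s16` path above keeps the whole vertical pass in 16-bit registers: each 8-bit sample is widened by `expand8_to_14` and multiplied against a fixed-point weight with a rounding-doubling multiply-accumulate. Below is a scalar sketch of that arithmetic; the expansion shift, the Q15 weight scale, and the final narrowing shift are assumptions, since the const generics are not visible in this flattened diff.

```rust
// Scalar model of the NEON "RDM" vertical path, assuming `expand8_to_14`
// is a left shift by 6 and weights are Q15. Not the crate's code.

/// Scalar equivalent of one lane of `vqrdmlahq_s16`:
/// acc + ((2 * a * b + (1 << 15)) >> 16), saturating.
fn qrdmlah(acc: i16, a: i16, b: i16) -> i16 {
    let product = 2 * (a as i32) * (b as i32) + (1 << 15);
    acc.saturating_add((product >> 16) as i16)
}

/// One output pixel of a vertical convolution over `rows.len()` source rows.
fn vertical_pixel(rows: &[&[u8]], x: usize, weights: &[i16], rounding: i16) -> u8 {
    let mut store = rounding;
    for (row, &w) in rows.iter().zip(weights) {
        let expanded = (row[x] as i16) << 6; // "expand8_to_14"
        store = qrdmlah(store, expanded, w);
    }
    // The SIMD code narrows with `vqshrun_n_s16`; the shift amount there is a
    // const generic the flattened diff does not show, so 6 is a placeholder.
    const NARROW_SHIFT: u32 = 6;
    (store >> NARROW_SHIFT).clamp(0, 255) as u8
}

fn main() {
    let rows: [&[u8]; 2] = [&[10, 200, 255], &[30, 180, 0]];
    let weights = [16384i16, 16384]; // two 0.5 weights in Q15
    let px = vertical_pixel(&rows, 1, &weights, 1 << 5);
    println!("blended: {px}"); // ~ (200 + 180) / 2
}
```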
*/ -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; use crate::image_store::ImageStoreMut; @@ -41,6 +41,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _: ConvolutionOptions, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); } @@ -52,7 +53,8 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ) { - convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); + convolve_vertical_dispatch_u16(self, filter_weights, destination, pool, options); } } diff --git a/src/plane_u8.rs b/src/plane_u8.rs index c8074a1..1ca91de 100644 --- a/src/plane_u8.rs +++ b/src/plane_u8.rs @@ -28,7 +28,7 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp}; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ @@ -37,8 +37,6 @@ use crate::handler_provider::{ use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::{convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_rows_4_u8}; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::{convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::{ convolve_horizontal_plane_sse_row, convolve_horizontal_plane_sse_rows_4_u8, @@ -56,6 +54,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.height as f32 / destination.height as f32; let mut _dispatcher_4_rows: Option< @@ -65,15 +64,31 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 1> { handle_fixed_row_u8::<1>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_4_u8); - _dispatcher_1_row = convolve_horizontal_plane_neon_row; - if _scale_factor < 8. 
&& crate::cpu_features::is_aarch_rdm_supported() { - use crate::neon::{ - convolve_horizontal_plane_neon_rdm_row, - convolve_horizontal_plane_neon_rows_rdm_4_u8, - }; - _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_rdm_4_u8); - _dispatcher_1_row = convolve_horizontal_plane_neon_rdm_row; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::{ + convolve_horizontal_plane_neon_row_q, + convolve_horizontal_plane_neon_rows_4_u8_q, + }; + _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_4_u8_q); + _dispatcher_1_row = convolve_horizontal_plane_neon_row_q; + } + crate::WorkloadStrategy::PreferSpeed => { + _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_4_u8); + _dispatcher_1_row = convolve_horizontal_plane_neon_row; + #[cfg(feature = "rdm")] + if _scale_factor < 8. + && crate::cpu_features::is_aarch_rdm_supported() + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::neon::{ + convolve_horizontal_plane_neon_rdm_row, + convolve_horizontal_plane_neon_rows_rdm_4_u8, + }; + _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_rdm_4_u8); + _dispatcher_1_row = convolve_horizontal_plane_neon_rdm_row; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -81,7 +96,9 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 1> { if is_x86_feature_detected!("sse4.1") { _dispatcher_4_rows = Some(convolve_horizontal_plane_sse_rows_4_u8); _dispatcher_1_row = convolve_horizontal_plane_sse_row; - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { use crate::sse::{ convolve_horizontal_plane_sse_row_hrs, convolve_horizontal_plane_sse_rows_hrs_4_u8, @@ -109,6 +126,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.height as f32 / destination.height as f32; #[allow(clippy::type_complexity)] @@ -116,35 +134,56 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> { handle_fixed_column_u8; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - // For more downscaling better to use more precise version - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { - _dispatcher = convolve_vertical_neon_i16_precision; - } else { - _dispatcher = convolve_vertical_neon_i32_precision; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::convolve_vertical_neon_i32_precision_d; + _dispatcher = convolve_vertical_neon_i32_precision_d; + } + crate::WorkloadStrategy::PreferSpeed => { + // For more downscaling better to use more precise version + #[cfg(feature = "rdm")] + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { + use crate::neon::convolve_vertical_neon_i16_precision; + _dispatcher = convolve_vertical_neon_i16_precision; + } else { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + #[cfg(not(feature = "rdm"))] + { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if is_x86_feature_detected!("sse4.1") { - if _scale_factor < 8. { + if _scale_factor < 8. 
+ && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_sse_row_lp; } else { _dispatcher = convolve_vertical_sse_row; } } if is_x86_feature_detected!("avx2") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_avx_row_lp; } else { _dispatcher = convolve_vertical_avx_row; } } #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avx512bw") { - if _scale_factor < 8. { - use crate::avx512::convolve_vertical_avx512_row_lp; - _dispatcher = convolve_vertical_avx512_row_lp; - } + if std::arch::is_x86_feature_detected!("avx512bw") + && _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::avx512::convolve_vertical_avx512_row_lp; + _dispatcher = convolve_vertical_avx512_row_lp; } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] diff --git a/src/resize_ar30.rs b/src/resize_ar30.rs index bf36dd2..281abf2 100644 --- a/src/resize_ar30.rs +++ b/src/resize_ar30.rs @@ -26,18 +26,21 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +use crate::convolution::ConvolutionOptions; use crate::dispatch_group_ar30::{ convolve_horizontal_dispatch_ar30, convolve_vertical_dispatch_ar30, }; use crate::nearest_sampler::resize_nearest; use crate::pic_scale_error::PicScaleError; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ResamplingFunction, Scaler}; +use crate::{ImageSize, PicScaleBufferMismatch, ResamplingFunction, Scaler}; pub(crate) fn resize_ar30_impl( - src: &[u32], + src: &[u8], + src_stride: usize, src_size: ImageSize, - dst: &mut [u32], + dst: &mut [u8], + dst_stride: usize, dst_size: ImageSize, scaler: &Scaler, ) -> Result<(), PicScaleError> { @@ -45,14 +48,40 @@ pub(crate) fn resize_ar30_impl( return Err(PicScaleError::ZeroImageDimensions); } - if check_image_size_overflow(src_size.width, src_size.height, 1) { + if check_image_size_overflow(src_size.width, src_size.height, 4) { return Err(PicScaleError::SourceImageIsTooLarge); } - if check_image_size_overflow(dst_size.width, dst_size.height, 1) { + if check_image_size_overflow(dst_size.width, dst_size.height, 4) { return Err(PicScaleError::DestinationImageIsTooLarge); } + if src.len() != src_stride * src_size.height { + return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { + expected: src_stride * src_size.height, + width: src_size.width, + height: src_size.height, + channels: 4, + slice_len: src.len(), + })); + } + if src_stride < src_size.width * 4 { + return Err(PicScaleError::InvalidStride(src_size.width * 4, src_stride)); + } + + if dst.len() != dst_stride * dst_size.height { + return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { + expected: dst_stride * dst_size.height, + width: dst_size.width, + height: dst_size.height, + channels: 4, + slice_len: dst.len(), + })); + } + if dst_stride < dst_size.width * 4 { + return Err(PicScaleError::InvalidStride(dst_size.width * 4, dst_stride)); + } + if src_size.width == dst_size.width && src_size.height == dst_size.height { for (src, dst) in src.iter().zip(dst.iter_mut()) { *dst = *src; @@ -65,12 +94,12 @@ pub(crate) fn resize_ar30_impl( .get_pool(ImageSize::new(dst_size.width, dst_size.height)); if scaler.function == ResamplingFunction::Nearest { - resize_nearest::( + resize_nearest::( src, - src_size.width, 
+ src_stride, src_size.height, dst, - dst_size.width, + dst_stride, dst_size.height, &pool, ); @@ -81,46 +110,56 @@ pub(crate) fn resize_ar30_impl( let should_do_vertical = src_size.height != dst_size.height; assert!(should_do_horizontal || should_do_vertical); + let options = ConvolutionOptions::new(scaler.workload_strategy); + if should_do_vertical && !should_do_horizontal { let vertical_filters = scaler.generate_weights(src_size.height, dst_size.height); convolve_vertical_dispatch_ar30::( src, - src_size.width, + src_stride, vertical_filters, dst, - src_size.width, + src_stride, &pool, + src_size.width, + options, ); return Ok(()); - } - - let working_store = if should_do_vertical { - let mut target = vec![0u32; src_size.width * dst_size.height]; + } else if should_do_horizontal && should_do_vertical { + let mut target = vec![0u8; src_size.width * dst_size.height * 4]; let vertical_filters = scaler.generate_weights(src_size.height, dst_size.height); convolve_vertical_dispatch_ar30::( src, - src_size.width, + src_stride, vertical_filters, &mut target, - src_size.width, + src_size.width * 4, &pool, + src_size.width, + options, ); - std::borrow::Cow::Owned(target) + let horizontal_filters = scaler.generate_weights(src_size.width, dst_size.width); + convolve_horizontal_dispatch_ar30::( + &target, + src_size.width * 4, + horizontal_filters, + dst, + dst_stride, + &pool, + options, + ); } else { - std::borrow::Cow::Borrowed(src) - }; - - if should_do_horizontal { let horizontal_filters = scaler.generate_weights(src_size.width, dst_size.width); convolve_horizontal_dispatch_ar30::( - working_store.as_ref(), - src_size.width, + src, + src_stride, horizontal_filters, dst, - dst_size.width, + dst_stride, &pool, + options, ); } diff --git a/src/rgb_f32.rs b/src/rgb_f32.rs index d02a216..8bab908 100644 --- a/src/rgb_f32.rs +++ b/src/rgb_f32.rs @@ -28,7 +28,7 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::convolve_vertical_avx_row_f32; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::convolve_naive_f32::*; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; @@ -58,6 +58,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), @@ -97,6 +98,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/rgb_u16.rs b/src/rgb_u16.rs index d420454..b44b093 100644 --- a/src/rgb_u16.rs +++ b/src/rgb_u16.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
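The new `resize_ar30_impl` signature takes byte slices plus explicit strides, so the patch validates the buffers up front before dispatching. A minimal sketch of that check, assuming 4 bytes per packed 10-bit pixel and using local stand-in error types rather than the crate's `PicScaleError`:

```rust
// Hypothetical stand-ins for the crate's error variants.
#[derive(Debug)]
enum ValidationError {
    BufferMismatch { expected: usize, got: usize },
    InvalidStride { min: usize, got: usize },
}

fn check_ar30_plane(
    data_len: usize,
    stride: usize,
    width: usize,
    height: usize,
) -> Result<(), ValidationError> {
    // The patch requires an exact match between slice length and stride * height,
    // not merely "at least that long".
    let expected = stride * height;
    if data_len != expected {
        return Err(ValidationError::BufferMismatch { expected, got: data_len });
    }
    // AR30/RA30 packs one pixel into a u32, i.e. 4 bytes per pixel.
    let min_stride = width * 4;
    if stride < min_stride {
        return Err(ValidationError::InvalidStride { min: min_stride, got: stride });
    }
    Ok(())
}

fn main() {
    // 100x50 image stored with a 16-byte padded row stride.
    let stride = 100 * 4 + 16;
    assert!(check_ar30_plane(stride * 50, stride, 100, 50).is_ok());
    assert!(check_ar30_plane(stride * 50 - 1, stride, 100, 50).is_err());
}
```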
*/ -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; use crate::image_store::ImageStoreMut; @@ -41,6 +41,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _: ConvolutionOptions, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); } @@ -52,7 +53,8 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ) { - convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); + convolve_vertical_dispatch_u16(self, filter_weights, destination, pool, options); } } diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index 5f541fc..1c73732 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -28,7 +28,7 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp}; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ @@ -53,6 +53,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.width as f32 / destination.width as f32; let mut _dispatcher_4_rows: Option< @@ -63,15 +64,28 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4); - _dispatcher_1_row = convolve_horizontal_rgb_neon_row_one; - if _scale_factor < 8.0 && crate::cpu_features::is_aarch_rdm_supported() { - use crate::neon::{ - convolve_horizontal_rgb_neon_rdm_row_one, - convolve_horizontal_rgb_neon_rdm_rows_4, - }; - _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rdm_rows_4); - _dispatcher_1_row = convolve_horizontal_rgb_neon_rdm_row_one; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::{ + convolve_horizontal_rgb_neon_row_one_q, + convolve_horizontal_rgb_neon_rows_4_q, + }; + _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4_q); + _dispatcher_1_row = convolve_horizontal_rgb_neon_row_one_q; + } + crate::WorkloadStrategy::PreferSpeed => { + _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4); + _dispatcher_1_row = convolve_horizontal_rgb_neon_row_one; + #[cfg(feature = "rdm")] + if _scale_factor < 8.0 && crate::cpu_features::is_aarch_rdm_supported() { + use crate::neon::{ + convolve_horizontal_rgb_neon_rdm_row_one, + convolve_horizontal_rgb_neon_rdm_rows_4, + }; + _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rdm_rows_4); + _dispatcher_1_row = convolve_horizontal_rgb_neon_rdm_row_one; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -106,6 +120,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> { filter_weights: FilterWeights, destination: 
&mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.height as f32 / destination.height as f32; #[allow(clippy::type_complexity)] @@ -113,35 +128,56 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> { handle_fixed_column_u8; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - // For more downscaling better to use more precise version - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { - _dispatcher = convolve_vertical_neon_i16_precision; - } else { - _dispatcher = convolve_vertical_neon_i32_precision; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::convolve_vertical_neon_i32_precision_d; + _dispatcher = convolve_vertical_neon_i32_precision_d; + } + crate::WorkloadStrategy::PreferSpeed => { + // For more downscaling better to use more precise version + #[cfg(feature = "rdm")] + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { + use crate::neon::convolve_vertical_neon_i16_precision; + _dispatcher = convolve_vertical_neon_i16_precision; + } else { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + #[cfg(not(feature = "rdm"))] + { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if is_x86_feature_detected!("sse4.1") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_sse_row_lp; } else { _dispatcher = convolve_vertical_sse_row; } } if is_x86_feature_detected!("avx2") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_avx_row_lp; } else { _dispatcher = convolve_vertical_avx_row; } } #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avx512bw") { - if _scale_factor < 8. { - use crate::avx512::convolve_vertical_avx512_row_lp; - _dispatcher = convolve_vertical_avx512_row_lp; - } + if std::arch::is_x86_feature_detected!("avx512bw") + && _scale_factor < 8. 
+ && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::avx512::convolve_vertical_avx512_row_lp; + _dispatcher = convolve_vertical_avx512_row_lp; } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] diff --git a/src/rgba_f32.rs b/src/rgba_f32.rs index d3cff0c..4a78f4d 100644 --- a/src/rgba_f32.rs +++ b/src/rgba_f32.rs @@ -31,7 +31,7 @@ use crate::avx2::{ convolve_horizontal_rgba_avx_row_one_f32, convolve_horizontal_rgba_avx_rows_4_f32, convolve_vertical_avx_row_f32, }; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::convolve_naive_f32::{ convolve_horizontal_rgb_native_row, convolve_horizontal_rgba_4_row_f32, }; @@ -53,6 +53,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), @@ -100,6 +101,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/rgba_u16.rs b/src/rgba_u16.rs index 613bc19..2bfc7cd 100644 --- a/src/rgba_u16.rs +++ b/src/rgba_u16.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #![forbid(unsafe_code)] -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; use crate::image_store::ImageStoreMut; @@ -41,6 +41,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _: ConvolutionOptions, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); } @@ -52,7 +53,8 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ) { - convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); + convolve_vertical_dispatch_u16(self, filter_weights, destination, pool, options); } } diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index 0c57593..fa57723 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -33,7 +33,7 @@ use crate::avx2::{ convolve_horizontal_rgba_avx_rows_4_lb, convolve_horizontal_rgba_avx_rows_one_lb, convolve_vertical_avx_row, convolve_vertical_avx_row_lp, }; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ @@ -60,6 +60,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.width as f32 / destination.width as 
f32; let mut _dispatcher_4_rows: Option< @@ -69,18 +70,38 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { handle_fixed_row_u8::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { - _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8_i16); - _dispatcher_1_row = convolve_horizontal_rgba_neon_row_i16; - } else { - _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8); - _dispatcher_1_row = convolve_horizontal_rgba_neon_row; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::{ + convolve_horizontal_rgba_neon_row_q, + convolve_horizontal_rgba_neon_rows_4_u8_q, + }; + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8_q); + _dispatcher_1_row = convolve_horizontal_rgba_neon_row_q; + } + crate::WorkloadStrategy::PreferSpeed => { + #[cfg(feature = "rdm")] + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8_i16); + _dispatcher_1_row = convolve_horizontal_rgba_neon_row_i16; + } else { + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8); + _dispatcher_1_row = convolve_horizontal_rgba_neon_row; + } + #[cfg(not(feature = "rdm"))] + { + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8); + _dispatcher_1_row = convolve_horizontal_rgba_neon_row; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if std::arch::is_x86_feature_detected!("sse4.1") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4_lb); _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one_lb; } else { @@ -88,7 +109,9 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one; } } - if std::arch::is_x86_feature_detected!("avx2") || _scale_factor < 8. { + if (std::arch::is_x86_feature_detected!("avx2") || _scale_factor < 8.) + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher_4_rows = Some(convolve_horizontal_rgba_avx_rows_4_lb); _dispatcher_1_row = convolve_horizontal_rgba_avx_rows_one_lb; } @@ -121,6 +144,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.height as f32 / destination.height as f32; #[allow(clippy::type_complexity)] @@ -128,35 +152,56 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { handle_fixed_column_u8; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - // For more downscaling better to use more precise version - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { - _dispatcher = convolve_vertical_neon_i16_precision; - } else { - _dispatcher = convolve_vertical_neon_i32_precision; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::convolve_vertical_neon_i32_precision_d; + _dispatcher = convolve_vertical_neon_i32_precision_d; + } + crate::WorkloadStrategy::PreferSpeed => { + // For more downscaling better to use more precise version + #[cfg(feature = "rdm")] + if _scale_factor < 8. 
&& crate::cpu_features::is_aarch_rdm_supported() { + use crate::neon::convolve_vertical_neon_i16_precision; + _dispatcher = convolve_vertical_neon_i16_precision; + } else { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + #[cfg(not(feature = "rdm"))] + { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if std::arch::is_x86_feature_detected!("sse4.1") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_sse_row_lp; } else { _dispatcher = convolve_vertical_sse_row; } } if std::arch::is_x86_feature_detected!("avx2") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_avx_row_lp; } else { _dispatcher = convolve_vertical_avx_row; } } #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avx512bw") { - if _scale_factor < 8. { - use crate::avx512::convolve_vertical_avx512_row_lp; - _dispatcher = convolve_vertical_avx512_row_lp; - } + if std::arch::is_x86_feature_detected!("avx512bw") + && _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::avx512::convolve_vertical_avx512_row_lp; + _dispatcher = convolve_vertical_avx512_row_lp; } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] diff --git a/src/scaler.rs b/src/scaler.rs index ea0523b..1b4d94f 100644 --- a/src/scaler.rs +++ b/src/scaler.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
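The recurring pattern in these hunks is that kernel selection now branches on `WorkloadStrategy` first, and only then on CPU features and the `scale_factor < 8.` heuristic. A condensed model of that dispatch is below, with placeholder kernels standing in for the real NEON/SSE rows and a boolean standing in for the `rdm` feature plus `is_aarch_rdm_supported()` check.

```rust
#[derive(Copy, Clone, PartialEq, Eq)]
enum WorkloadStrategy {
    PreferQuality,
    PreferSpeed,
}

type Kernel = fn(&[u8], &mut [u8]);

fn quality_kernel(_src: &[u8], _dst: &mut [u8]) { /* higher-precision path */ }
fn speed_kernel(_src: &[u8], _dst: &mut [u8]) { /* low-precision (e.g. RDM) path */ }
fn default_kernel(_src: &[u8], _dst: &mut [u8]) { /* baseline path */ }

fn pick_kernel(strategy: WorkloadStrategy, scale_factor: f32, fast_path_ok: bool) -> Kernel {
    match strategy {
        WorkloadStrategy::PreferQuality => quality_kernel,
        WorkloadStrategy::PreferSpeed => {
            // The low-precision path is only taken for moderate downscaling;
            // the patch uses `scale_factor < 8.` as the cut-off.
            if scale_factor < 8.0 && fast_path_ok {
                speed_kernel
            } else {
                default_kernel
            }
        }
    }
}

fn main() {
    let mut dst = [0u8; 4];
    for strategy in [WorkloadStrategy::PreferQuality, WorkloadStrategy::PreferSpeed] {
        let k = pick_kernel(strategy, 2.0, true);
        k(&[1, 2, 3, 4], &mut dst);
    }
}
```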
*/ use crate::ar30::{Ar30ByteOrder, Rgb30}; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::image_size::ImageSize; use crate::image_store::{ @@ -44,7 +44,7 @@ use crate::{ Rgb16ImageStore, Rgb8ImageStore, RgbF32ImageStore, Rgba16ImageStore, Rgba8ImageStore, RgbaF32ImageStore, }; -use num_traits::{AsPrimitive, Float, FromPrimitive, Signed}; +use num_traits::{AsPrimitive, Float, Signed}; use rayon::ThreadPool; use std::fmt::Debug; use std::ops::{AddAssign, MulAssign, Neg}; @@ -54,6 +54,7 @@ use std::ops::{AddAssign, MulAssign, Neg}; pub struct Scaler { pub(crate) function: ResamplingFunction, pub(crate) threading_policy: ThreadingPolicy, + pub workload_strategy: WorkloadStrategy, } pub trait Scaling { @@ -196,6 +197,15 @@ pub trait ScalingF32 { ) -> Result<(), PicScaleError>; } +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Default)] +pub enum WorkloadStrategy { + /// Prefers quality to speed + PreferQuality, + /// Prefers speed to quality + #[default] + PreferSpeed, +} + pub trait ScalingU16 { /// Performs rescaling for Planar u16 /// @@ -332,9 +342,14 @@ impl Scaler { Scaler { function: filter, threading_policy: ThreadingPolicy::Single, + workload_strategy: WorkloadStrategy::default(), } } + pub fn set_workload_strategy(&mut self, workload_strategy: WorkloadStrategy) { + self.workload_strategy = workload_strategy; + } + pub(crate) fn generate_weights(&self, in_size: usize, out_size: usize) -> FilterWeights where T: Copy @@ -555,7 +570,7 @@ impl Scaler { impl Scaler { pub(crate) fn generic_resize< 'a, - T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + T: Clone + Copy + Debug + Send + Sync + Default + 'static, const N: usize, >( &self, @@ -621,7 +636,8 @@ impl Scaler { )?; new_image_vertical.bit_depth = into.bit_depth; let vertical_filters = self.generate_weights(store.height, new_size.height); - store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); + let options = ConvolutionOptions::new(self.workload_strategy); + store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool, options); let new_immutable_store = ImageStore:: { buffer: std::borrow::Cow::Owned(target_vertical), @@ -632,23 +648,26 @@ impl Scaler { bit_depth: into.bit_depth, }; let horizontal_filters = self.generate_weights(store.width, new_size.width); - new_immutable_store.convolve_horizontal(horizontal_filters, into, &pool); + let options = ConvolutionOptions::new(self.workload_strategy); + new_immutable_store.convolve_horizontal(horizontal_filters, into, &pool, options); Ok(()) } else if should_do_vertical { let vertical_filters = self.generate_weights(store.height, new_size.height); - store.convolve_vertical(vertical_filters, into, &pool); + let options = ConvolutionOptions::new(self.workload_strategy); + store.convolve_vertical(vertical_filters, into, &pool, options); Ok(()) } else { assert!(should_do_horizontal); let horizontal_filters = self.generate_weights(store.width, new_size.width); - store.convolve_horizontal(horizontal_filters, into, &pool); + let options = ConvolutionOptions::new(self.workload_strategy); + store.convolve_horizontal(horizontal_filters, into, &pool, options); Ok(()) } } fn forward_resize_with_alpha< 'a, - T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + T: Clone + Copy + Debug + Send + Sync + Default + 
'static, const N: usize, >( &self, @@ -702,7 +721,8 @@ impl Scaler { )?; new_image_vertical.bit_depth = into.bit_depth; let vertical_filters = self.generate_weights(src_store.height, new_size.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, pool); + let options = ConvolutionOptions::new(self.workload_strategy); + src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, pool, options); let new_immutable_store = ImageStore:: { buffer: std::borrow::Cow::Owned(target_vertical), @@ -713,7 +733,8 @@ impl Scaler { bit_depth: into.bit_depth, }; let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - new_immutable_store.convolve_horizontal(horizontal_filters, into, pool); + let options = ConvolutionOptions::new(self.workload_strategy); + new_immutable_store.convolve_horizontal(horizontal_filters, into, pool, options); if premultiply_alpha_requested && has_alpha_premultiplied { into.unpremultiply_alpha(pool); @@ -724,7 +745,7 @@ impl Scaler { fn forward_resize_vertical_with_alpha< 'a, - T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + T: Clone + Copy + Debug + Send + Sync + Default + 'static, const N: usize, >( &self, @@ -769,7 +790,8 @@ impl Scaler { } let vertical_filters = self.generate_weights(src_store.height, new_size.height); - src_store.convolve_vertical(vertical_filters, into, pool); + let options = ConvolutionOptions::new(self.workload_strategy); + src_store.convolve_vertical(vertical_filters, into, pool, options); if premultiply_alpha_requested && has_alpha_premultiplied { into.unpremultiply_alpha(pool); @@ -780,7 +802,7 @@ impl Scaler { fn forward_resize_horizontal_with_alpha< 'a, - T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + T: Clone + Copy + Debug + Send + Sync + Default + 'static, const N: usize, >( &self, @@ -825,7 +847,8 @@ impl Scaler { } let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - src_store.convolve_horizontal(horizontal_filters, into, pool); + let options = ConvolutionOptions::new(self.workload_strategy); + src_store.convolve_horizontal(horizontal_filters, into, pool, options); if premultiply_alpha_requested && has_alpha_premultiplied { into.unpremultiply_alpha(pool); @@ -836,7 +859,7 @@ impl Scaler { pub(crate) fn generic_resize_with_alpha< 'a, - T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + T: Clone + Copy + Debug + Send + Sync + Default + 'static, const N: usize, >( &self, @@ -1097,6 +1120,8 @@ impl ScalingU16 for Scaler { impl Scaler { /// Resizes RGBA2101010 image /// + /// This method ignores alpha scaling. 
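From the caller's side the strategy is plumbed through `ConvolutionOptions` automatically; user code only touches the new setter. A minimal usage sketch based on the API added in this patch (the resize call itself is omitted because it depends on image-store setup not shown in this hunk):

```rust
// Assumes `WorkloadStrategy` is re-exported at the crate root, as the
// `crate::WorkloadStrategy` references in this patch suggest.
use pic_scale::{ResamplingFunction, Scaler, WorkloadStrategy};

fn main() {
    let mut scaler = Scaler::new(ResamplingFunction::Bilinear);
    // The default is PreferSpeed; opt into the higher-precision kernels.
    scaler.set_workload_strategy(WorkloadStrategy::PreferQuality);
    // ... build ImageStore / ImageStoreMut and call a resize_* method here.
}
```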
+ /// /// # Arguments /// `src` - source slice /// `src_size` - Source Image size @@ -1105,26 +1130,32 @@ impl Scaler { /// pub fn resize_ar30( &self, - src: &[u32], + src: &[u8], + src_stride: usize, src_size: ImageSize, - dst: &mut [u32], + dst: &mut [u8], + dst_stride: usize, new_size: ImageSize, order: Ar30ByteOrder, ) -> Result<(), PicScaleError> { match order { - Ar30ByteOrder::Host => resize_ar30_impl::< - { Rgb30::Ar30 as usize }, - { Ar30ByteOrder::Host as usize }, - >(src, src_size, dst, new_size, self), - Ar30ByteOrder::Network => resize_ar30_impl::< - { Rgb30::Ar30 as usize }, - { Ar30ByteOrder::Network as usize }, - >(src, src_size, dst, new_size, self), + Ar30ByteOrder::Host => { + resize_ar30_impl::<{ Rgb30::Ar30 as usize }, { Ar30ByteOrder::Host as usize }>( + src, src_stride, src_size, dst, dst_stride, new_size, self, + ) + } + Ar30ByteOrder::Network => { + resize_ar30_impl::<{ Rgb30::Ar30 as usize }, { Ar30ByteOrder::Network as usize }>( + src, src_stride, src_size, dst, dst_stride, new_size, self, + ) + } } } /// Resizes RGBA1010102 image /// + /// This method ignores alpha scaling. + /// /// # Arguments /// `src` - source slice /// `src_size` - Source Image size @@ -1133,21 +1164,25 @@ impl Scaler { /// pub fn resize_ra30( &self, - src: &[u32], + src: &[u8], + src_stride: usize, src_size: ImageSize, - dst: &mut [u32], + dst: &mut [u8], + dst_stride: usize, new_size: ImageSize, order: Ar30ByteOrder, ) -> Result<(), PicScaleError> { match order { - Ar30ByteOrder::Host => resize_ar30_impl::< - { Rgb30::Ra30 as usize }, - { Ar30ByteOrder::Host as usize }, - >(src, src_size, dst, new_size, self), - Ar30ByteOrder::Network => resize_ar30_impl::< - { Rgb30::Ra30 as usize }, - { Ar30ByteOrder::Network as usize }, - >(src, src_size, dst, new_size, self), + Ar30ByteOrder::Host => { + resize_ar30_impl::<{ Rgb30::Ra30 as usize }, { Ar30ByteOrder::Host as usize }>( + src, src_stride, src_size, dst, dst_stride, new_size, self, + ) + } + Ar30ByteOrder::Network => { + resize_ar30_impl::<{ Rgb30::Ra30 as usize }, { Ar30ByteOrder::Network as usize }>( + src, src_stride, src_size, dst, dst_stride, new_size, self, + ) + } } } } @@ -1162,7 +1197,7 @@ pub struct ScalingOptions { pub trait ImageStoreScaling<'b, T, const N: usize> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { fn scale( &self, diff --git a/src/scaler_f16.rs b/src/scaler_f16.rs index 8c6d1f1..eafe342 100644 --- a/src/scaler_f16.rs +++ b/src/scaler_f16.rs @@ -34,7 +34,7 @@ use crate::{ CbCrF16ImageStore, ImageStore, ImageStoreScaling, PlanarF16ImageStore, RgbF16ImageStore, RgbaF16ImageStore, Scaler, Scaling, ThreadingPolicy, }; -use half::f16; +use core::f16; /// Implements `f16` type support impl Scaler { @@ -56,7 +56,7 @@ impl Scaler { /// use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; /// let mut scaler = Scaler::new(ResamplingFunction::Bilinear); /// let src_store = ImageStore::alloc(100, 100); - /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); + /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); /// scaler.resize_rgba_f16(&src_store, &mut dst_store, false).unwrap(); /// ``` pub fn resize_rgba_f16<'a>( @@ -84,7 +84,7 @@ impl Scaler { /// use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; /// let mut scaler = Scaler::new(ResamplingFunction::Bilinear); /// let src_store = ImageStore::alloc(100, 100); - /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); + /// let mut dst_store = 
ImageStoreMut::::alloc_with_depth(50, 50, 10); /// scaler.resize_rgb_f16(&src_store, &mut dst_store).unwrap(); /// ``` pub fn resize_rgb_f16<'a>( @@ -111,7 +111,7 @@ impl Scaler { /// use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; /// let mut scaler = Scaler::new(ResamplingFunction::Bilinear); /// let src_store = ImageStore::alloc(100, 100); - /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); + /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); /// scaler.resize_cbcr_f16(&src_store, &mut dst_store).unwrap(); /// ``` pub fn resize_cbcr_f16<'a>( @@ -138,7 +138,7 @@ impl Scaler { /// use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; /// let mut scaler = Scaler::new(ResamplingFunction::Bilinear); /// let src_store = ImageStore::alloc(100, 100); - /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); + /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); /// scaler.resize_plane_f16(&src_store, &mut dst_store).unwrap(); /// ``` /// diff --git a/src/sse/alpha_f16.rs b/src/sse/alpha_f16.rs index a8ff24d..e0c6195 100644 --- a/src/sse/alpha_f16.rs +++ b/src/sse/alpha_f16.rs @@ -39,9 +39,9 @@ use std::arch::x86::*; use std::arch::x86_64::*; pub(crate) fn sse_premultiply_alpha_rgba_f16( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, height: usize, @@ -60,9 +60,9 @@ pub(crate) fn sse_premultiply_alpha_rgba_f16( #[target_feature(enable = "sse4.1")] unsafe fn sse_premultiply_alpha_rgba_f16_regular( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, height: usize, @@ -75,9 +75,9 @@ unsafe fn sse_premultiply_alpha_rgba_f16_regular( #[target_feature(enable = "sse4.1", enable = "f16c")] unsafe fn sse_premultiply_alpha_rgba_f16c( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, height: usize, @@ -89,10 +89,7 @@ unsafe fn sse_premultiply_alpha_rgba_f16c( } #[inline(always)] -unsafe fn sse_premultiply_alpha_rgba_row_f16_impl( - dst: &mut [half::f16], - src: &[half::f16], -) { +unsafe fn sse_premultiply_alpha_rgba_row_f16_impl(dst: &mut [f16], src: &[f16]) { let mut rem = dst; let mut src_rem = src; @@ -145,9 +142,9 @@ unsafe fn sse_premultiply_alpha_rgba_row_f16_impl( #[inline(always)] unsafe fn sse_premultiply_alpha_rgba_f16_impl( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, _: usize, @@ -177,7 +174,7 @@ unsafe fn sse_premultiply_alpha_rgba_f16_impl( } pub(crate) fn sse_unpremultiply_alpha_rgba_f16( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, height: usize, @@ -194,7 +191,7 @@ pub(crate) fn sse_unpremultiply_alpha_rgba_f16( #[target_feature(enable = "sse4.1")] unsafe fn sse_unpremultiply_alpha_rgba_f16_regular( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, height: usize, @@ -205,7 +202,7 @@ unsafe fn sse_unpremultiply_alpha_rgba_f16_regular( #[target_feature(enable = "sse4.1", enable = "f16c")] unsafe fn sse_unpremultiply_alpha_rgba_f16c( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, height: usize, @@ -215,7 +212,7 @@ unsafe fn sse_unpremultiply_alpha_rgba_f16c( } #[inline(always)] -unsafe fn sse_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut 
[half::f16]) { +unsafe fn sse_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [f16]) { let mut rem = in_place; for dst in rem.chunks_exact_mut(8 * 4) { @@ -293,7 +290,7 @@ unsafe fn sse_unpremultiply_alpha_rgba_f16_row_impl(in_place: #[inline(always)] unsafe fn sse_unpremultiply_alpha_rgba_f16_impl( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, _: usize, diff --git a/src/sse/f16_utils.rs b/src/sse/f16_utils.rs index 7f7a8e1..f0e78e2 100644 --- a/src/sse/f16_utils.rs +++ b/src/sse/f16_utils.rs @@ -211,7 +211,7 @@ pub(crate) unsafe fn _mm_cvtph_psx(x: __m128i) -> __m128 { #[cfg(test)] mod tests { use super::*; - use half::f16; + use f16; #[test] fn test_conversion_into_f16() { diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 5f7a083..1ef68b2 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -27,23 +27,23 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod alpha_f16; mod alpha_f32; mod alpha_u16; mod alpha_u8; mod cbcr8_hrs; mod check_alpha; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod f16_utils; mod plane_f32; mod plane_u8; mod plane_u8_hrs; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod rgb_f16; mod rgb_f32; mod rgb_u8; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod rgba_f16; mod rgba_f32; mod rgba_u16; @@ -53,7 +53,7 @@ mod rgba_u8_lb; mod routines; mod u8_utils; mod utils; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod vertical_f16; mod vertical_f32; mod vertical_u16; @@ -61,7 +61,7 @@ mod vertical_u16_lb; mod vertical_u8; mod vertical_u8_lp; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16}; pub(crate) use alpha_f32::sse_premultiply_alpha_rgba_f32; pub(crate) use alpha_f32::sse_unpremultiply_alpha_rgba_f32; @@ -81,7 +81,7 @@ pub(crate) use plane_u8::{ pub(crate) use plane_u8_hrs::{ convolve_horizontal_plane_sse_row_hrs, convolve_horizontal_plane_sse_rows_hrs_4_u8, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgb_f16::{ convolve_horizontal_rgb_sse_row_one_f16, convolve_horizontal_rgb_sse_rows_4_f16, }; @@ -89,7 +89,7 @@ pub(crate) use rgb_f32::{ convolve_horizontal_rgb_sse_row_one_f32, convolve_horizontal_rgb_sse_rows_4_f32, }; pub(crate) use rgb_u8::*; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgba_f16::{ convolve_horizontal_rgba_sse_row_one_f16, convolve_horizontal_rgba_sse_rows_4_f16, }; @@ -111,7 +111,7 @@ pub(crate) use rgba_u8_lb::{ pub(crate) use routines::{load_4_weights, load_4_weights_group_2_avx, load_8_weights_group_4_avx}; pub(crate) use u8_utils::*; pub(crate) use utils::*; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use vertical_f16::convolve_vertical_sse_row_f16; pub(crate) use vertical_f32::convolve_vertical_rgb_sse_row_f32; pub(crate) use vertical_u16::convolve_column_sse_u16; diff --git a/src/sse/rgb_f16.rs b/src/sse/rgb_f16.rs index 637b774..66335b6 100644 --- a/src/sse/rgb_f16.rs +++ b/src/sse/rgb_f16.rs @@ -30,7 +30,7 @@ use crate::filter_weights::FilterWeights; use crate::sse::f16_utils::{_mm_cvtph_psx, _mm_cvtps_phx}; use crate::sse::{_mm_prefer_fma_ps, load_4_weights, shuffle}; -use half::f16; +use core::f16; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index 5e367d2..0f234b1 100644 --- 
a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -32,7 +32,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use half::f16; +use core::f16; use crate::filter_weights::FilterWeights; use crate::sse::f16_utils::{_mm_cvtph_psx, _mm_cvtps_phx}; diff --git a/src/sse/vertical_f16.rs b/src/sse/vertical_f16.rs index dfa8782..9807205 100644 --- a/src/sse/vertical_f16.rs +++ b/src/sse/vertical_f16.rs @@ -26,6 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +use core::f16; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -39,9 +40,9 @@ use crate::sse::f16_utils::{_mm_cvtph_psx, _mm_cvtps_phx}; pub(crate) unsafe fn convolve_vertical_part_sse_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -71,9 +72,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -102,9 +103,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_4_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -155,9 +156,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_16_16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -192,8 +193,8 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8_f16( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -217,8 +218,8 @@ pub(crate) fn convolve_vertical_sse_row_f16( unsafe fn convolve_vertical_sse_row_f16_regular( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -234,8 +235,8 @@ unsafe fn convolve_vertical_sse_row_f16_regular( unsafe fn convolve_vertical_sse_row_f16c_fma( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -251,8 +252,8 @@ unsafe fn convolve_vertical_sse_row_f16c_fma( unsafe fn convolve_vertical_sse_row_f16c( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -265,8 +266,8 @@ unsafe fn convolve_vertical_sse_row_f16c( unsafe fn convolve_vertical_sse_row_f16_impl( _: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) {
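The remaining hunks swap `half::f16` for the nightly `core::f16` primitive behind the `nightly_f16` feature; the kernels still widen to `f32` for the actual multiply-accumulate and only narrow back to half precision on store. A scalar outline of that pattern, nightly-only to mirror the feature gate (the SIMD versions use F16C/FP16 conversions instead of these casts):

```rust
#![feature(f16)] // requires a nightly toolchain, like the crate's nightly_f16 feature

fn convolve_column_f16(column: &[f16], weights: &[f32]) -> f16 {
    let mut acc = 0.0f32;
    for (&v, &w) in column.iter().zip(weights) {
        // Widen each half-precision sample before the multiply-accumulate,
        // so precision is lost only once, at the final store.
        acc += (v as f32) * w;
    }
    acc as f16
}

fn main() {
    let column = [1.0f16, 2.0, 3.0];
    let weights = [0.25f32, 0.5, 0.25];
    println!("{}", convolve_column_f16(&column, &weights) as f32); // 2.0
}
```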