Skip to content

Commit

Permalink
Merge pull request #14 from awxkee/dev
Browse files Browse the repository at this point in the history
Added RGBA8 AVX2 fast path
  • Loading branch information
awxkee authored Dec 29, 2024
2 parents ab25ef6 + 3f494d3 commit ae8436e
Show file tree
Hide file tree
Showing 39 changed files with 799 additions and 151 deletions.
39 changes: 19 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,37 +43,36 @@ Despite all implementation are fast, not all the paths are implemented using SIM

`~` - Partially implemented

| | NEON | SSE | AVX | WASM |
|----------------|------|-----|-----|------|
| RGBA (8 bit) | x | x | ~ | ~ |
| RGB (8 bit) | x | x | ~ | ~ |
| Plane (8 bit) | x | x | ~ | ~ |
| RGBA (8+ bit) | x | x | ~ | - |
| RGB (8+ bit) | x | x | ~ | - |
| Plane (8+ bit) | ~ | ~ | ~ | - |
| RGBA (f32) | x | x | x | - |
| RGB (f32) | x | x | ~ | - |
| Plane (f32) | x | x | ~ | - |
| RGBA (f16) | x | x | x | - |
| RGB (f16) | x | ~ | ~ | - |
| Plane (f16) | ~ | ~ | ~ | - |
| AR30/RA30 | x | - | - | - |
| | NEON | SSE | AVX2 | WASM |
|----------------|------|-----|------|------|
| RGBA (8 bit) | x | x | x | ~ |
| RGB (8 bit) | x | x | ~ | ~ |
| Plane (8 bit) | x | x | ~ | ~ |
| RGBA (8+ bit) | x | x | ~ | - |
| RGB (8+ bit) | x | x | ~ | - |
| Plane (8+ bit) | ~ | ~ | ~ | - |
| RGBA (f32) | x | x | x | - |
| RGB (f32) | x | x | ~ | - |
| Plane (f32) | x | x | ~ | - |
| RGBA (f16) | x | x | x | - |
| RGB (f16) | x | ~ | ~ | - |
| Plane (f16) | ~ | ~ | ~ | - |
| AR30/RA30 | x | - | - | - |

#### Features

For RISC-V, the `riscv` feature should be explicitly enabled; the nightly compiler channel is required.

To enable support of `f16` the feature `half` should be activated.

#### Target features

`neon` optional target features are available, enable it when compiling on supported platform to get full features
`neon` optional target features are available, enable it when compiling on supported platform to get full features.

`avx2`, `fma`, `sse4.1`, `f16c` will be detected automatically if available, and called the best path
`avx2`, `fma`, `sse4.1`, `f16c` will be detected automatically if available, and the best available path will be called.
For x86 and aarch64 NEON runtime dispatch is used.

`fullfp16` NEON target detection is performed at runtime; when it is available, the best paths for *f16* images are used on ARM.

WASM `simd128` target feature activating is mandatory in build flags
WASM `simd128` target feature activating is mandatory in build flags.

##### About f16

Expand Down
37 changes: 19 additions & 18 deletions app/benches/resize_rgb/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@ pub fn criterion_benchmark(c: &mut Criterion) {
let src_bytes = binding.as_bytes();

c.bench_function("Pic scale RGB: Lanczos 3", |b| {
let mut copied: Vec<u8> = Vec::from(src_bytes);
let store = ImageStore::<u8, 3>::from_slice(
&mut copied,
dimensions.0 as usize,
dimensions.1 as usize,
)
.unwrap();
b.iter(|| {
let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
scaler.set_threading_policy(ThreadingPolicy::Single);
let mut copied: Vec<u8> = Vec::from(src_bytes);
let store = ImageStore::<u8, 3>::from_slice(
&mut copied,
dimensions.0 as usize,
dimensions.1 as usize,
)
.unwrap();
let mut target =
ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
scaler.resize_rgb(&store, &mut target).unwrap();
Expand All @@ -36,28 +36,29 @@ pub fn criterion_benchmark(c: &mut Criterion) {
let f32_image: Vec<f32> = src_bytes.iter().map(|&x| x as f32 / 255f32).collect();

c.bench_function("Pic scale RGB f32: Lanczos 3", |b| {
let mut copied: Vec<f32> = Vec::from(f32_image.clone());
let store = ImageStore::<f32, 3>::from_slice(
&mut copied,
dimensions.0 as usize,
dimensions.1 as usize,
)
.unwrap();
b.iter(|| {
let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
scaler.set_threading_policy(ThreadingPolicy::Single);
let mut copied: Vec<f32> = Vec::from(f32_image.clone());
let store = ImageStore::<f32, 3>::from_slice(
&mut copied,
dimensions.0 as usize,
dimensions.1 as usize,
)
.unwrap();
let mut target =
ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
scaler.resize_rgb_f32(&store, &mut target).unwrap();
})
});

c.bench_function("Fast image resize RGB: Lanczos 3", |b| {
let mut vc = Vec::from(img.as_bytes());
let pixel_type: PixelType = PixelType::U8x3;

let src_image =
Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap();
b.iter(|| {
let mut vc = Vec::from(img.as_bytes());
let pixel_type: PixelType = PixelType::U8x3;
let src_image =
Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap();
let mut dst_image = Image::new(dimensions.0 / 4, dimensions.1 / 4, pixel_type);

let mut resizer = Resizer::new();
Expand Down
6 changes: 3 additions & 3 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ fn main() {
let transient = img.to_rgba8();
let mut bytes = Vec::from(transient.as_bytes());

let mut scaler = LinearScaler::new(ResamplingFunction::Bilinear);
let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
scaler.set_threading_policy(ThreadingPolicy::Single);

// resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear);
Expand All @@ -64,7 +64,7 @@ fn main() {
ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize)
.unwrap();

let dst_size = ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2);
let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
// let mut resized_ar = vec![0u32; dst_size.width * dst_size.height];
let start_time = Instant::now();
// scaler
Expand All @@ -78,7 +78,7 @@ fn main() {
// .unwrap();

let mut dst_store =
ImageStoreMut::<u8, 4>::alloc(dimensions.0 as usize / 2, dimensions.1 as usize / 2);
ImageStoreMut::<u8, 4>::alloc(dimensions.0 as usize / 3, dimensions.1 as usize / 3);

scaler.resize_rgba(&store, &mut dst_store, false).unwrap();

Expand Down
2 changes: 2 additions & 0 deletions src/alpha_check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ pub(crate) fn has_non_constant_cap_alpha_rgba_f32(store: &[f32], width: usize) -
has_non_constant_cap_alpha_f32_impl::<3, 4>(store, width)
}

/// Scans an image to check if alpha is not constant
pub(crate) fn has_non_constant_cap_alpha<
V: Copy + PartialEq + BitXor<V, Output = V> + 'static + AsPrimitive<J> + 'static,
J: Copy + AddAssign + Default + 'static + Eq + Ord,
Expand Down Expand Up @@ -76,6 +77,7 @@ where
row_sums.ne(&zeros)
}

/// Scans an `f32` image to check if alpha is not constant
fn has_non_constant_cap_alpha_f32_impl<const ALPHA_CHANNEL_INDEX: usize, const CHANNELS: usize>(
store: &[f32],
width: usize,
Expand Down
11 changes: 7 additions & 4 deletions src/alpha_handle_u16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,23 @@ use rayon::slice::ParallelSliceMut;
use rayon::ThreadPool;

/// Approximates `v / 1023`, rounded to the nearest integer, using only
/// shifts and additions.
///
/// A bias of `1 << 9` is added first, then the biased value is folded with
/// its own `>> 10` before the final `>> 10`. The result is truncated to `u16`.
#[inline]
pub(crate) fn div_by_1023(v: u32) -> u16 {
    const ROUNDING_BIAS: u32 = 1 << 9;
    let biased = v + ROUNDING_BIAS;
    let folded = (biased >> 10) + biased;
    (folded >> 10) as u16
}

/// Approximates `v / 4095`, rounded to the nearest integer, using only
/// shifts and additions.
///
/// A bias of `1 << 11` is added first, then the biased value is folded with
/// its own `>> 12` before the final `>> 12`. The result is truncated to `u16`.
#[inline]
pub(crate) fn div_by_4095(v: u32) -> u16 {
    const ROUNDING_BIAS: u32 = 1 << 11;
    let biased = v + ROUNDING_BIAS;
    let folded = (biased >> 12) + biased;
    (folded >> 12) as u16
}

#[inline]
/// Divides value by 65535 with rounding to nearest
pub(crate) fn div_by_65535(v: u32) -> u16 {
let round = 1 << 15;
let v_expand = v;
Expand Down Expand Up @@ -174,13 +177,13 @@ pub(crate) fn premultiply_alpha_rgba_u16(
premultiply_alpha_rgba_impl;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if is_x86_feature_detected!("sse4.1") {
if std::is_x86_feature_detected!("sse4.1") {
_dispatcher = premultiply_alpha_sse_rgba_u16;
}
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if is_x86_feature_detected!("avx2") {
if std::is_x86_feature_detected!("avx2") {
_dispatcher = avx_premultiply_alpha_rgba_u16;
}
}
Expand All @@ -203,13 +206,13 @@ pub(crate) fn unpremultiply_alpha_rgba_u16(
unpremultiply_alpha_rgba_impl;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if is_x86_feature_detected!("sse4.1") {
if std::is_x86_feature_detected!("sse4.1") {
_dispatcher = unpremultiply_alpha_sse_rgba_u16;
}
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if is_x86_feature_detected!("avx2") {
if std::is_x86_feature_detected!("avx2") {
_dispatcher = avx_unpremultiply_alpha_rgba_u16;
}
}
Expand Down
1 change: 1 addition & 0 deletions src/alpha_handle_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ use rayon::slice::ParallelSliceMut;
use rayon::ThreadPool;

/// Divides value by 255 with rounding to nearest, saturating the result at 255.
///
/// The arithmetic is carried out in `u32`: adding the `0x80` rounding bias in
/// `u16` overflows for any `v > 65407` (panic in debug builds, wrap-around and
/// a wrong result in release builds). Results for all smaller inputs are
/// identical to the narrow computation.
#[inline]
pub(crate) fn div_by_255(v: u16) -> u8 {
    // Widen before biasing so the sum cannot overflow for any `u16` input.
    let biased = u32::from(v) + 0x80;
    // (x + (x >> 8)) >> 8 approximates x / 255 for the biased value.
    (((biased >> 8) + biased) >> 8).min(255) as u8
}
Expand Down
4 changes: 2 additions & 2 deletions src/ar30.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ impl Rgb30 {
#[inline]
pub(crate) const fn pack_w_a<const STORE: usize>(self, r: i32, g: i32, b: i32, a: i32) -> u32 {
let value: u32 = match self {
Rgb30::Ar30 => (a << 30 | (b << 20) | (g << 10) | r) as u32,
Rgb30::Ra30 => ((r << 22) | (g << 12) | (b << 2) | a) as u32,
Rgb30::Ar30 => (((a << 30) | (b << 20)) | ((g << 10) | r)) as u32,
Rgb30::Ra30 => (((r << 22) | (g << 12)) | ((b << 2) | a)) as u32,
};
if STORE == 0 {
value
Expand Down
6 changes: 4 additions & 2 deletions src/avx2/alpha_f16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ pub(crate) fn avx_premultiply_alpha_rgba_f16(
}

#[target_feature(enable = "avx2", enable = "f16c")]
/// This inlining is required to activate all features for runtime dispatch
unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[half::f16]) {
let mut rem = dst;
let mut src_rem = src;
Expand Down Expand Up @@ -112,8 +113,8 @@ unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[
premultiply_pixel_f16_row(rem, src_rem);
}

#[inline]
#[target_feature(enable = "avx2", enable = "f16c")]
/// This inlining is required to activate all features for runtime dispatch
unsafe fn avx_premultiply_alpha_rgba_f16_impl(
dst: &mut [half::f16],
src: &[half::f16],
Expand Down Expand Up @@ -150,6 +151,7 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_f16(
}

#[target_feature(enable = "avx2", enable = "f16c")]
/// This inlining is required to activate all features for runtime dispatch
unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) {
let mut rem = in_place;

Expand Down Expand Up @@ -234,8 +236,8 @@ unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16])
unpremultiply_pixel_f16_row(rem);
}

#[inline]
#[target_feature(enable = "avx2", enable = "f16c")]
/// This inlining is required to activate all features for runtime dispatch
unsafe fn avx_unpremultiply_alpha_rgba_f16_impl(
in_place: &mut [half::f16],
width: usize,
Expand Down
1 change: 0 additions & 1 deletion src/avx2/alpha_f32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ unsafe fn avx_unpremultiply_alpha_rgba_f32_row_impl(in_place: &mut [f32]) {
unpremultiply_pixel_f32_row(rem);
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn avx_unpremultiply_alpha_rgba_f32_impl(
in_place: &mut [f32],
Expand Down
3 changes: 2 additions & 1 deletion src/avx2/alpha_u16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ pub(crate) fn avx_premultiply_alpha_rgba_u16(
}

#[target_feature(enable = "avx2")]
/// This inlining is required to activate all features for runtime dispatch
unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_depth: usize) {
let max_colors = (1 << bit_depth) - 1;

Expand Down Expand Up @@ -356,6 +357,7 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_u16(
}

#[target_feature(enable = "avx2")]
/// This inlining is required to activate all features for runtime dispatch
unsafe fn avx_unpremultiply_alpha_rgba_u16_row(in_place: &mut [u16], bit_depth: usize) {
let max_colors = (1 << bit_depth) - 1;

Expand Down Expand Up @@ -408,7 +410,6 @@ unsafe fn avx_unpremultiply_alpha_rgba_u16_row(in_place: &mut [u16], bit_depth:
unpremultiply_alpha_rgba_row(rem, max_colors);
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn avx_unpremultiply_alpha_rgba_u16_impl(
in_place: &mut [u16],
Expand Down
2 changes: 0 additions & 2 deletions src/avx2/alpha_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,6 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) {
premultiply_alpha_rgba_row_impl(rem, src_rem);
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn avx_premultiply_alpha_rgba_impl(
dst: &mut [u8],
Expand Down Expand Up @@ -301,7 +300,6 @@ unsafe fn avx_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) {
unpremultiply_alpha_rgba_row_impl(rem);
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn avx_unpremultiply_alpha_rgba_impl(
in_place: &mut [u8],
Expand Down
4 changes: 4 additions & 0 deletions src/avx2/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ mod alpha_u8;
#[cfg(feature = "half")]
mod rgba_f16;
mod rgba_f32;
mod rgba_u8_lb;
pub(crate) mod utils;
#[cfg(feature = "half")]
mod vertical_f16;
Expand All @@ -56,6 +57,9 @@ pub(crate) use rgba_f16::{
pub(crate) use rgba_f32::{
convolve_horizontal_rgba_avx_row_one_f32, convolve_horizontal_rgba_avx_rows_4_f32,
};
pub(crate) use rgba_u8_lb::{
convolve_horizontal_rgba_avx_rows_4_lb, convolve_horizontal_rgba_avx_rows_one_lb,
};
#[cfg(feature = "half")]
pub(crate) use vertical_f16::convolve_vertical_avx_row_f16;
pub(crate) use vertical_f32::convolve_vertical_avx_row_f32;
Expand Down
12 changes: 6 additions & 6 deletions src/avx2/rgba_f16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16<const FMA: bool>(
}
}

#[inline]
#[target_feature(enable = "avx2,f16c,fma")]
#[target_feature(enable = "avx2", enable = "f16c", enable = "fma")]
/// This inlining is required to activate all features for runtime dispatch
unsafe fn convolve_horizontal_rgba_avx_row_one_f16_fma(
dst_width: usize,
src_width: usize,
Expand All @@ -162,8 +162,8 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_fma(
);
}

#[inline]
#[target_feature(enable = "avx2,f16c")]
#[target_feature(enable = "avx2", enable = "f16c")]
/// This inlining is required to activate all features for runtime dispatch
unsafe fn convolve_horizontal_rgba_avx_row_one_f16_regular(
dst_width: usize,
src_width: usize,
Expand Down Expand Up @@ -301,8 +301,8 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f16<const FMA: bool>(
}
}

#[inline]
#[target_feature(enable = "avx2", enable = "f16c")]
/// This inlining is required to activate all features for runtime dispatch
unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_regular(
dst_width: usize,
src_width: usize,
Expand All @@ -323,8 +323,8 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_regular(
);
}

#[inline]
#[target_feature(enable = "avx2", enable = "f16c", enable = "fma")]
/// This inlining is required to activate all features for runtime dispatch
unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_fma(
dst_width: usize,
src_width: usize,
Expand Down
Loading

0 comments on commit ae8436e

Please sign in to comment.