Skip to content

Commit

Permalink
Merge pull request #26 from awxkee/dev
Browse files Browse the repository at this point in the history
Stride f32, masked loads AVX-512, Vertical AVX-512 u16
  • Loading branch information
awxkee authored Jan 5, 2025
2 parents 6c172f0 + 918c682 commit 3de50d1
Show file tree
Hide file tree
Showing 16 changed files with 766 additions and 267 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ Although all implementations are fast, not all the paths are implemented using SIM
| RGB (8 bit) | x | x | ~ | ~ | ~ |
| Plane (8 bit) | x | x | ~ | ~ | ~ |
| RGBA (8+ bit) | x | x | ~ | x(avxvnni) | - |
| RGB (8+ bit) | x | x | ~ | - | - |
| Plane (8+ bit) | ~ | ~ | ~ | - | - |
| RGB (8+ bit) | x | x | ~ | ~ | - |
| Plane (8+ bit) | ~ | ~ | ~ | ~ | - |
| RGBA (f32) | x | x | x | - | - |
| RGB (f32) | x | x | ~ | - | - |
| Plane (f32) | x | x | ~ | - | - |
Expand Down
29 changes: 20 additions & 9 deletions src/alpha_handle_f16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,24 +82,26 @@ pub(crate) fn premultiply_pixel_f16_row(dst: &mut [half::f16], src: &[half::f16]

fn premultiply_alpha_rgba_impl_f16(
dst: &mut [half::f16],
dst_stride: usize,
src: &[half::f16],
src_stride: usize,
width: usize,
_: usize,
pool: &Option<ThreadPool>,
) {
if let Some(pool) = pool {
pool.install(|| {
dst.par_chunks_exact_mut(width * 4)
.zip(src.par_chunks_exact(width * 4))
dst.par_chunks_exact_mut(dst_stride)
.zip(src.par_chunks_exact(src_stride))
.for_each(|(dst, src)| {
premultiply_pixel_f16_row(dst, src);
premultiply_pixel_f16_row(&mut dst[..width * 4], &src[..width * 4]);
});
});
} else {
dst.chunks_exact_mut(width * 4)
.zip(src.chunks_exact(width * 4))
dst.chunks_exact_mut(dst_stride)
.zip(src.chunks_exact(src_stride))
.for_each(|(dst, src)| {
premultiply_pixel_f16_row(dst, src);
premultiply_pixel_f16_row(&mut dst[..width * 4], &src[..width * 4]);
});
}
}
Expand All @@ -126,13 +128,22 @@ fn unpremultiply_alpha_rgba_impl_f16(

pub(crate) fn premultiply_alpha_rgba_f16(
dst: &mut [half::f16],
dst_stride: usize,
src: &[half::f16],
src_stride: usize,
width: usize,
height: usize,
pool: &Option<ThreadPool>,
) {
let mut _dispatcher: fn(&mut [half::f16], &[half::f16], usize, usize, &Option<ThreadPool>) =
premultiply_alpha_rgba_impl_f16;
let mut _dispatcher: fn(
&mut [half::f16],
usize,
&[half::f16],
usize,
usize,
usize,
&Option<ThreadPool>,
) = premultiply_alpha_rgba_impl_f16;
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
{
_dispatcher = neon_premultiply_alpha_rgba_f16;
Expand All @@ -152,7 +163,7 @@ pub(crate) fn premultiply_alpha_rgba_f16(
_dispatcher = avx_premultiply_alpha_rgba_f16;
}
}
_dispatcher(dst, src, width, height, pool);
_dispatcher(dst, dst_stride, src, src_stride, width, height, pool);
}

pub(crate) fn unpremultiply_alpha_rgba_f16(
Expand Down
20 changes: 12 additions & 8 deletions src/alpha_handle_f32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,24 +78,26 @@ pub(crate) fn premultiply_pixel_f32_row(dst: &mut [f32], src: &[f32]) {

fn premultiply_alpha_rgba_impl_f32(
dst: &mut [f32],
dst_stride: usize,
src: &[f32],
src_stride: usize,
width: usize,
_: usize,
pool: &Option<ThreadPool>,
) {
if let Some(pool) = pool {
pool.install(|| {
dst.par_chunks_exact_mut(width * 4)
.zip(src.par_chunks_exact(width * 4))
dst.par_chunks_exact_mut(dst_stride)
.zip(src.par_chunks_exact(src_stride))
.for_each(|(dst, src)| {
premultiply_pixel_f32_row(dst, src);
premultiply_pixel_f32_row(&mut dst[..width * 4], &src[..width * 4]);
});
});
} else {
dst.chunks_exact_mut(width * 4)
.zip(src.chunks_exact(width * 4))
dst.chunks_exact_mut(dst_stride)
.zip(src.chunks_exact(src_stride))
.for_each(|(dst, src)| {
premultiply_pixel_f32_row(dst, src);
premultiply_pixel_f32_row(&mut dst[..width * 4], &src[..width * 4]);
});
}
}
Expand All @@ -122,12 +124,14 @@ fn unpremultiply_alpha_rgba_impl_f32(

pub(crate) fn premultiply_alpha_rgba_f32(
dst: &mut [f32],
dst_stride: usize,
src: &[f32],
src_stride: usize,
width: usize,
height: usize,
pool: &Option<ThreadPool>,
) {
let mut _dispatcher: fn(&mut [f32], &[f32], usize, usize, &Option<ThreadPool>) =
let mut _dispatcher: fn(&mut [f32], usize, &[f32], usize, usize, usize, &Option<ThreadPool>) =
premultiply_alpha_rgba_impl_f32;
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
{
Expand All @@ -145,7 +149,7 @@ pub(crate) fn premultiply_alpha_rgba_f32(
_dispatcher = avx_premultiply_alpha_rgba_f32;
}
}
_dispatcher(dst, src, width, height, pool);
_dispatcher(dst, dst_stride, src, src_stride, width, height, pool);
}

pub(crate) fn unpremultiply_alpha_rgba_f32(
Expand Down
21 changes: 14 additions & 7 deletions src/avx2/alpha_f16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,15 @@ use std::arch::x86_64::*;

pub(crate) fn avx_premultiply_alpha_rgba_f16(
dst: &mut [half::f16],
dst_stride: usize,
src: &[half::f16],
src_stride: usize,
width: usize,
height: usize,
pool: &Option<ThreadPool>,
) {
unsafe {
avx_premultiply_alpha_rgba_f16_impl(dst, src, width, height, pool);
avx_premultiply_alpha_rgba_f16_impl(dst, dst_stride, src, src_stride, width, height, pool);
}
}

Expand Down Expand Up @@ -115,24 +117,29 @@ unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[
/// This inlining is required to activate all features for runtime dispatch
unsafe fn avx_premultiply_alpha_rgba_f16_impl(
dst: &mut [half::f16],
dst_stride: usize,
src: &[half::f16],
src_stride: usize,
width: usize,
_: usize,
pool: &Option<ThreadPool>,
) {
if let Some(pool) = pool {
pool.install(|| {
dst.par_chunks_exact_mut(width * 4)
.zip(src.par_chunks_exact(width * 4))
dst.par_chunks_exact_mut(dst_stride)
.zip(src.par_chunks_exact(src_stride))
.for_each(|(dst, src)| unsafe {
avx_premultiply_alpha_rgba_f16_row_impl(dst, src);
avx_premultiply_alpha_rgba_f16_row_impl(
&mut dst[..width * 4],
&src[..width * 4],
);
});
});
} else {
dst.chunks_exact_mut(width * 4)
.zip(src.chunks_exact(width * 4))
dst.chunks_exact_mut(dst_stride)
.zip(src.chunks_exact(src_stride))
.for_each(|(dst, src)| unsafe {
avx_premultiply_alpha_rgba_f16_row_impl(dst, src);
avx_premultiply_alpha_rgba_f16_row_impl(&mut dst[..width * 4], &src[..width * 4]);
});
}
}
Expand Down
21 changes: 14 additions & 7 deletions src/avx2/alpha_f32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,15 @@ unsafe fn avx_unpremultiply_alpha_rgba_f32_impl(

pub(crate) fn avx_premultiply_alpha_rgba_f32(
dst: &mut [f32],
dst_stride: usize,
src: &[f32],
src_stride: usize,
width: usize,
height: usize,
pool: &Option<ThreadPool>,
) {
unsafe {
avx_premultiply_alpha_rgba_f32_impl(dst, src, width, height, pool);
avx_premultiply_alpha_rgba_f32_impl(dst, dst_stride, src, src_stride, width, height, pool);
}
}

Expand Down Expand Up @@ -163,24 +165,29 @@ unsafe fn avx_premultiply_alpha_rgba_f32_row_impl(dst: &mut [f32], src: &[f32])
#[target_feature(enable = "avx2")]
unsafe fn avx_premultiply_alpha_rgba_f32_impl(
dst: &mut [f32],
dst_stride: usize,
src: &[f32],
src_stride: usize,
width: usize,
_: usize,
pool: &Option<ThreadPool>,
) {
if let Some(pool) = pool {
pool.install(|| {
dst.par_chunks_exact_mut(width * 4)
.zip(src.par_chunks_exact(width * 4))
dst.par_chunks_exact_mut(dst_stride)
.zip(src.par_chunks_exact(src_stride))
.for_each(|(dst, src)| unsafe {
avx_premultiply_alpha_rgba_f32_row_impl(dst, src);
avx_premultiply_alpha_rgba_f32_row_impl(
&mut dst[..width * 4],
&src[..width * 4],
);
});
});
} else {
dst.chunks_exact_mut(width * 4)
.zip(src.chunks_exact(width * 4))
dst.chunks_exact_mut(dst_stride)
.zip(src.chunks_exact(src_stride))
.for_each(|(dst, src)| unsafe {
avx_premultiply_alpha_rgba_f32_row_impl(dst, src);
avx_premultiply_alpha_rgba_f32_row_impl(&mut dst[..width * 4], &src[..width * 4]);
});
}
}
Loading

0 comments on commit 3de50d1

Please sign in to comment.