Skip to content

Commit

Permalink
Thread improvements, a lot of reworking
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Jun 4, 2024
1 parent adc785d commit 90d6e5b
Show file tree
Hide file tree
Showing 14 changed files with 233 additions and 130 deletions.
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,41 @@ Example comparison time for downscale RGB 4928x3279 image in two times for x86_6
| pic-scale | 26.13 |
| fir sse | 26.84 |

M3 Pro. NEON

| | Lanczos3 |
|-----------|:--------:|
| pic-scale | 23.04 |
| fir sse | 37.00 |

Example comparison of the time taken to downscale an RGBA 4928x3279 image by a factor of two on x86_64 with SSE, with alpha premultiplication.

| | Lanczos3 |
|-----------|:--------:|
| pic-scale | 42.35 |
| fir sse | 42.96 |

M3 Pro. NEON

| | Lanczos3 |
|-----------|:--------:|
| pic-scale | 47.45 |
| fir sse | 53.81 |

Example comparison of the time taken to downscale an RGBA 4928x3279 image by a factor of two on x86_64 with SSE, without alpha premultiplication.

| | Lanczos3 |
|-----------|:--------:|
| pic-scale | 26.92 |
| fir sse | 38.30 |

M3 Pro. NEON

| | Lanczos3 |
|-----------|:--------:|
| pic-scale | 38.75 |
| fir sse | 45.79 |

#### Example integration with `image` crate

```rust
Expand Down
26 changes: 16 additions & 10 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@ use fast_image_resize::{
use image::io::Reader as ImageReader;
use image::{EncodableLayout, GenericImageView};

use pic_scale::{ImageSize, ImageStore, LabScaler, LinearScaler, LuvScaler, ResamplingFunction, Scaler, Scaling, ThreadingPolicy};
use pic_scale::{
ImageSize, ImageStore, LabScaler, LinearScaler, LuvScaler, ResamplingFunction, Scaler, Scaling,
ThreadingPolicy,
};

fn main() {
test_fast_image();
// test_fast_image();

let img = ImageReader::open("./assets/nasa-4928x3279.png")
let img = ImageReader::open("./assets/asset_5.png")
.unwrap()
.decode()
.unwrap();
Expand All @@ -22,13 +25,14 @@ fn main() {

let start_time = Instant::now();

let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
scaler.set_threading_policy(ThreadingPolicy::Single);
let mut scaler = LabScaler::new(ResamplingFunction::Lanczos3);
scaler.set_threading_policy(ThreadingPolicy::Adaptive);
let store =
ImageStore::<u8, 3>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize);
let resized = scaler.resize_rgb(
ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize);
let resized = scaler.resize_rgba(
ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
store,
false,
);

let elapsed_time = start_time.elapsed();
Expand Down Expand Up @@ -57,7 +61,7 @@ fn main() {
}

fn test_fast_image() {
let img = ImageReader::open("./assets/nasa-4928x3279.png")
let img = ImageReader::open("./assets/asset_5.png")
.unwrap()
.decode()
.unwrap();
Expand All @@ -67,7 +71,7 @@ fn test_fast_image() {

let start_time = Instant::now();

let pixel_type: PixelType = PixelType::U8x3;
let pixel_type: PixelType = PixelType::U8x4;

let src_image = Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap();

Expand All @@ -86,7 +90,9 @@ fn test_fast_image() {
.resize(
&src_image,
&mut dst_image,
&ResizeOptions::new().resize_alg(ResizeAlg::Convolution(Lanczos3)).use_alpha(true),
&ResizeOptions::new()
.resize_alg(ResizeAlg::Convolution(Lanczos3))
.use_alpha(false),
)
.unwrap();

Expand Down
10 changes: 8 additions & 2 deletions src/alpha_handle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,11 @@ pub unsafe fn neon_umpremultiply_alpha(v: uint8x16_t, a_values: uint8x16_t) -> u
let hi_hi = vcvtaq_u32_f32(vmulq_f32(hi_hi, a_hi_ho));
let lo = vcombine_u16(vmovn_u32(lo_lo), vmovn_u32(lo_hi));
let hi = vcombine_u16(vmovn_u32(hi_lo), vmovn_u32(hi_hi));
vbslq_u8(zero_mask, vdupq_n_u8(0), vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)))
vbslq_u8(
zero_mask,
vdupq_n_u8(0),
vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)),
)
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Expand Down Expand Up @@ -189,7 +193,9 @@ pub fn premultiply_alpha_rgba(dst: &mut [u8], src: &[u8], width: usize, height:
let px = _cx * 4;
let src_ptr = src.as_ptr().add(offset + px);
let mut pixel = vld4q_u8(src_ptr);

pixel.0 = neon_premultiply_alpha(pixel.0, pixel.3);
pixel.1 = neon_premultiply_alpha(pixel.1, pixel.3);
pixel.2 = neon_premultiply_alpha(pixel.2, pixel.3);
let dst_ptr = dst.as_mut_ptr().add(offset + px);
vst4q_u8(dst_ptr, pixel);
_cx += 16;
Expand Down
5 changes: 3 additions & 2 deletions src/convolve_u8.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::filter_weights::FilterBounds;
use crate::support::{PRECISION, ROUNDING_APPROX};

#[inline(always)]
#[allow(unused)]
Expand All @@ -11,7 +12,7 @@ pub(crate) unsafe fn convolve_vertical_part<const PART: usize, const CHANNELS: u
filter: *const i16,
bounds: &FilterBounds,
) {
let mut store: [[i32; CHANNELS]; PART] = [[0; CHANNELS]; PART];
let mut store: [[i32; CHANNELS]; PART] = [[ROUNDING_APPROX; CHANNELS]; PART];

for j in 0..bounds.size {
let py = start_y + j;
Expand All @@ -33,7 +34,7 @@ pub(crate) unsafe fn convolve_vertical_part<const PART: usize, const CHANNELS: u
let dst_ptr = dst.add(px);
for c in 0..CHANNELS {
let vl = *(*store.get_unchecked_mut(x)).get_unchecked_mut(c);
let ck = vl >> 12;
let ck = vl >> PRECISION;
*dst_ptr.add(c) = ck.max(0).min(255) as u8;
}
}
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ mod luv_scaler;
mod sse_rgb_f32;
mod alpha_handle;
mod sse_utils;
mod support;

pub use image_size::ImageSize;
pub use image_store::ImageStore;
Expand Down
12 changes: 7 additions & 5 deletions src/neon_rgb_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ pub mod neon_rgb {
use crate::filter_weights::{FilterBounds, FilterWeights};
use crate::neon_simd_u8::neon_convolve_u8;
use std::arch::aarch64::*;
use crate::support::ROUNDING_APPROX;

pub unsafe fn convolve_horizontal_rgb_neon_rows_4(
dst_width: usize,
Expand All @@ -23,13 +24,14 @@ pub mod neon_rgb {
let weights_ptr = approx_weights.weights.as_ptr();
const CHANNELS: usize = 3;
let zeros = vdupq_n_s32(0i32);
let init = vdupq_n_s32(ROUNDING_APPROX);
for x in 0..dst_width {
let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
let mut jx = 0usize;
let mut store_0 = zeros;
let mut store_1 = zeros;
let mut store_2 = zeros;
let mut store_3 = zeros;
let mut store_0 = init;
let mut store_1 = init;
let mut store_2 = init;
let mut store_3 = init;

while jx + 4 < bounds.size && x + 6 < src_width {
let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
Expand Down Expand Up @@ -235,7 +237,7 @@ pub mod neon_rgb {
for x in 0..dst_width {
let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
let mut jx = 0usize;
let mut store = zeros;
let mut store = vdupq_n_s32(ROUNDING_APPROX);

while jx + 4 < bounds.size && x + 6 < src_width {
let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
Expand Down
32 changes: 18 additions & 14 deletions src/neon_simd_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
pub mod neon_convolve_u8 {
use crate::filter_weights::FilterBounds;
use std::arch::aarch64::*;
use crate::support::ROUNDING_APPROX;

#[inline(always)]
pub(crate) unsafe fn convolve_horizontal_parts_one_rgba(
Expand Down Expand Up @@ -102,14 +103,15 @@ pub mod neon_convolve_u8 {
filter: *const i16,
bounds: &FilterBounds,
) {
let mut store_0 = vdupq_n_s32(0i32);
let mut store_1 = vdupq_n_s32(0i32);
let mut store_2 = vdupq_n_s32(0i32);
let mut store_3 = vdupq_n_s32(0i32);
let mut store_4 = vdupq_n_s32(0i32);
let mut store_5 = vdupq_n_s32(0i32);
let mut store_6 = vdupq_n_s32(0i32);
let mut store_7 = vdupq_n_s32(0i32);
let vld = vdupq_n_s32(ROUNDING_APPROX);
let mut store_0 = vld;
let mut store_1 = vld;
let mut store_2 = vld;
let mut store_3 = vld;
let mut store_4 = vld;
let mut store_5 = vld;
let mut store_6 = vld;
let mut store_7 = vld;

let px = start_x;

Expand Down Expand Up @@ -178,10 +180,11 @@ pub mod neon_convolve_u8 {
filter: *const i16,
bounds: &FilterBounds,
) {
let mut store_0 = vdupq_n_s32(0i32);
let mut store_1 = vdupq_n_s32(0i32);
let mut store_2 = vdupq_n_s32(0i32);
let mut store_3 = vdupq_n_s32(0i32);
let vld = vdupq_n_s32(ROUNDING_APPROX);
let mut store_0 = vld;
let mut store_1 = vld;
let mut store_2 = vld;
let mut store_3 = vld;

let px = start_x;

Expand Down Expand Up @@ -230,8 +233,9 @@ pub mod neon_convolve_u8 {
bounds: &FilterBounds,
blend_length: usize,
) {
let mut store_0 = vdupq_n_s32(0i32);
let mut store_1 = vdupq_n_s32(0i32);
let vld = vdupq_n_s32(ROUNDING_APPROX);
let mut store_0 = vld;
let mut store_1 = vld;

let px = start_x;

Expand Down
7 changes: 4 additions & 3 deletions src/rgb_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use crate::image_store::ImageStore;
use crate::neon_rgb_u8::neon_rgb::*;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use crate::sse_rgb_u8::sse_rgb::*;
use crate::support::ROUNDING_APPROX;
use crate::unsafe_slice::UnsafeSlice;

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
Expand Down Expand Up @@ -221,9 +222,9 @@ fn convolve_horizontal_rgb_native_row(
let mut filter_offset = 0usize;
let weights_ptr = filter_weights.weights.as_ptr();
for x in 0..dst_width {
let mut sum_r = 0i32;
let mut sum_g = 0i32;
let mut sum_b = 0i32;
let mut sum_r = ROUNDING_APPROX;
let mut sum_g = ROUNDING_APPROX;
let mut sum_b = ROUNDING_APPROX;

let bounds = unsafe { filter_weights.bounds.get_unchecked(x) };
let start_x = bounds.start;
Expand Down
Loading

0 comments on commit 90d6e5b

Please sign in to comment.