Skip to content

Commit

Permalink
Merge pull request #49 from awxkee/dev
Browse files Browse the repository at this point in the history
AR30/RA30 x86
  • Loading branch information
awxkee authored Feb 2, 2025
2 parents 8d7100d + 0025a32 commit 6c0c783
Show file tree
Hide file tree
Showing 14 changed files with 858 additions and 228 deletions.
20 changes: 20 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ Despite all implementation are fast, not all the paths are implemented using SIM
| RGBA (f16) | x | x | x | - | - |
| RGB (f16) | x | ~ | ~ | - | - |
| Plane (f16) | ~ | ~ | ~ | - | - |
| AR30/RA30 | x | - | - | - | - |
| AR30/RA30 | x | x | - | - | - |

#### Features

Features:
- To enable support of `f16` the feature `half` should be activated.
- To enable support of `f16`, the feature `nightly_f16` should be activated; a `nightly` compiler is required.
- `nightly_avx512` activates the AVX-512 feature set and requires the `nightly` compiler channel.

#### Target features with runtime dispatch
Expand Down
1 change: 1 addition & 0 deletions app/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ image = { version = "0.25.5", features = ["default"] }
pic-scale = { path = "..", features = ["nightly_f16"], default-features = false }
fast_image_resize = { version = "5.0.0", features = [] }
accelerate = {path = "accelerate/"}
yuvutils-rs = "0.6.0"
libc = "0.2.169"

[dev-dependencies]
Expand Down
190 changes: 86 additions & 104 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use pic_scale::{
RgbaF16ImageStore, RgbaF16ImageStoreMut, Scaler, Scaling, ScalingU16, ThreadingPolicy,
WorkloadStrategy,
};
use yuvutils_rs::{ar30_to_rgb8, rgb8_to_ar30, rgba8_to_ar30, Rgb30ByteOrder};

fn resize_plane(
src_width: usize,
Expand Down Expand Up @@ -59,43 +60,75 @@ fn main() {
scaler.set_threading_policy(ThreadingPolicy::Single);
scaler.set_workload_strategy(WorkloadStrategy::PreferQuality);

// resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear);

let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect();

// let rgb_feature16 = transient
// .iter()
// .map(|&x| (x as f32 / 255f32) as f16)
// .collect::<Vec<_>>();

// let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect();
//
let mut store =
Rgba16ImageStore::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize).unwrap();
store.bit_depth = 10;

// let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
// // let mut resized_ar = vec![0u32; dst_size.width * dst_size.height];
// // scaler
// // .resize_ra30(
// // &ar30_src,
// // ImageSize::new(dimensions.0 as usize, dimensions.1 as usize),
// // &mut resized_ar,
// // dst_size,
// // Ar30ByteOrder::Host,
// // )
// // .unwrap();
// // let rgb_feature16 = transient
// // .iter()
// // .map(|&x| (x as f32 / 255f32) as f16)
// // .collect::<Vec<_>>();
//
let mut dst_store = Rgba16ImageStoreMut::alloc_with_depth(
dimensions.0 as usize / 2,
dimensions.1 as usize / 2,
10,
);
// //
// let mut store =
// Rgba16ImageStore::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize).unwrap();
// store.bit_depth = 10;
//
// // for i in 0..25 {
// let start_time = Instant::now();
let mut src_ar = vec![0u8; dimensions.0 as usize * dimensions.1 as usize * 4];
rgba8_to_ar30(
&mut src_ar,
dimensions.0 * 4,
Rgb30ByteOrder::Host,
&transient,
dimensions.0 as u32 * 4,
dimensions.0,
dimensions.1,
)
.unwrap();
let dst_size = ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2);
let mut resized_ar = vec![0u8; dst_size.width * dst_size.height * 4];
scaler
.resize_rgba_u16(&store, &mut dst_store, false)
.resize_ar30(
&src_ar,
dimensions.0 as usize * 4,
ImageSize::new(dimensions.0 as usize, dimensions.1 as usize),
&mut resized_ar,
dst_size.width * 4,
dst_size,
Ar30ByteOrder::Host,
)
.unwrap();

let mut dest_rgb = vec![0u8; dst_size.width * dst_size.height * 3];
ar30_to_rgb8(
&resized_ar,
dst_size.width as u32 * 4,
Rgb30ByteOrder::Host,
&mut dest_rgb,
dst_size.width as u32 * 3,
dst_size.width as u32,
dst_size.height as u32,
)
.unwrap();

image::save_buffer(
"converted.png",
&dest_rgb,
dst_size.width as u32,
dst_size.height as u32,
image::ColorType::Rgb8,
)
.unwrap();

// let mut dst_store = Rgba16ImageStoreMut::alloc_with_depth(
// dimensions.0 as usize / 2,
// dimensions.1 as usize / 2,
// 10,
// );
// //
// // // for i in 0..25 {
// // let start_time = Instant::now();
// scaler
// .resize_rgba_u16(&store, &mut dst_store, false)
// .unwrap();
//
// let elapsed_time = start_time.elapsed();
// // Print the elapsed time in milliseconds
Expand Down Expand Up @@ -137,81 +170,30 @@ fn main() {
// // .map(|&x| (x * 255f32) as u8)
// // .collect();
//
let dst: Vec<u8> = dst_store
.as_bytes()
.iter()
.map(|&x| (x >> 2) as u8)
.collect();

// let dst = dst_store
// let dst: Vec<u8> = dst_store
// .as_bytes()
// .iter()
// .map(|&x| (x as f32 * 255.).round() as u8)
// .collect::<Vec<_>>();
// let dst = resized;
// image::save_buffer(
// "converted.png",
// &dst,
// dst_size.width as u32,
// dst_size.height as u32,
// image::ColorType::Rgba8,
// )
// .unwrap();
// .map(|&x| (x >> 2) as u8)
// .collect();

if dst_store.channels == 4 {
image::save_buffer(
"converted.png",
&dst,
dst_store.width as u32,
dst_store.height as u32,
image::ColorType::Rgba8,
)
.unwrap();
} else {
image::save_buffer(
"converted.png",
&dst,
dst_store.width as u32,
dst_store.height as u32,
image::ColorType::Rgb8,
)
.unwrap();
}

// for i in 0..37 {
// let mut scaler = Scaler::new(i.into());
// scaler.set_threading_policy(ThreadingPolicy::Adaptive);
// let store =
// ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize);
// let resized = scaler.resize_rgba(
// ImageSize::new(dimensions.0 as usize / 3, dimensions.1 as usize / 3),
// store,
// true,
// );
//
// let elapsed_time = start_time.elapsed();
// // Print the elapsed time in milliseconds
// println!("Scaler: {:.2?}", elapsed_time);
//
// if resized.channels == 4 {
// image::save_buffer(
// format!("converted_{}.png", i),
// resized.as_bytes(),
// resized.width as u32,
// resized.height as u32,
// image::ExtendedColorType::Rgba8,
// )
// .unwrap();
// } else {
// image::save_buffer(
// format!("converted_{}.jpg", i),
// resized.as_bytes(),
// resized.width as u32,
// resized.height as u32,
// image::ExtendedColorType::Rgb8,
// )
// .unwrap();
// }
// if dst_store.channels == 4 {
// image::save_buffer(
// "converted.png",
// &dst,
// dst_store.width as u32,
// dst_store.height as u32,
// image::ColorType::Rgba8,
// )
// .unwrap();
// } else {
// image::save_buffer(
// "converted.png",
// &dst,
// dst_store.width as u32,
// dst_store.height as u32,
// image::ColorType::Rgb8,
// )
// .unwrap();
// }
}

Expand Down
6 changes: 4 additions & 2 deletions src/avx2/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ mod rgb_u8;
#[cfg(feature = "nightly_f16")]
mod rgba_f16;
mod rgba_f32;
mod rgba_u16_lb;
mod rgba_u8_lb;
pub(crate) mod utils;
#[cfg(feature = "nightly_f16")]
Expand All @@ -45,7 +46,6 @@ mod vertical_f32;
mod vertical_u16_lb;
mod vertical_u8;
mod vertical_u8_lp;
mod rgba_u16_lb;

#[cfg(feature = "nightly_f16")]
pub(crate) use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16};
Expand All @@ -65,6 +65,9 @@ pub(crate) use rgba_f16::{
pub(crate) use rgba_f32::{
convolve_horizontal_rgba_avx_row_one_f32, convolve_horizontal_rgba_avx_rows_4_f32,
};
pub(crate) use rgba_u16_lb::{
convolve_horizontal_rgba_avx_rows_4_u16, convolve_horizontal_rgba_avx_u16lp_row,
};
pub(crate) use rgba_u8_lb::{
convolve_horizontal_rgba_avx_rows_4_lb, convolve_horizontal_rgba_avx_rows_one_lb,
};
Expand All @@ -74,4 +77,3 @@ pub(crate) use vertical_f32::convolve_vertical_avx_row_f32;
pub(crate) use vertical_u16_lb::convolve_column_lb_avx2_u16;
pub(crate) use vertical_u8::convolve_vertical_avx_row;
pub(crate) use vertical_u8_lp::convolve_vertical_avx_row_lp;
pub(crate) use rgba_u16_lb::{convolve_horizontal_rgba_avx_rows_4_u16, convolve_horizontal_rgba_avx_u16lp_row};
33 changes: 31 additions & 2 deletions src/dispatch_group_ar30.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,14 @@ pub(crate) fn convolve_horizontal_dispatch_ar30<const AR30_TYPE: usize, const AR
}
}
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if std::arch::is_x86_feature_detected!("sse4.1") {
use crate::sse::sse_convolve_horizontal_rgba_rows_4_ar30;
_dispatch =
sse_convolve_horizontal_rgba_rows_4_ar30::<AR30_TYPE, AR30_ORDER>;
}
}
_dispatch(src, src_stride, dst, dst_stride, &approx);
});

Expand All @@ -105,6 +113,14 @@ pub(crate) fn convolve_horizontal_dispatch_ar30<const AR30_TYPE: usize, const AR
_dispatch =
neon_convolve_horizontal_rgba_rows_ar30::<AR30_TYPE, AR30_ORDER>;
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if std::arch::is_x86_feature_detected!("sse4.1") {
use crate::sse::sse_convolve_horizontal_rgba_rows_ar30;
_dispatch =
sse_convolve_horizontal_rgba_rows_ar30::<AR30_TYPE, AR30_ORDER>;
}
}
_dispatch(src, dst, &approx);
});
});
Expand Down Expand Up @@ -142,6 +158,14 @@ pub(crate) fn convolve_horizontal_dispatch_ar30<const AR30_TYPE: usize, const AR
}
}
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if std::arch::is_x86_feature_detected!("sse4.1") {
use crate::sse::sse_convolve_horizontal_rgba_rows_4_ar30;
_dispatch =
sse_convolve_horizontal_rgba_rows_4_ar30::<AR30_TYPE, AR30_ORDER>;
}
}
_dispatch(src, src_stride, dst, dst_stride, &approx);
});

Expand All @@ -159,6 +183,13 @@ pub(crate) fn convolve_horizontal_dispatch_ar30<const AR30_TYPE: usize, const AR
use crate::neon::neon_convolve_horizontal_rgba_rows_ar30;
_dispatch = neon_convolve_horizontal_rgba_rows_ar30::<AR30_TYPE, AR30_ORDER>;
}
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if std::arch::is_x86_feature_detected!("sse4.1") {
use crate::sse::sse_convolve_horizontal_rgba_rows_ar30;
_dispatch = sse_convolve_horizontal_rgba_rows_ar30::<AR30_TYPE, AR30_ORDER>;
}
}
_dispatch(src, dst, &approx);
});
}
Expand Down Expand Up @@ -231,7 +262,6 @@ pub(crate) fn convolve_vertical_dispatch_ar30<const AR30_TYPE: usize, const AR30
}

let row = &mut row[0..4 * width];

_dispatch(&bounds, src, row, src_stride, weights);
});
});
Expand Down Expand Up @@ -285,7 +315,6 @@ pub(crate) fn convolve_vertical_dispatch_ar30<const AR30_TYPE: usize, const AR30
}

let row = &mut row[0..4 * width];

_dispatch(&bounds, src, row, src_stride, weights);
});
}
Expand Down
Loading

0 comments on commit 6c0c783

Please sign in to comment.