Skip to content

Commit

Permalink
Merge pull request #33 from awxkee/dev
Browse files Browse the repository at this point in the history
Refactor and improvements
  • Loading branch information
awxkee authored Jan 8, 2025
2 parents e6c7e64 + cbed209 commit 9e3da32
Show file tree
Hide file tree
Showing 11 changed files with 159 additions and 351 deletions.
28 changes: 13 additions & 15 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ fn resize_plane(

fn main() {
// test_fast_image();
let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png")
let img = ImageReader::open("./assets/asset_4.png")
.unwrap()
.decode()
.unwrap();
Expand All @@ -56,11 +56,11 @@ fn main() {

// resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear);

let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect();
// let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect();

//
let store =
ImageStore::<u16, 4>::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize)
ImageStore::<u8, 4>::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize)
.unwrap();

let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
Expand All @@ -75,17 +75,15 @@ fn main() {
// )
// .unwrap();

let mut dst_store = ImageStoreMut::<u16, 4>::alloc_with_depth(
dimensions.0 as usize / 4,
dimensions.1 as usize / 4,
let mut dst_store = ImageStoreMut::<u8, 4>::alloc_with_depth(
dimensions.0 as usize / 2,
dimensions.1 as usize / 2,
10,
);

// for i in 0..25 {
let start_time = Instant::now();
scaler
.resize_rgba_u16(&store, &mut dst_store, true)
.unwrap();
scaler.resize_rgba(&store, &mut dst_store, true).unwrap();

let elapsed_time = start_time.elapsed();
// Print the elapsed time in milliseconds
Expand Down Expand Up @@ -163,13 +161,13 @@ fn main() {
// .map(|&x| (x * 255f32) as u8)
// .collect();

let dst: Vec<u8> = dst_store
.as_bytes()
.iter()
.map(|&x| (x >> 2) as u8)
.collect();
// let dst: Vec<u8> = dst_store
// .as_bytes()
// .iter()
// .map(|&x| (x >> 2) as u8)
// .collect();

// let dst = dst_store.as_bytes();
let dst = dst_store.as_bytes();
// let dst = resized;
// image::save_buffer(
// "converted.png",
Expand Down
52 changes: 17 additions & 35 deletions src/avx2/alpha_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,44 +62,29 @@ struct AssociateAlphaDefault {}
impl AssociateAlphaDefault {
#[inline(always)]
unsafe fn associate_chunk(&self, dst: &mut [u8], src: &[u8]) {
let shuffle = _mm256_setr_epi8(
3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11,
11, 11, 15, 15, 15, 15,
);
let src_ptr = src.as_ptr();
let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i);
let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i);
let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i);
let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i);
let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3);
let multiplicand = _mm256_shuffle_epi8(rgba0, shuffle);

let zeros = _mm256_setzero_si256();

let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros);
let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros);

let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros);
let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros);

let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros);
let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros);
let mut v_ll = _mm256_unpacklo_epi8(rgba0, zeros);
let mut v_hi = _mm256_unpackhi_epi8(rgba0, zeros);

let aaa_low = _mm256_unpacklo_epi8(aaa, zeros);
let aaa_high = _mm256_unpackhi_epi8(aaa, zeros);
let a_lo = _mm256_unpacklo_epi8(multiplicand, zeros);
let a_hi = _mm256_unpackhi_epi8(multiplicand, zeros);

rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low));
rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high));
ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low));
ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high));
bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low));
bbb_high = avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high));
v_ll = avx2_div_by255(_mm256_mullo_epi16(v_ll, a_lo));
v_hi = avx2_div_by255(_mm256_mullo_epi16(v_hi, a_hi));

let rrr = _mm256_packus_epi16(rrr_low, rrr_high);
let ggg = _mm256_packus_epi16(ggg_low, ggg_high);
let bbb = _mm256_packus_epi16(bbb_low, bbb_high);
let values = _mm256_packus_epi16(v_ll, v_hi);

let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa);
let dst_ptr = dst.as_mut_ptr();
_mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0);
_mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1);
_mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2);
_mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3);
_mm256_storeu_si256(dst_ptr as *mut __m256i, values);
}
}

Expand All @@ -109,18 +94,15 @@ impl AssociateAlpha for AssociateAlphaDefault {
let mut rem = dst;
let mut src_rem = src;

for (dst, src) in rem
.chunks_exact_mut(32 * 4)
.zip(src_rem.chunks_exact(32 * 4))
{
for (dst, src) in rem.chunks_exact_mut(32).zip(src_rem.chunks_exact(32)) {
self.associate_chunk(dst, src);
}

rem = rem.chunks_exact_mut(32 * 4).into_remainder();
src_rem = src_rem.chunks_exact(32 * 4).remainder();
rem = rem.chunks_exact_mut(32).into_remainder();
src_rem = src_rem.chunks_exact(32).remainder();

if !rem.is_empty() {
const PART_SIZE: usize = 32 * 4;
const PART_SIZE: usize = 32;
assert!(src_rem.len() < PART_SIZE);
assert!(rem.len() < PART_SIZE);
assert_eq!(src_rem.len(), rem.len());
Expand Down
4 changes: 4 additions & 0 deletions src/avx2/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,12 @@ mod alpha_u16;
mod alpha_u8;
mod check_alpha;
mod rgb_u8;
#[cfg(feature = "nightly_avx512")]
mod rgb_u8_dot_i8;
#[cfg(feature = "half")]
mod rgba_f16;
mod rgba_f32;
#[cfg(feature = "nightly_avx512")]
mod rgba_u8_dot_lp;
mod rgba_u8_lb;
pub(crate) mod utils;
Expand All @@ -48,6 +50,7 @@ mod vertical_u16_lb;
mod vertical_u8;
mod vertical_u8_lp;

#[cfg(feature = "nightly_avx512")]
pub(crate) use crate::avx2::rgba_u8_dot_lp::{
convolve_horizontal_rgba_row_dot, convolve_horizontal_rgba_rows_4_dot,
};
Expand All @@ -62,6 +65,7 @@ pub(crate) use check_alpha::{
avx_has_non_constant_cap_alpha_rgba16, avx_has_non_constant_cap_alpha_rgba8,
};
pub(crate) use rgb_u8::{convolve_horizontal_rgb_avx_row_one, convolve_horizontal_rgb_avx_rows_4};
#[cfg(feature = "nightly_avx512")]
pub(crate) use rgb_u8_dot_i8::{
convolve_horizontal_rgb_avx_row_i8_one, convolve_horizontal_rgb_avx_rows_4_i8,
};
Expand Down
Loading

0 comments on commit 9e3da32

Please sign in to comment.