From 5e9cc7578f77dc78421b698a1fdd037ae9363c09 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 16:07:10 +0000 Subject: [PATCH 01/10] Test avx2 --- app/Cargo.toml | 2 +- app/src/main.rs | 20 ++++++++--------- src/avx2/alpha_u8.rs | 52 +++++++++++++++----------------------------- 3 files changed, 28 insertions(+), 46 deletions(-) diff --git a/app/Cargo.toml b/app/Cargo.toml index 4b7e008..1c09c6d 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half", "nightly_i8mm"], default-features = true } +pic-scale = { path = "..", features = ["half"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } diff --git a/app/src/main.rs b/app/src/main.rs index 935f996..a5cd157 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -56,11 +56,11 @@ fn main() { // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); - let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); + // let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); // let store = - ImageStore::::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize) + ImageStore::::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize) .unwrap(); let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); @@ -75,7 +75,7 @@ fn main() { // ) // .unwrap(); - let mut dst_store = ImageStoreMut::::alloc_with_depth( + let mut dst_store = ImageStoreMut::::alloc_with_depth( dimensions.0 as usize / 4, dimensions.1 as usize / 4, 10, @@ -84,7 +84,7 @@ fn main() { // for i in 0..25 { let start_time = Instant::now(); scaler - .resize_rgba_u16(&store, &mut dst_store, true) + .resize_rgba(&store, &mut dst_store, true) .unwrap(); let elapsed_time = start_time.elapsed(); @@ -163,13 +163,13 @@ fn main() { // .map(|&x| (x * 255f32) as u8) // .collect(); - let dst: Vec = dst_store - .as_bytes() - .iter() - .map(|&x| (x >> 2) as u8) - .collect(); + // let dst: Vec = dst_store + // .as_bytes() + // .iter() + // .map(|&x| (x >> 2) as u8) + // .collect(); - // let dst = dst_store.as_bytes(); + let dst = dst_store.as_bytes(); // let dst = resized; // image::save_buffer( // "converted.png", diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index a573f36..048c1d8 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -62,44 +62,29 @@ struct AssociateAlphaDefault {} impl AssociateAlphaDefault { #[inline(always)] unsafe fn associate_chunk(&self, dst: &mut [u8], src: &[u8]) { + let shuffle = _mm256_setr_epi8( + 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, + 11, 11, 15, 15, 15, 15, + ); let src_ptr = src.as_ptr(); let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + let multiplicand = _mm256_shuffle_epi8(rgba0, shuffle); let zeros = _mm256_setzero_si256(); - let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm256_unpacklo_epi8(ggg, 
zeros); - let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros); + let mut v_ll = _mm256_unpacklo_epi8(rgba0, zeros); + let mut v_hi = _mm256_unpackhi_epi8(rgba0, zeros); - let aaa_low = _mm256_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm256_unpackhi_epi8(aaa, zeros); + let a_lo = _mm256_unpacklo_epi8(multiplicand, zeros); + let a_hi = _mm256_unpackhi_epi8(multiplicand, zeros); - rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low)); - rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high)); - ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low)); - ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high)); - bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low)); - bbb_high = avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); + v_ll = avx2_div_by255(_mm256_mullo_epi16(v_ll, a_lo)); + v_hi = avx2_div_by255(_mm256_mullo_epi16(v_hi, a_hi)); - let rrr = _mm256_packus_epi16(rrr_low, rrr_high); - let ggg = _mm256_packus_epi16(ggg_low, ggg_high); - let bbb = _mm256_packus_epi16(bbb_low, bbb_high); + let values = _mm256_packus_epi16(v_ll, v_hi); - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, values); } } @@ -109,18 +94,15 @@ impl AssociateAlpha for AssociateAlphaDefault { let mut rem = dst; let mut src_rem = src; - for (dst, src) in rem - .chunks_exact_mut(32 * 4) - .zip(src_rem.chunks_exact(32 * 4)) - { + for (dst, src) in rem.chunks_exact_mut(32).zip(src_rem.chunks_exact(32)) { self.associate_chunk(dst, src); } - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(32 * 4).remainder(); + rem = rem.chunks_exact_mut(32).into_remainder(); + src_rem = src_rem.chunks_exact(32).remainder(); if !rem.is_empty() { - const PART_SIZE: usize = 32 * 4; + const PART_SIZE: usize = 32; assert!(src_rem.len() < PART_SIZE); assert!(rem.len() < PART_SIZE); assert_eq!(src_rem.len(), rem.len()); From 3e8e8cfa18faa6f707343ea5727c71d458c75503 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 17:47:49 +0000 Subject: [PATCH 02/10] Test different alpha algorithm --- src/avx2/alpha_u8.rs | 90 ++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 58 deletions(-) diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 048c1d8..47a1ce9 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -28,7 +28,7 @@ */ use crate::avx2::utils::{ - _mm256_select_si256, avx2_deinterleave_rgba, avx2_div_by255, avx2_interleave_rgba, + _mm256_select_si256, avx2_div_by255, }; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -183,73 +183,47 @@ struct Avx2DisassociateAlpha {} impl Avx2DisassociateAlpha { #[inline(always)] - unsafe fn avx2_unpremultiply_row(&self, x: __m256i, a: __m256i) -> __m256i { - let zeros = _mm256_setzero_si256(); - let lo = _mm256_unpacklo_epi8(x, zeros); - let hi = _mm256_unpackhi_epi8(x, zeros); - - let is_zero_mask = _mm256_cmpeq_epi8(a, zeros); + unsafe fn avx2_unpremultiply_row(&self, x: __m256i) -> __m256i { - let scale_ps = 
_mm256_set1_ps(255f32); + let is_zero_mask = _mm256_set1_epi32(0xff000000u32 as i32); - let lo_lo = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(lo, zeros)), - scale_ps, - ); - let lo_hi = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(lo, zeros)), - scale_ps, - ); - let hi_lo = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(hi, zeros)), - scale_ps, + let shuffle_lo = _mm256_setr_epi8( + 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, 5, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, + 5, 4, 5, ); - let hi_hi = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), - scale_ps, + + let shuffle_hi = _mm256_setr_epi8( + 8, 9, 8, 9, 8, 9, 8, 9, 12, 13, 12, 13, 12, 13, 12, 13, 8, 9, 8, 9, 8, 9, 8, 9, 12, 13, + 12, 13, 12, 13, 12, 13, ); - let a_lo = _mm256_unpacklo_epi8(a, zeros); - let a_hi = _mm256_unpackhi_epi8(a, zeros); - let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); - let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); - let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); - let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); - - let lo_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_lo, a_lo_lo))); - let lo_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_hi, a_lo_hi))); - let hi_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_lo, a_hi_lo))); - let hi_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_hi, a_hi_hi))); - - _mm256_select_si256( - is_zero_mask, - zeros, - _mm256_packus_epi16( - _mm256_packus_epi32(lo_lo, lo_hi), - _mm256_packus_epi32(hi_lo, hi_hi), - ), - ) + + let scale_ps = _mm256_set1_ps((255 * 257) as f32); + let alpha_cvt = _mm256_cvtepi32_ps(_mm256_srli_epi32::<24>(x)); + let numer = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_rcp_ps(alpha_cvt), scale_ps)); + + let px_lo = _mm256_unpacklo_epi8(x, x); + let px_hi = _mm256_unpackhi_epi8(x, x); + + let numer_lo = _mm256_shuffle_epi8(numer, shuffle_lo); + let numer_hi = _mm256_shuffle_epi8(numer, shuffle_hi); + + let v_lo = _mm256_mulhi_epu16(px_lo, numer_lo); + let v_hi = _mm256_mulhi_epu16(px_hi, numer_hi); + + // _mm256_select_si256(is_zero_mask, zeros, _mm256_packus_epi16(v_lo, v_hi)) + let alpha = _mm256_and_si256(x, is_zero_mask); + _mm256_blendv_epi8(_mm256_packus_epi16(v_lo, v_hi), alpha, is_zero_mask) } #[inline(always)] unsafe fn disassociate_chunk(&self, in_place: &mut [u8]) { let src_ptr = in_place.as_ptr(); let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - let rrr = self.avx2_unpremultiply_row(rrr, aaa); - let ggg = self.avx2_unpremultiply_row(ggg, aaa); - let bbb = self.avx2_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); + let rrr = self.avx2_unpremultiply_row(rgba0); let dst_ptr = in_place.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rrr); } } @@ -258,14 +232,14 @@ 
impl DisassociateAlpha for Avx2DisassociateAlpha { unsafe fn disassociate(&self, in_place: &mut [u8]) { let mut rem = in_place; - for dst in rem.chunks_exact_mut(32 * 4) { + for dst in rem.chunks_exact_mut(32) { self.disassociate_chunk(dst); } - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + rem = rem.chunks_exact_mut(32).into_remainder(); if !rem.is_empty() { - const PART_SIZE: usize = 32 * 4; + const PART_SIZE: usize = 32; assert!(rem.len() < PART_SIZE); let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; From 2b6f0bf5a7c065685a3115499eb56fe1557230c7 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 17:51:48 +0000 Subject: [PATCH 03/10] Revert "Test different alpha algorithm" This reverts commit 3e8e8cfa18faa6f707343ea5727c71d458c75503. --- src/avx2/alpha_u8.rs | 90 ++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 47a1ce9..048c1d8 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -28,7 +28,7 @@ */ use crate::avx2::utils::{ - _mm256_select_si256, avx2_div_by255, + _mm256_select_si256, avx2_deinterleave_rgba, avx2_div_by255, avx2_interleave_rgba, }; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -183,47 +183,73 @@ struct Avx2DisassociateAlpha {} impl Avx2DisassociateAlpha { #[inline(always)] - unsafe fn avx2_unpremultiply_row(&self, x: __m256i) -> __m256i { + unsafe fn avx2_unpremultiply_row(&self, x: __m256i, a: __m256i) -> __m256i { + let zeros = _mm256_setzero_si256(); + let lo = _mm256_unpacklo_epi8(x, zeros); + let hi = _mm256_unpackhi_epi8(x, zeros); - let is_zero_mask = _mm256_set1_epi32(0xff000000u32 as i32); + let is_zero_mask = _mm256_cmpeq_epi8(a, zeros); - let shuffle_lo = _mm256_setr_epi8( - 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, 5, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, - 5, 4, 5, - ); + let scale_ps = _mm256_set1_ps(255f32); - let shuffle_hi = _mm256_setr_epi8( - 8, 9, 8, 9, 8, 9, 8, 9, 12, 13, 12, 13, 12, 13, 12, 13, 8, 9, 8, 9, 8, 9, 8, 9, 12, 13, - 12, 13, 12, 13, 12, 13, + let lo_lo = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(lo, zeros)), + scale_ps, ); - - let scale_ps = _mm256_set1_ps((255 * 257) as f32); - let alpha_cvt = _mm256_cvtepi32_ps(_mm256_srli_epi32::<24>(x)); - let numer = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_rcp_ps(alpha_cvt), scale_ps)); - - let px_lo = _mm256_unpacklo_epi8(x, x); - let px_hi = _mm256_unpackhi_epi8(x, x); - - let numer_lo = _mm256_shuffle_epi8(numer, shuffle_lo); - let numer_hi = _mm256_shuffle_epi8(numer, shuffle_hi); - - let v_lo = _mm256_mulhi_epu16(px_lo, numer_lo); - let v_hi = _mm256_mulhi_epu16(px_hi, numer_hi); - - // _mm256_select_si256(is_zero_mask, zeros, _mm256_packus_epi16(v_lo, v_hi)) - let alpha = _mm256_and_si256(x, is_zero_mask); - _mm256_blendv_epi8(_mm256_packus_epi16(v_lo, v_hi), alpha, is_zero_mask) + let lo_hi = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(lo, zeros)), + scale_ps, + ); + let hi_lo = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(hi, zeros)), + scale_ps, + ); + let hi_hi = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), + scale_ps, + ); + let a_lo = _mm256_unpacklo_epi8(a, zeros); + let a_hi = _mm256_unpackhi_epi8(a, zeros); + let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); + let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); + let a_hi_lo 
= _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); + let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); + + let lo_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_lo, a_lo_lo))); + let lo_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_hi, a_lo_hi))); + let hi_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_lo, a_hi_lo))); + let hi_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_hi, a_hi_hi))); + + _mm256_select_si256( + is_zero_mask, + zeros, + _mm256_packus_epi16( + _mm256_packus_epi32(lo_lo, lo_hi), + _mm256_packus_epi32(hi_lo, hi_hi), + ), + ) } #[inline(always)] unsafe fn disassociate_chunk(&self, in_place: &mut [u8]) { let src_ptr = in_place.as_ptr(); let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); + let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); + let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - let rrr = self.avx2_unpremultiply_row(rgba0); + let rrr = self.avx2_unpremultiply_row(rrr, aaa); + let ggg = self.avx2_unpremultiply_row(ggg, aaa); + let bbb = self.avx2_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); let dst_ptr = in_place.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rrr); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); + _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); + _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); } } @@ -232,14 +258,14 @@ impl DisassociateAlpha for Avx2DisassociateAlpha { unsafe fn disassociate(&self, in_place: &mut [u8]) { let mut rem = in_place; - for dst in rem.chunks_exact_mut(32) { + for dst in rem.chunks_exact_mut(32 * 4) { self.disassociate_chunk(dst); } - rem = rem.chunks_exact_mut(32).into_remainder(); + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); if !rem.is_empty() { - const PART_SIZE: usize = 32; + const PART_SIZE: usize = 32 * 4; assert!(rem.len() < PART_SIZE); let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; From 67387215adde5f436a831a0241e32e406e854196 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 17:52:11 +0000 Subject: [PATCH 04/10] Rolled back tests --- app/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/Cargo.toml b/app/Cargo.toml index 1c09c6d..71f0b9f 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half"], default-features = true } +pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } From f90291478c5d2e9449432b956ae1b10f7ff94391 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 17:58:56 +0000 Subject: [PATCH 05/10] Testing new alpha un association --- src/avx2/alpha_u8.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 048c1d8..4a65325 100644 --- a/src/avx2/alpha_u8.rs +++ 
b/src/avx2/alpha_u8.rs @@ -208,12 +208,13 @@ impl Avx2DisassociateAlpha { _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), scale_ps, ); - let a_lo = _mm256_unpacklo_epi8(a, zeros); - let a_hi = _mm256_unpackhi_epi8(a, zeros); - let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); - let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); - let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); - let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); + + let alphas = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_srli_epi32::<24>(a))); + + let a_lo_lo = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(0, 0, 0, 0, 4, 4, 4, 4)); + let a_lo_hi = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(1, 1, 1, 1, 5, 5, 5, 5)); + let a_hi_lo = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(2, 2, 2, 2, 6, 6, 6, 6)); + let a_hi_hi = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(3, 3, 3, 3, 7, 7, 7, 7)); let lo_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_lo, a_lo_lo))); let lo_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_hi, a_lo_hi))); From a5a99addfd80f23f00b8231f6629a8882b4aaf0b Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 18:08:23 +0000 Subject: [PATCH 06/10] Revert "Testing new alpha un association" This reverts commit f90291478c5d2e9449432b956ae1b10f7ff94391. --- src/avx2/alpha_u8.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 4a65325..048c1d8 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -208,13 +208,12 @@ impl Avx2DisassociateAlpha { _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), scale_ps, ); - - let alphas = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_srli_epi32::<24>(a))); - - let a_lo_lo = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(0, 0, 0, 0, 4, 4, 4, 4)); - let a_lo_hi = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(1, 1, 1, 1, 5, 5, 5, 5)); - let a_hi_lo = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(2, 2, 2, 2, 6, 6, 6, 6)); - let a_hi_hi = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(3, 3, 3, 3, 7, 7, 7, 7)); + let a_lo = _mm256_unpacklo_epi8(a, zeros); + let a_hi = _mm256_unpackhi_epi8(a, zeros); + let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); + let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); + let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); + let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); let lo_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_lo, a_lo_lo))); let lo_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_hi, a_lo_hi))); From c14af4c88da6dcef8a1e168639384a9647ca52d7 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 18:17:19 +0000 Subject: [PATCH 07/10] Small improvements --- src/neon/rgb_u8.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index af240f5..15ca20c 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -70,8 +70,7 @@ unsafe fn conv_horiz_rgba_2_u8( ) -> int32x4_t { const COMPONENTS: usize = 3; let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let mut rgb_pixel = vdup_n_u32(0); - rgb_pixel = vset_lane_u32::<0>((src_ptr.as_ptr() as *const u32).read_unaligned(), 
rgb_pixel); + let mut rgb_pixel = vld1_lane_u32(src_ptr.as_ptr() as *const u32, vdup_n_u32(0)); rgb_pixel = vreinterpret_u32_u16(vset_lane_u16::<2>( (src_ptr.get_unchecked(4..).as_ptr() as *const u16).read_unaligned(), vreinterpret_u16_u32(rgb_pixel), From 7db394bd429ae39e63c3c47d9915286d72c7ceb9 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 8 Jan 2025 09:05:26 +0000 Subject: [PATCH 08/10] Refactor --- app/src/main.rs | 10 +- src/avx2/mod.rs | 4 + src/avx2/rgb_u8_dot_i8.rs | 201 +++++++------------------------------ src/avx2/rgba_u8_dot_lp.rs | 161 ++++++++--------------------- src/avx2/utils.rs | 4 + src/dispatch_group_u8.rs | 10 +- src/plane_u8.rs | 9 +- src/rgb_u8.rs | 16 ++- src/rgba_u8.rs | 19 +++- 9 files changed, 129 insertions(+), 305 deletions(-) diff --git a/app/src/main.rs b/app/src/main.rs index a5cd157..e1489a9 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -43,7 +43,7 @@ fn resize_plane( fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") + let img = ImageReader::open("./assets/asset_4.png") .unwrap() .decode() .unwrap(); @@ -76,16 +76,14 @@ fn main() { // .unwrap(); let mut dst_store = ImageStoreMut::::alloc_with_depth( - dimensions.0 as usize / 4, - dimensions.1 as usize / 4, + dimensions.0 as usize / 2, + dimensions.1 as usize / 2, 10, ); // for i in 0..25 { let start_time = Instant::now(); - scaler - .resize_rgba(&store, &mut dst_store, true) - .unwrap(); + scaler.resize_rgba(&store, &mut dst_store, true).unwrap(); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 37e75f4..1309080 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -34,10 +34,12 @@ mod alpha_u16; mod alpha_u8; mod check_alpha; mod rgb_u8; +#[cfg(feature = "nightly_avx512")] mod rgb_u8_dot_i8; #[cfg(feature = "half")] mod rgba_f16; mod rgba_f32; +#[cfg(feature = "nightly_avx512")] mod rgba_u8_dot_lp; mod rgba_u8_lb; pub(crate) mod utils; @@ -48,6 +50,7 @@ mod vertical_u16_lb; mod vertical_u8; mod vertical_u8_lp; +#[cfg(feature = "nightly_avx512")] pub(crate) use crate::avx2::rgba_u8_dot_lp::{ convolve_horizontal_rgba_row_dot, convolve_horizontal_rgba_rows_4_dot, }; @@ -62,6 +65,7 @@ pub(crate) use check_alpha::{ avx_has_non_constant_cap_alpha_rgba16, avx_has_non_constant_cap_alpha_rgba8, }; pub(crate) use rgb_u8::{convolve_horizontal_rgb_avx_row_one, convolve_horizontal_rgb_avx_rows_4}; +#[cfg(feature = "nightly_avx512")] pub(crate) use rgb_u8_dot_i8::{ convolve_horizontal_rgb_avx_row_i8_one, convolve_horizontal_rgb_avx_rows_4_i8, }; diff --git a/src/avx2/rgb_u8_dot_i8.rs b/src/avx2/rgb_u8_dot_i8.rs index 870fd67..e444098 100644 --- a/src/avx2/rgb_u8_dot_i8.rs +++ b/src/avx2/rgb_u8_dot_i8.rs @@ -54,18 +54,13 @@ pub(crate) fn convolve_horizontal_rgb_avx_rows_4_i8( filter_weights: &FilterWeights, ) { unsafe { - #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avxvnni") { - return convolve_horizontal_rgb_avx_rows_i8_4_dot( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); - } - - convolve_horizontal_rgb_avx_rows_i8_4_ubs(src, src_stride, dst, dst_stride, filter_weights); + convolve_horizontal_rgb_avx_rows_i8_4_impl( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } } @@ -113,43 +108,8 @@ unsafe fn make_tuple_x8(pixel: __m128i, pixel2: __m128i, shuf: __m256i) -> __m25 ) } -#[cfg(feature = "nightly_avx512")] #[target_feature(enable = "avx2", enable = "avxvnni")] -unsafe fn 
convolve_horizontal_rgb_avx_rows_i8_4_dot( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgb_avx_rows_i8_4_impl::( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); -} - -#[target_feature(enable = "avx2")] -unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_ubs( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgb_avx_rows_i8_4_impl::( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); -} - -#[inline(always)] -unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_impl( +unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_impl( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -160,6 +120,7 @@ unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_impl( const PRECISION: i32 = 7; const ROUNDING_CONST: i32 = 1 << (PRECISION - 1); + const DOT: bool = true; let shuffle_v = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1); @@ -176,52 +137,18 @@ unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_impl( -1, -1, -1, -1, ); - let vld = if DOT { - _mm_set1_epi32(ROUNDING_CONST) - } else { - _mm_setr_epi16( - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - 0, - 0, - ) - }; - - let vld_avx = if DOT { - _mm256_setr_epi32( - ROUNDING_CONST, - ROUNDING_CONST, - ROUNDING_CONST, - 0, - 0, - 0, - 0, - 0, - ) - } else { - _mm256_setr_epi16( - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ) - }; + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let vld_avx = _mm256_setr_epi32( + ROUNDING_CONST, + ROUNDING_CONST, + ROUNDING_CONST, + 0, + 0, + 0, + 0, + 0, + ); let (row0_ref, rest) = dst.split_at_mut(dst_stride); let (row1_ref, rest) = rest.split_at_mut(dst_stride); @@ -405,11 +332,7 @@ pub(crate) fn convolve_horizontal_rgb_avx_row_i8_one( filter_weights: &FilterWeights, ) { unsafe { - #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avxvnni") { - return convolve_horizontal_rgb_avx_row_i8_dot_one_impl(src, dst, filter_weights); - } - convolve_horizontal_rgb_avx_row_i8_ubs_one_impl(src, dst, filter_weights); + convolve_horizontal_rgb_avx_row_i8_one_impl(src, dst, filter_weights); } } @@ -428,32 +351,14 @@ unsafe fn add_one_weight( _mm_udot8_epi16::(store_0, lo, weight0) } -#[cfg(feature = "nightly_avx512")] #[target_feature(enable = "avx2", enable = "avxvnni")] -unsafe fn convolve_horizontal_rgb_avx_row_i8_dot_one_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgb_avx_row_i8_one_impl::(src, dst, filter_weights); -} - -#[target_feature(enable = "avx2")] -unsafe fn convolve_horizontal_rgb_avx_row_i8_ubs_one_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgb_avx_row_i8_one_impl::(src, dst, filter_weights); -} - -#[inline(always)] -unsafe fn convolve_horizontal_rgb_avx_row_i8_one_impl( +unsafe fn convolve_horizontal_rgb_avx_row_i8_one_impl( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, ) { const CHANNELS: usize = 3; + const DOT: bool = true; let shuffle_v = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1); @@ -478,52 +383,18 @@ unsafe fn convolve_horizontal_rgb_avx_row_i8_one_impl( const PRECISION: i32 = 7; const ROUNDING_CONST: i32 = 1 << (PRECISION - 1); - let vld = if DOT { - _mm_set1_epi32(ROUNDING_CONST) - } else { - 
_mm_setr_epi16( - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - 0, - 0, - ) - }; - - let vld_avx = if DOT { - _mm256_setr_epi32( - ROUNDING_CONST, - ROUNDING_CONST, - ROUNDING_CONST, - 0, - 0, - 0, - 0, - 0, - ) - } else { - _mm256_setr_epi16( - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ) - }; + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let vld_avx = _mm256_setr_epi32( + ROUNDING_CONST, + ROUNDING_CONST, + ROUNDING_CONST, + 0, + 0, + 0, + 0, + 0, + ); for ((dst, bounds), weights) in dst .chunks_exact_mut(CHANNELS) diff --git a/src/avx2/rgba_u8_dot_lp.rs b/src/avx2/rgba_u8_dot_lp.rs index 1b7e01f..bddae95 100644 --- a/src/avx2/rgba_u8_dot_lp.rs +++ b/src/avx2/rgba_u8_dot_lp.rs @@ -33,46 +33,23 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -/// Will use `avxvnni` if available, if not `maddubs16` +/// Will use `avxvnni` if available /// /// `avxvnni` feature has slightly lower precision and won't work really well on huge kernel which /// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. /// /// # Safety -/// - Check `avx2` availability before the call. +/// - Check `avxvnni` availability before the call. pub(crate) fn convolve_horizontal_rgba_row_dot( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, ) { unsafe { - #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avxvnni") { - return convolve_horizontal_rgba_vnni_row_dot_impl(src, dst, filter_weights); - } - convolve_horizontal_rgba_ubs_row_dot_impl(src, dst, filter_weights); + convolve_horizontal_rgba_row_dot_impl(src, dst, filter_weights); } } -#[cfg(feature = "nightly_avx512")] -#[target_feature(enable = "avxvnni", enable = "avx2")] -unsafe fn convolve_horizontal_rgba_vnni_row_dot_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgba_row_dot_impl::(src, dst, filter_weights); -} - -#[target_feature(enable = "avx2")] -unsafe fn convolve_horizontal_rgba_ubs_row_dot_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgba_row_dot_impl::(src, dst, filter_weights); -} - #[inline(always)] fn compress_i32(x: __m128i) -> __m128i { unsafe { @@ -85,14 +62,15 @@ fn compress_i32(x: __m128i) -> __m128i { } } -#[inline(always)] -unsafe fn convolve_horizontal_rgba_row_dot_impl( +#[target_feature(enable = "avxvnni", enable = "avx2")] +unsafe fn convolve_horizontal_rgba_row_dot_impl( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, ) { const ROUNDING: i16 = 1 << (7 - 1); const CHANNELS: usize = 4; + const DOT: bool = true; let shuffle_weights_table = _mm_setr_epi8(0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3); let shuffle_4_table = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); @@ -110,11 +88,7 @@ unsafe fn convolve_horizontal_rgba_row_dot_impl( { let bounds_size = bounds.size; let mut jx = 0usize; - let mut store = if DOT { - _mm_set1_epi32(ROUNDING as i32) - } else { - _mm_setr_epi16(ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, ROUNDING, 0) - }; + let mut store = _mm_set1_epi32(ROUNDING as i32); if bounds_size > 8 { let shuffle_avx_weights = _mm256_setr_epi8( @@ -127,22 +101,16 @@ unsafe fn convolve_horizontal_rgba_row_dot_impl( 6, 10, 14, 3, 7, 11, 15, ); - let mut store_avx = if DOT { - _mm256_setr_epi32( - ROUNDING as i32, - ROUNDING as i32, - ROUNDING as i32, - 
ROUNDING as i32, - 0, - 0, - 0, - 0, - ) - } else { - _mm256_setr_epi16( - ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ) - }; + let mut store_avx = _mm256_setr_epi32( + ROUNDING as i32, + ROUNDING as i32, + ROUNDING as i32, + ROUNDING as i32, + 0, + 0, + 0, + 0, + ); while jx + 8 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 8)); @@ -229,13 +197,13 @@ unsafe fn convolve_horizontal_rgba_row_dot_impl( } } -/// Will use `avxvnni` if available, if not `maddubs16` +/// Will use `avxvnni` if available /// /// `avxvnni` feature has slightly lower precision and won't work really well on huge kernel which /// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. /// /// # Safety -/// - Check `avx2` availability before the call. +/// - Check `avxvnni` availability before the call. pub(crate) fn convolve_horizontal_rgba_rows_4_dot( src: &[u8], src_stride: usize, @@ -244,57 +212,18 @@ pub(crate) fn convolve_horizontal_rgba_rows_4_dot( filter_weights: &FilterWeights, ) { unsafe { - #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avxvnni") { - return convolve_horizontal_rgba_vnni_rows_4_dot( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); - } - convolve_horizontal_rgba_vnni_rows_4_ubs(src, src_stride, dst, dst_stride, filter_weights); + convolve_horizontal_rgba_vnni_rows_4_dot_impl( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } } -#[cfg(feature = "nightly_avx512")] #[target_feature(enable = "avxvnni", enable = "avx2")] -unsafe fn convolve_horizontal_rgba_vnni_rows_4_dot( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgba_vnni_rows_4_dot_impl::( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); -} - -#[target_feature(enable = "avx2")] -unsafe fn convolve_horizontal_rgba_vnni_rows_4_ubs( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgba_vnni_rows_4_dot_impl::( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); -} - -#[inline(always)] -unsafe fn convolve_horizontal_rgba_vnni_rows_4_dot_impl( +unsafe fn convolve_horizontal_rgba_vnni_rows_4_dot_impl( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -305,28 +234,20 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_dot_impl( const SCALE: i32 = 7; const ROUNDING: i16 = 1 << (SCALE - 1); - let init = if DOT { - _mm_set1_epi32(ROUNDING as i32) - } else { - _mm_setr_epi16(ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, ROUNDING, 0) - }; - - let init_avx = if DOT { - _mm256_setr_epi32( - ROUNDING as i32, - ROUNDING as i32, - ROUNDING as i32, - ROUNDING as i32, - 0, - 0, - 0, - 0, - ) - } else { - _mm256_setr_epi16( - ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ) - }; + const DOT: bool = true; + + let init = _mm_set1_epi32(ROUNDING as i32); + + let init_avx = _mm256_setr_epi32( + ROUNDING as i32, + ROUNDING as i32, + ROUNDING as i32, + ROUNDING as i32, + 0, + 0, + 0, + 0, + ); let (row0_ref, rest) = dst.split_at_mut(dst_stride); let (row1_ref, rest) = rest.split_at_mut(dst_stride); diff --git a/src/avx2/utils.rs b/src/avx2/utils.rs index 216e0e9..e2b9ebc 100644 --- a/src/avx2/utils.rs +++ b/src/avx2/utils.rs @@ -353,6 +353,7 @@ pub(crate) unsafe fn _mm256_cvtepi64_epi32x(v: __m256i) -> __m128i { _mm_castps_si128(packed) } +#[allow(dead_code)] #[inline(always)] pub(crate) unsafe 
fn _mm256_dot16_avx_epi32( a: __m256i, @@ -373,6 +374,7 @@ pub(crate) unsafe fn _mm256_dot16_avx_epi32( } } +#[allow(dead_code)] #[inline(always)] pub(crate) unsafe fn _mm_udot8_epi16( a: __m128i, @@ -391,6 +393,7 @@ pub(crate) unsafe fn _mm_udot8_epi16( } } +#[allow(dead_code)] #[inline(always)] pub(crate) unsafe fn _mm256_udot8_epi16( a: __m256i, @@ -407,6 +410,7 @@ pub(crate) unsafe fn _mm256_udot8_epi16( _mm256_adds_epi16(a, _mm256_maddubs_epi16(b, c)) } +#[allow(dead_code)] #[inline(always)] pub(crate) unsafe fn _mm256_reduce_dot_epi16(a: __m256i) -> __m128i { #[cfg(feature = "nightly_avx512")] diff --git a/src/dispatch_group_u8.rs b/src/dispatch_group_u8.rs index 2eda3d6..32f52d4 100644 --- a/src/dispatch_group_u8.rs +++ b/src/dispatch_group_u8.rs @@ -28,7 +28,6 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights, WeightsConverter}; use crate::image_store::ImageStoreMut; -use crate::support::PRECISION; use crate::ImageStore; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -100,12 +99,13 @@ pub(crate) fn convolve_horizontal_dispatch_u8( +pub(crate) fn convolve_vertical_dispatch_u8<'a, V: Copy + Send + Sync, const COMPONENTS: usize>( image_store: &ImageStore, filter_weights: FilterWeights, destination: &mut ImageStoreMut<'a, u8, COMPONENTS>, pool: &Option, - dispatcher: fn(usize, &FilterBounds, &[u8], &mut [u8], usize, &[i16]), + dispatcher: fn(usize, &FilterBounds, &[u8], &mut [u8], usize, &[V]), + weights_converter: impl WeightsConverter, ) { let src_stride = image_store.stride(); let dst_stride = destination.stride(); @@ -113,9 +113,9 @@ pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( let dst_width = destination.width; if let Some(pool) = pool { + let approx = weights_converter.prepare_weights(&filter_weights); pool.install(|| { let destination_image = destination.buffer.borrow_mut(); - let approx = filter_weights.numerical_approximation_i16::(0); destination_image .par_chunks_exact_mut(dst_stride) .enumerate() @@ -136,7 +136,7 @@ pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( }); } else { let destination_image = destination.buffer.borrow_mut(); - let approx = filter_weights.numerical_approximation_i16::(0); + let approx = weights_converter.prepare_weights(&filter_weights); destination_image .chunks_exact_mut(dst_stride) .enumerate() diff --git a/src/plane_u8.rs b/src/plane_u8.rs index dc57c41..1021cd4 100644 --- a/src/plane_u8.rs +++ b/src/plane_u8.rs @@ -132,6 +132,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> { { _dispatcher = wasm_vertical_neon_row; } - convolve_vertical_dispatch_u8(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_u8( + self, + filter_weights, + destination, + pool, + _dispatcher, + DefaultWeightsConverter::default(), + ); } } diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index ffe2362..43b81f3 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -84,10 +84,11 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { return; } } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "nightly_avx512"))] { - let has_avx2 = std::arch::is_x86_feature_detected!("avx2"); - if _scale_factor < 5.1 && has_avx2 { + // Precision is too low without vnni + let has_vnni = std::arch::is_x86_feature_detected!("avxvnni"); + if _scale_factor < 5.1 && has_vnni { use crate::avx2::{ convolve_horizontal_rgb_avx_row_i8_one, 
convolve_horizontal_rgb_avx_rows_4_i8, }; @@ -195,6 +196,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> { { _dispatcher = wasm_vertical_neon_row; } - convolve_vertical_dispatch_u8(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_u8( + self, + filter_weights, + destination, + pool, + _dispatcher, + DefaultWeightsConverter::default(), + ); } } diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index 2e9c384..e60f69e 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -90,10 +90,14 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { return; } } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + feature = "nightly_avx512" + ))] { - let has_avx2 = std::arch::is_x86_feature_detected!("avx2"); - if _scale_factor < 5.1 && has_avx2 { + // Precision is too low without vnni + let has_vnni = std::arch::is_x86_feature_detected!("avxvnni"); + if _scale_factor < 5.1 && has_vnni { use crate::avx2::{ convolve_horizontal_rgba_row_dot, convolve_horizontal_rgba_rows_4_dot, }; @@ -216,6 +220,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { { _dispatcher = wasm_vertical_neon_row; } - convolve_vertical_dispatch_u8(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_u8( + self, + filter_weights, + destination, + pool, + _dispatcher, + DefaultWeightsConverter::default(), + ); } } From 9f477d9e859455b648cd01526a0254d9d3754131 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 8 Jan 2025 09:20:04 +0000 Subject: [PATCH 09/10] Refactor --- app/Cargo.toml | 2 +- src/neon/rgb_u8.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/Cargo.toml b/app/Cargo.toml index 71f0b9f..4b7e008 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true } +pic-scale = { path = "..", features = ["half", "nightly_i8mm"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index 15ca20c..8a7fe7a 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -70,7 +70,7 @@ unsafe fn conv_horiz_rgba_2_u8( ) -> int32x4_t { const COMPONENTS: usize = 3; let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let mut rgb_pixel = vld1_lane_u32(src_ptr.as_ptr() as *const u32, vdup_n_u32(0)); + let mut rgb_pixel = vld1_lane_u32::<0>(src_ptr.as_ptr() as *const u32, vdup_n_u32(0)); rgb_pixel = vreinterpret_u32_u16(vset_lane_u16::<2>( (src_ptr.get_unchecked(4..).as_ptr() as *const u16).read_unaligned(), vreinterpret_u16_u32(rgb_pixel), From cbed20911aaf4f7a0579a3bd56e25a3b426e85e3 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 8 Jan 2025 09:20:47 +0000 Subject: [PATCH 10/10] Refactor --- src/rgb_u8.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index 43b81f3..92f9690 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -84,7 +84,10 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { return; } } - #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "nightly_avx512"))] + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + 
feature = "nightly_avx512" + ))] { // Precision is too low without vnni let has_vnni = std::arch::is_x86_feature_detected!("avxvnni");