From d4d7ee411adf4008b5eb7c7ece23889bee3d0d4f Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Fri, 10 Jan 2025 08:55:03 +0000 Subject: [PATCH 1/3] Dropped i8mm --- Cargo.toml | 1 - app/Cargo.toml | 2 +- app/src/main.rs | 16 +- fuzz/Cargo.toml | 1 - src/avx2/mod.rs | 6 - src/avx2/rgb_u8_dot_i8.rs | 482 -------------------------------------- src/filter_weights.rs | 30 +-- src/lib.rs | 1 - src/neon/rgb_u8_dot.rs | 326 -------------------------- src/rgb_u8.rs | 59 ----- 10 files changed, 21 insertions(+), 903 deletions(-) delete mode 100644 src/avx2/rgb_u8_dot_i8.rs delete mode 100644 src/neon/rgb_u8_dot.rs diff --git a/Cargo.toml b/Cargo.toml index 4ffeb59..94e9301 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,5 +29,4 @@ libc = "0.2.158" default = ["colorspaces"] colorspaces = ["dep:colorutils-rs"] nightly_avx512 = [] -nightly_i8mm = [] nightly_avx512fp16 = ["nightly_avx512"] \ No newline at end of file diff --git a/app/Cargo.toml b/app/Cargo.toml index 1c09c6d..71f0b9f 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half"], default-features = true } +pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } diff --git a/app/src/main.rs b/app/src/main.rs index a7f3e34..36397d2 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -48,7 +48,7 @@ fn main() { .decode() .unwrap(); let dimensions = img.dimensions(); - let transient = img.to_luma_alpha8(); + let transient = img.to_rgb8(); let mut bytes = Vec::from(transient.as_bytes()); let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); @@ -60,7 +60,7 @@ fn main() { // let store = - ImageStore::::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize) + ImageStore::::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize) .unwrap(); let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); @@ -75,15 +75,15 @@ fn main() { // ) // .unwrap(); - let mut dst_store = ImageStoreMut::::alloc_with_depth( - dimensions.0 as usize / 3, - dimensions.1 as usize / 3, + let mut dst_store = ImageStoreMut::::alloc_with_depth( + dimensions.0 as usize / 3 * 2, + dimensions.1 as usize / 3 * 2, 10, ); // for i in 0..25 { let start_time = Instant::now(); - scaler.resize_cbcr8(&store, &mut dst_store).unwrap(); + scaler.resize_rgb(&store, &mut dst_store).unwrap(); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -189,11 +189,11 @@ fn main() { .unwrap(); } else { image::save_buffer( - "converted.webp", + "converted_o.png", &dst, dst_store.width as u32, dst_store.height as u32, - image::ColorType::La8, + image::ColorType::Rgb8, ) .unwrap(); } diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 9a60d02..973b015 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -12,7 +12,6 @@ libfuzzer-sys = "0.4" pic-scale = { path = "../" } [features] -nightly_i8mm = ["pic-scale/nightly_i8mm"] nightly_avx512 = ["pic-scale/nightly_avx512"] [[bin]] diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 30944b8..48aa472 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -34,8 +34,6 @@ mod alpha_u16; mod alpha_u8; mod check_alpha; mod rgb_u8; -#[cfg(feature = "nightly_avx512")] -mod rgb_u8_dot_i8; #[cfg(feature = "half")] mod rgba_f16; mod rgba_f32; @@ -59,10 +57,6 @@ pub(crate) use check_alpha::{ avx_has_non_constant_cap_alpha_rgba16, avx_has_non_constant_cap_alpha_rgba8, }; pub(crate) use rgb_u8::{convolve_horizontal_rgb_avx_row_one, convolve_horizontal_rgb_avx_rows_4}; -#[cfg(feature = "nightly_avx512")] -pub(crate) use rgb_u8_dot_i8::{ - convolve_horizontal_rgb_avx_row_i8_one, convolve_horizontal_rgb_avx_rows_4_i8, -}; #[cfg(feature = "half")] pub(crate) use rgba_f16::{ convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16, diff --git a/src/avx2/rgb_u8_dot_i8.rs b/src/avx2/rgb_u8_dot_i8.rs deleted file mode 100644 index e444098..0000000 --- a/src/avx2/rgb_u8_dot_i8.rs +++ /dev/null @@ -1,482 +0,0 @@ -/* - * Copyright (c) Radzivon Bartoshyk. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -use crate::avx2::utils::{_mm256_reduce_dot_epi16, _mm256_udot8_epi16, _mm_udot8_epi16}; -use crate::filter_weights::FilterWeights; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -#[inline(always)] -fn compress_i32(x: __m128i) -> __m128i { - unsafe { - if DOT { - let store_32 = _mm_srai_epi32::<7>(x); - _mm_packus_epi32(store_32, store_32) - } else { - _mm_srai_epi16::<7>(_mm_hadds_epi16(x, x)) - } - } -} - -pub(crate) fn convolve_horizontal_rgb_avx_rows_4_i8( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - unsafe { - convolve_horizontal_rgb_avx_rows_i8_4_impl( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); - } -} - -#[inline(always)] -unsafe fn load_rgb_x2(src: &[u8]) -> __m128i { - let mut rgb_pixel = _mm_setzero_si128(); - rgb_pixel = _mm_insert_epi32::<0>(rgb_pixel, (src.as_ptr() as *const i32).read_unaligned()); - rgb_pixel = _mm_insert_epi16::<2>( - rgb_pixel, - (src.get_unchecked(4..).as_ptr() as *const i16).read_unaligned() as i32, - ); - rgb_pixel -} - -#[inline(always)] -unsafe fn load_rgb_x4(src: &[u8]) -> __m128i { - let mut rgb_pixel = _mm_loadu_si64(src.as_ptr()); - rgb_pixel = _mm_insert_epi32::<2>( - rgb_pixel, - (src.get_unchecked(8..).as_ptr() as *const i32).read_unaligned(), - ); - rgb_pixel -} - -#[inline(always)] -unsafe fn load_distr_x8_rgb(src: &[u8], shuf: __m256i) -> __m256i { - let pixel_lo = _mm_loadu_si128(src.as_ptr() as *const _); - let pixel_hi = _mm_loadu_si64(src.get_unchecked(16..).as_ptr() as *const _); - - make_tuple_x8(pixel_lo, pixel_hi, shuf) -} - -#[inline(always)] -unsafe fn make_tuple_x8(pixel: __m128i, pixel2: __m128i, shuf: __m256i) -> __m256i { - // Low part - // [R0, G0, B0] [R1, G1, B1] [R2 G2 B2] [R3 G3 B3] [R4 G4 B4] [R5] - // High part - // [G5, B5] [R6, G6, B6] [R7, G7, B7] - - let hi_part = _mm_alignr_epi8::<12>(pixel2, pixel); - - _mm256_shuffle_epi8( - _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(pixel), hi_part), - shuf, - ) -} - -#[target_feature(enable = "avx2", enable = "avxvnni")] -unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_impl( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - const CHANNELS: usize = 3; - - const PRECISION: i32 = 7; - const ROUNDING_CONST: i32 = 1 << (PRECISION - 1); - const DOT: bool = true; - - let shuffle_v = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1); - - let shuffle_weights = _mm_setr_epi8(0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3); - - let weights_idx = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1); - - let shuffle_weights01 = _mm256_setr_epi8( - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, - 2, 3, - ); - let shuffle_pixels_4 = _mm256_setr_epi8( - 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1, 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, - -1, -1, -1, -1, - ); - - let vld = _mm_set1_epi32(ROUNDING_CONST); - - let vld_avx = _mm256_setr_epi32( - ROUNDING_CONST, - ROUNDING_CONST, - ROUNDING_CONST, - 0, - 0, - 0, - 0, - 0, - ); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - // [R0, G0, B0] [R1, G1, B1] [R2 G2 B2] [R3 G3 B3] - - if bounds.size > 4 { - let mut store0 = vld_avx; - let mut store1 = vld_avx; - let mut store2 = vld_avx; - let mut store3 = vld_avx; - - while jx + 8 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let full_weights = - _mm256_castsi128_si256(_mm_loadu_si64(w_ptr.as_ptr() as *const _)); - - let w0 = _mm256_shuffle_epi8( - _mm256_permutevar8x32_epi32(full_weights, weights_idx), - shuffle_weights01, - ); - - let bounds_start = (bounds.start + jx) * CHANNELS; - - let rgb_pixel_0 = - load_distr_x8_rgb(src0.get_unchecked(bounds_start..), shuffle_pixels_4); - let rgb_pixel_1 = - load_distr_x8_rgb(src1.get_unchecked(bounds_start..), shuffle_pixels_4); - let rgb_pixel_2 = - load_distr_x8_rgb(src2.get_unchecked(bounds_start..), shuffle_pixels_4); - let rgb_pixel_3 = - load_distr_x8_rgb(src3.get_unchecked(bounds_start..), shuffle_pixels_4); - - store0 = _mm256_udot8_epi16::(store0, rgb_pixel_0, w0); - store1 = _mm256_udot8_epi16::(store1, rgb_pixel_1, w0); - store2 = _mm256_udot8_epi16::(store2, rgb_pixel_2, w0); - store3 = _mm256_udot8_epi16::(store3, rgb_pixel_3, w0); - - jx += 8; - } - - store_0 = _mm256_reduce_dot_epi16::(store0); - store_1 = _mm256_reduce_dot_epi16::(store1); - store_2 = _mm256_reduce_dot_epi16::(store2); - store_3 = _mm256_reduce_dot_epi16::(store3); - } - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - - let weight0 = - _mm_shuffle_epi8(_mm_loadu_si32(w_ptr.as_ptr() as *const u8), shuffle_weights); - let bounds_start = (bounds.start + jx) * CHANNELS; - - let rgb_pixel_0 = load_rgb_x4(src0.get_unchecked(bounds_start..)); - let rgb_pixel_1 = load_rgb_x4(src1.get_unchecked(bounds_start..)); - let rgb_pixel_2 = load_rgb_x4(src2.get_unchecked(bounds_start..)); - let rgb_pixel_4 = load_rgb_x4(src3.get_unchecked(bounds_start..)); - - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_v); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_v); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_v); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_v); - - store_0 = _mm_udot8_epi16::(store_0, lo_0, weight0); - store_1 = _mm_udot8_epi16::(store_1, lo_1, weight0); - store_2 = _mm_udot8_epi16::(store_2, lo_2, weight0); - store_3 = _mm_udot8_epi16::(store_3, lo_3, weight0); - - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = (bounds.start + jx) * CHANNELS; - let weight0 = - _mm_shuffle_epi8(_mm_loadu_si16(w_ptr.as_ptr() as *const u8), shuffle_weights); - - let rgb_pixel_0 = load_rgb_x2(src0.get_unchecked(bounds_start..)); - let rgb_pixel_1 = load_rgb_x2(src1.get_unchecked(bounds_start..)); - let rgb_pixel_2 = load_rgb_x2(src2.get_unchecked(bounds_start..)); - let rgb_pixel_4 = load_rgb_x2(src3.get_unchecked(bounds_start..)); - - let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_v); - let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_v); - let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_v); - let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_v); - - store_0 = _mm_udot8_epi16::(store_0, lo_0, weight0); - store_1 = _mm_udot8_epi16::(store_1, lo_1, weight0); - store_2 = _mm_udot8_epi16::(store_2, lo_2, weight0); - store_3 = _mm_udot8_epi16::(store_3, lo_3, weight0); - - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let bounds_start = bounds.start + jx; - - let weight0 = _mm_shuffle_epi8( - _mm_set1_epi8(w_ptr.as_ptr().read_unaligned()), - shuffle_weights, - ); - - store_0 = add_one_weight::(bounds_start, src0, weight0, store_0); - store_1 = add_one_weight::(bounds_start, src1, weight0, store_1); - store_2 = add_one_weight::(bounds_start, src2, weight0, store_2); - store_3 = add_one_weight::(bounds_start, src3, weight0, store_3); - jx += 1; - } - - let store_0_8 = compress_i32::(store_0); - let store_1_8 = compress_i32::(store_1); - let store_2_8 = compress_i32::(store_2); - let store_3_8 = compress_i32::(store_3); - - let store_0_8 = _mm_packus_epi16(store_0_8, store_0_8); - let store_1_8 = _mm_packus_epi16(store_1_8, store_1_8); - let store_2_8 = _mm_packus_epi16(store_2_8, store_2_8); - let store_3_8 = _mm_packus_epi16(store_3_8, store_3_8); - - let element_0 = _mm_extract_epi32::<0>(store_0_8); - let element_1 = _mm_extract_epi32::<0>(store_1_8); - let element_2 = _mm_extract_epi32::<0>(store_2_8); - let element_3 = _mm_extract_epi32::<0>(store_3_8); - - let bytes = element_0.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk0.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk0.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_1.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk1.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk1.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_2.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk2.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk2.get_unchecked_mut(2) = bytes[2]; - - let bytes = element_3.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (chunk3.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *chunk3.get_unchecked_mut(2) = bytes[2]; - } -} - -pub(crate) fn convolve_horizontal_rgb_avx_row_i8_one( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - unsafe { - convolve_horizontal_rgb_avx_row_i8_one_impl(src, dst, filter_weights); - } -} - -#[inline(always)] -unsafe fn add_one_weight( - start_x: usize, - src: &[u8], - weight0: __m128i, - store_0: __m128i, -) -> __m128i { - const COMPONENTS: usize = 3; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..).as_ptr(); - let base_pixel = _mm_loadu_si16(src.as_ptr()); - let m_vl = _mm_insert_epi8::<2>(base_pixel, src_ptr.add(2).read_unaligned() as i32); - let lo = _mm_unpacklo_epi8(m_vl, _mm_setzero_si128()); - _mm_udot8_epi16::(store_0, lo, weight0) -} - -#[target_feature(enable = "avx2", enable = "avxvnni")] -unsafe fn convolve_horizontal_rgb_avx_row_i8_one_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - const CHANNELS: usize = 3; - const DOT: bool = true; - - let shuffle_v = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1); - - let shuffle_weights = _mm_setr_epi8(0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3); - - let weights_idx = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1); - - let shuffle_weights01 = _mm256_setr_epi8( - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, - 2, 3, - ); - let shuffle_pixels_4 = _mm256_setr_epi8( - 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1, 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, - -1, -1, -1, -1, - ); - - // Low part - // [R0, G0, B0] [R1, G1, B1] [R2 G2 B2] [R3 G3 B3] [R4 G4 B4] [R5] - // High part - // [G5, B5] [R6, G6, B6] [R7, G7, B7] - - const PRECISION: i32 = 7; - const ROUNDING_CONST: i32 = 1 << (PRECISION - 1); - - let vld = _mm_set1_epi32(ROUNDING_CONST); - - let vld_avx = _mm256_setr_epi32( - ROUNDING_CONST, - ROUNDING_CONST, - ROUNDING_CONST, - 0, - 0, - 0, - 0, - 0, - ); - - for ((dst, bounds), weights) in dst - .chunks_exact_mut(CHANNELS) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let bounds_size = bounds.size; - let mut jx = 0usize; - - let mut store = if bounds_size > 4 { - let mut store = vld_avx; - while jx + 8 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let full_weights = - _mm256_castsi128_si256(_mm_loadu_si64(w_ptr.as_ptr() as *const _)); - - let w0 = _mm256_shuffle_epi8( - _mm256_permutevar8x32_epi32(full_weights, weights_idx), - shuffle_weights01, - ); - - let bounds_start = bounds.start + jx; - let src_ptr_0 = src.get_unchecked((bounds_start * CHANNELS)..); - - let pixel_lo = _mm_loadu_si128(src_ptr_0.as_ptr() as *const _); - let pixel_hi = _mm_loadu_si64(src_ptr_0.get_unchecked(16..).as_ptr() as *const _); - - let px = make_tuple_x8(pixel_lo, pixel_hi, shuffle_pixels_4); - - store = _mm256_udot8_epi16::(store, px, w0); - - jx += 8; - } - - _mm256_reduce_dot_epi16::(store) - } else { - vld - }; - - while jx + 4 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let weight0 = - _mm_shuffle_epi8(_mm_loadu_si32(w_ptr.as_ptr() as *const u8), shuffle_weights); - let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..); - let rgb_pixel = load_rgb_x4(src_ptr); - let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_v); - store = _mm_udot8_epi16::(store, lo, weight0); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let weight0 = - _mm_shuffle_epi8(_mm_loadu_si16(w_ptr.as_ptr() as *const u8), shuffle_weights); - let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..); - let rgb_pixel = load_rgb_x2(src_ptr); - let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_v); - store = _mm_udot8_epi16::(store, lo, weight0); - jx += 2; - } - - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let weight0 = _mm_shuffle_epi8( - _mm_set1_epi8(w_ptr.as_ptr().read_unaligned()), - shuffle_weights, - ); - store = add_one_weight::(bounds.start + jx, src, weight0, store); - jx += 1; - } - - let store_16_8 = compress_i32::(store); - let store_16_8 = _mm_packus_epi16(store_16_8, store_16_8); - - let element = _mm_extract_epi32::<0>(store_16_8); - let bytes = element.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *dst.get_unchecked_mut(2) = bytes[2]; - } -} diff --git a/src/filter_weights.rs b/src/filter_weights.rs index 0d0701f..a665d96 100644 --- a/src/filter_weights.rs +++ b/src/filter_weights.rs @@ -26,7 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use num_traits::AsPrimitive; +use num_traits::{AsPrimitive, Bounded}; #[derive(Debug, Clone)] pub(crate) struct FilterWeights { @@ -79,7 +79,7 @@ impl FilterWeights { } pub(crate) fn numerical_approximation< - J: Clone + Default + Copy + 'static, + J: Clone + Default + Copy + 'static + Bounded + AsPrimitive, const PRECISION: i32, >( &self, @@ -97,13 +97,20 @@ impl FilterWeights { let mut output_kernel = vec![J::default(); self.distinct_elements * align]; + let lower_bound = J::min_value().as_(); + let upper_bound = J::max_value().as_(); + for (chunk, kernel_chunk) in self .weights .chunks_exact(self.kernel_size) .zip(output_kernel.chunks_exact_mut(align)) { for (&weight, kernel) in chunk.iter().zip(kernel_chunk) { - *kernel = (weight * precision_scale).round().as_(); + *kernel = (weight * precision_scale) + .round() + .min(upper_bound) + .max(lower_bound) + .as_(); } } @@ -131,7 +138,8 @@ pub(crate) trait WeightsConverter { #[derive(Default)] pub(crate) struct DefaultWeightsConverter {} -impl WeightsConverter for DefaultWeightsConverter +impl> WeightsConverter + for DefaultWeightsConverter where f32: AsPrimitive, { @@ -141,20 +149,6 @@ where } } -#[derive(Default)] -#[allow(dead_code)] -pub(crate) struct WeightsConverterQ7 {} - -#[allow(dead_code)] -impl WeightsConverter for WeightsConverterQ7 -where - f32: AsPrimitive, -{ - fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { - weights.numerical_approximation::(0) - } -} - #[derive(Default)] #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] pub(crate) struct WeightFloat16Converter {} diff --git a/src/lib.rs b/src/lib.rs index c3c8b41..cdf959e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,7 +32,6 @@ #![cfg_attr(feature = "nightly_avx512", feature(cfg_version))] #![cfg_attr(feature = "nightly_avx512", feature(avx512_target_feature))] #![cfg_attr(feature = "nightly_avx512", feature(stdarch_x86_avx512))] -#![cfg_attr(feature = "nightly_i8mm", feature(stdarch_neon_i8mm))] #![cfg_attr(feature = "nightly_avx512fp16", feature(stdarch_x86_avx512_f16))] mod alpha_check; diff --git a/src/neon/rgb_u8_dot.rs b/src/neon/rgb_u8_dot.rs deleted file mode 100644 index 0ac1ba6..0000000 --- a/src/neon/rgb_u8_dot.rs +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) Radzivon Bartoshyk 01/2025. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -use crate::filter_weights::FilterWeights; -use crate::neon::utils::{load_12b_as_u8x16, load_3b_as_u8x16, load_6b_as_u8x16}; -use std::arch::aarch64::*; - -#[must_use] -#[inline(always)] -unsafe fn conv_horiz_rgba_8_u8( - start_x: usize, - src: &[u8], - weights0: int8x16_t, - weights1: int8x16_t, - shuffle: uint8x16_t, - store: int32x4_t, -) -> int32x4_t { - const COMPONENTS: usize = 3; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let pixel_lo = vld1q_u8(src_ptr.as_ptr()); - let pixel_hi = vcombine_u8(vld1_u8(src_ptr.get_unchecked(16..).as_ptr()), vdup_n_u8(0)); - let created_new = vextq_u8::<12>(pixel_lo, pixel_hi); - let pixel0 = vqtbl1q_u8(pixel_lo, shuffle); - let pixel1 = vqtbl1q_u8(created_new, shuffle); - let v0 = vusdotq_s32(store, pixel0, weights0); - vusdotq_s32(v0, pixel1, weights1) -} - -#[must_use] -#[inline(always)] -unsafe fn conv_horiz_rgba_4_u8( - start_x: usize, - src: &[u8], - weights: int8x16_t, - shuffle: uint8x16_t, - store: int32x4_t, -) -> int32x4_t { - const COMPONENTS: usize = 3; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let pixel = vqtbl1q_u8(load_12b_as_u8x16(src_ptr.as_ptr()), shuffle); - vusdotq_s32(store, pixel, weights) -} - -#[must_use] -#[inline(always)] -unsafe fn conv_horiz_rgba_2_u8( - start_x: usize, - src: &[u8], - weights: int8x16_t, - shuffle: uint8x16_t, - store: int32x4_t, -) -> int32x4_t { - const COMPONENTS: usize = 3; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let rgb_pixel = vqtbl1q_u8(load_6b_as_u8x16(src_ptr.as_ptr()), shuffle); - vusdotq_s32(store, rgb_pixel, weights) -} - -#[must_use] -#[inline(always)] -unsafe fn conv_horiz_rgba_1_u8( - start_x: usize, - src: &[u8], - w0: int8x16_t, - shuf: uint8x16_t, - store: int32x4_t, -) -> int32x4_t { - const COMPONENTS: usize = 3; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let rgb_pixel = vqtbl1q_u8(load_3b_as_u8x16(src_ptr.as_ptr()), shuf); - vusdotq_s32(store, rgb_pixel, w0) -} - -#[inline(always)] -unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) { - let store_16 = vqshrun_n_s32::<7>(store); - let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - vst1_lane_u16::<0>( - dst.as_mut_ptr() as *mut u16, - vreinterpret_u16_u8(store_16_8), - ); - vst1_lane_u8::<2>(dst.as_mut_ptr().add(2), store_16_8); -} - -pub(crate) fn convolve_horizontal_rgb_neon_dot_rows_4( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - unsafe { - convolve_horizontal_rgb_neon_dot_rows_4_impl( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); - } -} - -#[target_feature(enable = "i8mm")] -unsafe fn convolve_horizontal_rgb_neon_dot_rows_4_impl( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - let shuffle_v_table: [u8; 16] = [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 255, 255, 255, 255]; - let shuffle_v = vld1q_u8(shuffle_v_table.as_ptr()); - let weights_shuffle_table: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; - let weights_shuffle = vld1q_u8(weights_shuffle_table.as_ptr()); - let weights_shuffle_table1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7]; - let weights_shuffle1 = vld1q_u8(weights_shuffle_table1.as_ptr()); - - // (r0 g0 b0 r1) (g2 b2 r3 g3) (b3 r4 g4 b4) (r5 g5 b5 r6) - - const CHANNELS: usize = 3; - const ROUNDING_CONST: i32 = 1 << 6; - let init = vdupq_n_s32(ROUNDING_CONST); - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - let mut store_0 = init; - let mut store_1 = init; - let mut store_2 = init; - let mut store_3 = init; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds.size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vreinterpretq_s8_s64(vld1q_dup_s64(w_ptr.as_ptr() as *const _)); - let weights0 = vqtbl1q_s8(weights, weights_shuffle); - let weights1 = vqtbl1q_s8(weights, weights_shuffle1); - - store_0 = - conv_horiz_rgba_8_u8(bounds_start, src0, weights0, weights1, shuffle_v, store_0); - store_1 = - conv_horiz_rgba_8_u8(bounds_start, src1, weights0, weights1, shuffle_v, store_1); - store_2 = - conv_horiz_rgba_8_u8(bounds_start, src2, weights0, weights1, shuffle_v, store_2); - store_3 = - conv_horiz_rgba_8_u8(bounds_start, src3, weights0, weights1, shuffle_v, store_3); - jx += 8; - } - - while jx + 4 < bounds.size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vqtbl1q_s8( - vreinterpretq_s8_s32(vld1q_dup_s32(w_ptr.as_ptr() as *const _)), - weights_shuffle, - ); - store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, weights, shuffle_v, store_0); - store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, weights, shuffle_v, store_1); - store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, weights, shuffle_v, store_2); - store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, weights, shuffle_v, store_3); - jx += 4; - } - - while jx + 2 < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bnds = bounds.start + jx; - let v_weight = vqtbl1q_s8( - vreinterpretq_s8_s16(vld1q_dup_s16(w_ptr.as_ptr() as *const _)), - weights_shuffle, - ); - store_0 = conv_horiz_rgba_2_u8(bnds, src0, v_weight, shuffle_v, store_0); - store_1 = conv_horiz_rgba_2_u8(bnds, src1, v_weight, shuffle_v, store_1); - store_2 = conv_horiz_rgba_2_u8(bnds, src2, v_weight, shuffle_v, store_2); - store_3 = conv_horiz_rgba_2_u8(bnds, src3, v_weight, shuffle_v, store_3); - jx += 2; - } - - while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let bnds = bounds.start + jx; - let weight0 = vqtbl1q_s8(vld1q_dup_s8(w_ptr.as_ptr()), weights_shuffle); - store_0 = conv_horiz_rgba_1_u8(bnds, src0, weight0, shuffle_v, store_0); - store_1 = conv_horiz_rgba_1_u8(bnds, src1, weight0, shuffle_v, store_1); - store_2 = conv_horiz_rgba_1_u8(bnds, src2, weight0, shuffle_v, store_2); - store_3 = conv_horiz_rgba_1_u8(bnds, src3, weight0, shuffle_v, store_3); - jx += 1; - } - - write_accumulator_u8(store_0, chunk0); - write_accumulator_u8(store_1, chunk1); - write_accumulator_u8(store_2, chunk2); - write_accumulator_u8(store_3, chunk3); - } -} - -pub(crate) fn convolve_horizontal_rgb_neon_dot_row_one( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - unsafe { - convolve_horizontal_rgb_neon_dot_row_one_impl(src, dst, filter_weights); - } -} - -#[target_feature(enable = "i8mm")] -unsafe fn convolve_horizontal_rgb_neon_dot_row_one_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - const CHANNELS: usize = 3; - const ROUNDING_CONST: i32 = 1 << 6; - - let shuffle_v_table: [u8; 16] = [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 255, 255, 255, 255]; - let shuffle_v = vld1q_u8(shuffle_v_table.as_ptr()); - let weights_shuffle_table: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; - let weights_shuffle = vld1q_u8(weights_shuffle_table.as_ptr()); - let weights_shuffle_table1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7]; - let weights_shuffle1 = vld1q_u8(weights_shuffle_table1.as_ptr()); - - for ((dst, bounds), weights) in dst - .chunks_exact_mut(CHANNELS) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let bounds_size = bounds.size; - - let mut jx = 0usize; - let mut store = vdupq_n_s32(ROUNDING_CONST); - - while jx + 8 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vreinterpretq_s8_s64(vld1q_dup_s64(w_ptr.as_ptr() as *const _)); - let weights0 = vqtbl1q_s8(weights, weights_shuffle); - let weights1 = vqtbl1q_s8(weights, weights_shuffle1); - store = conv_horiz_rgba_8_u8(bounds_start, src, weights0, weights1, shuffle_v, store); - jx += 8; - } - - while jx + 4 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vqtbl1q_s8( - vreinterpretq_s8_s32(vld1q_dup_s32(w_ptr.as_ptr() as *const _)), - weights_shuffle, - ); - store = conv_horiz_rgba_4_u8(bounds_start, src, weights, shuffle_v, store); - jx += 4; - } - - while jx + 2 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - let v_weight = vqtbl1q_s8( - vreinterpretq_s8_s16(vld1q_dup_s16(w_ptr.as_ptr() as *const _)), - weights_shuffle, - ); - store = conv_horiz_rgba_2_u8(bounds_start, src, v_weight, shuffle_v, store); - jx += 2; - } - - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let weight0 = vqtbl1q_s8(vld1q_dup_s8(w_ptr.as_ptr()), weights_shuffle); - let bnds = bounds.start + jx; - store = conv_horiz_rgba_1_u8(bnds, src, weight0, shuffle_v, store); - jx += 1; - } - - write_accumulator_u8(store, dst); - } -} diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index b6257b6..5f541fc 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -55,65 +55,6 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { pool: &Option, ) { let _scale_factor = self.width as f32 / destination.width as f32; - #[cfg(all( - feature = "nightly_i8mm", - target_arch = "aarch64", - target_feature = "neon" - ))] - { - if _scale_factor <= 2. && std::arch::is_aarch64_feature_detected!("i8mm") { - use crate::filter_weights::WeightsConverterQ7; - use crate::neon::{ - convolve_horizontal_rgb_neon_dot_row_one, - convolve_horizontal_rgb_neon_dot_rows_4, - }; - let _dispatcher_4_rows: Option< - fn(&[u8], usize, &mut [u8], usize, &FilterWeights), - > = Some(convolve_horizontal_rgb_neon_dot_rows_4); - let _dispatcher_1_row: fn(&[u8], &mut [u8], &FilterWeights) = - convolve_horizontal_rgb_neon_dot_row_one; - convolve_horizontal_dispatch_u8( - self, - filter_weights, - destination, - pool, - _dispatcher_4_rows, - _dispatcher_1_row, - WeightsConverterQ7::default(), - ); - return; - } - } - #[cfg(all( - any(target_arch = "x86_64", target_arch = "x86"), - feature = "nightly_avx512" - ))] - { - // Precision is too low without vnni - let has_vnni = std::arch::is_x86_feature_detected!("avxvnni"); - if _scale_factor <= 2. && has_vnni { - use crate::avx2::{ - convolve_horizontal_rgb_avx_row_i8_one, convolve_horizontal_rgb_avx_rows_4_i8, - }; - use crate::filter_weights::WeightsConverterQ7; - let _dispatcher_4_rows: Option< - fn(&[u8], usize, &mut [u8], usize, &FilterWeights), - > = Some(convolve_horizontal_rgb_avx_rows_4_i8); - let _dispatcher_1_row: fn(&[u8], &mut [u8], &FilterWeights) = - convolve_horizontal_rgb_avx_row_i8_one; - convolve_horizontal_dispatch_u8( - self, - filter_weights, - destination, - pool, - _dispatcher_4_rows, - _dispatcher_1_row, - WeightsConverterQ7::default(), - ); - return; - } - } - let mut _dispatcher_4_rows: Option< fn(&[u8], usize, &mut [u8], usize, &FilterWeights), > = Some(handle_fixed_rows_4_u8::<3>); From f253e3a6ae62ff0ae64ad0bd789c3cb25d210cff Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Fri, 10 Jan 2025 08:59:23 +0000 Subject: [PATCH 2/3] Dropped i8mm --- Cargo.toml | 3 ++- app/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 94e9301..43dcf51 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,4 +29,5 @@ libc = "0.2.158" default = ["colorspaces"] colorspaces = ["dep:colorutils-rs"] nightly_avx512 = [] -nightly_avx512fp16 = ["nightly_avx512"] \ No newline at end of file +nightly_avx512fp16 = ["nightly_avx512"] +nightly_i8mm = [] \ No newline at end of file diff --git a/app/Cargo.toml b/app/Cargo.toml index 71f0b9f..1c09c6d 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true } +pic-scale = { path = "..", features = ["half"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } From d2028bc6540b592cdd35f1f260eb69ddfaa6c137 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Fri, 10 Jan 2025 08:59:44 +0000 Subject: [PATCH 3/3] Dropped i8mm --- .github/workflows/build_push.yml | 1 - Cargo.toml | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index 315e7b6..873aea2 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -25,7 +25,6 @@ jobs: - run: rustup target add aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu i686-unknown-linux-gnu powerpc-unknown-linux-gnu riscv64gc-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+neon,-fp16" cargo build --target aarch64-unknown-linux-gnu --features half - run: RUSTFLAGS="-C target-feature=+neon,+fp16" cargo build --target aarch64-unknown-linux-gnu --features half - - run: RUSTFLAGS="-C target-feature=+neon,+fp16" cargo build --target aarch64-unknown-linux-gnu --features half,nightly_i8mm - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu - run: cargo build --target powerpc-unknown-linux-gnu - run: cargo build --target riscv64gc-unknown-linux-gnu diff --git a/Cargo.toml b/Cargo.toml index 43dcf51..94e9301 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,5 +29,4 @@ libc = "0.2.158" default = ["colorspaces"] colorspaces = ["dep:colorutils-rs"] nightly_avx512 = [] -nightly_avx512fp16 = ["nightly_avx512"] -nightly_i8mm = [] \ No newline at end of file +nightly_avx512fp16 = ["nightly_avx512"] \ No newline at end of file