Vertical f16, some bugfixes

awxkee · Jul 25, 2024 · 56cd389 · 56cd389
1 parent a4601ce
commit 56cd389
Show file tree

Hide file tree

Showing 8 changed files with 314 additions and 56 deletions.
diff --git a/app/src/main.rs b/app/src/main.rs
@@ -32,17 +32,17 @@ fn main() {
     scaler.set_threading_policy(ThreadingPolicy::Single);
     let start_time = Instant::now();
 
-    let mut converted_bytes: Vec<f16> = bytes
+    let mut converted_bytes: Vec<f32> = bytes
         .iter()
-        .map(|&x| f16::from_f32(x as f32 / 255f32))
+        .map(|&x| x as f32 / 255f32)
         .collect();
 
-    let store = ImageStore::<f16, 4>::from_slice(
+    let store = ImageStore::<f32, 4>::from_slice(
         &mut converted_bytes,
         dimensions.0 as usize,
         dimensions.1 as usize,
     );
-    let resized = scaler.resize_rgba_f16(
+    let resized = scaler.resize_rgba_f32(
         ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
         store,
         true,
@@ -58,7 +58,7 @@ fn main() {
     let dst: Vec<u8> = resized
         .as_bytes()
         .iter()
-        .map(|&x| (x.to_f32() * 255f32) as u8)
+        .map(|&x| (x * 255f32) as u8)
         .collect();
 
     if resized.channels == 4 {

diff --git a/src/acceleration_feature.rs b/src/acceleration_feature.rs
diff --git a/src/avx2/alpha_f32.rs b/src/avx2/alpha_f32.rs
@@ -57,9 +57,9 @@ pub fn avx_unpremultiply_alpha_rgba_f32(dst: &mut [f32], src: &[f32], width: usi
                 let pixel_offset = offset + px;
                 let src_ptr = src.as_ptr().add(pixel_offset);
                 let rgba0 = _mm256_loadu_ps(src_ptr);
-                let rgba1 = _mm256_loadu_ps(src_ptr.add(4));
-                let rgba2 = _mm256_loadu_ps(src_ptr.add(8));
-                let rgba3 = _mm256_loadu_ps(src_ptr.add(12));
+                let rgba1 = _mm256_loadu_ps(src_ptr.add(8));
+                let rgba2 = _mm256_loadu_ps(src_ptr.add(16));
+                let rgba3 = _mm256_loadu_ps(src_ptr.add(24));
 
                 let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3);
 
@@ -71,9 +71,9 @@ pub fn avx_unpremultiply_alpha_rgba_f32(dst: &mut [f32], src: &[f32], width: usi
 
                 let dst_ptr = dst.as_mut_ptr().add(offset + px);
                 _mm256_storeu_ps(dst_ptr, rgba0);
-                _mm256_storeu_ps(dst_ptr.add(4), rgba1);
-                _mm256_storeu_ps(dst_ptr.add(8), rgba2);
-                _mm256_storeu_ps(dst_ptr.add(12), rgba3);
+                _mm256_storeu_ps(dst_ptr.add(8), rgba1);
+                _mm256_storeu_ps(dst_ptr.add(16), rgba2);
+                _mm256_storeu_ps(dst_ptr.add(24), rgba3);
 
                 _cx += 8;
             }

diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
@@ -3,10 +3,13 @@ mod alpha_f16;
 mod alpha_f32;
 mod alpha_u8;
 pub mod utils;
+mod vertical_f16;
 
 #[cfg(target_feature = "f16c")]
 pub use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16};
 pub use alpha_f32::avx_premultiply_alpha_rgba_f32;
 pub use alpha_f32::avx_unpremultiply_alpha_rgba_f32;
 pub use alpha_u8::avx_premultiply_alpha_rgba;
 pub use alpha_u8::avx_unpremultiply_alpha_rgba;
+#[cfg(target_feature = "f16c")]
+pub use vertical_f16::convolve_vertical_avx_row_f16;
diff --git a/src/avx2/utils.rs b/src/avx2/utils.rs
@@ -32,6 +32,19 @@ use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
+#[cfg(not(target_feature = "fma"))] // Fallback variant: compiled only when the `fma` target feature is NOT enabled at build time.
+#[inline]
+pub unsafe fn _mm256_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 { // Returns `a + b * c`; `a` is the addend, `b` and `c` are the factors.
+    return _mm256_add_ps(_mm256_mul_ps(b, c), a); // Separate multiply followed by add (no fused instruction on this path).
+}
+
+#[cfg(target_feature = "fma")] // FMA variant: compiled only when the `fma` target feature is enabled at build time.
+#[inline]
+pub unsafe fn _mm256_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 { // Returns `a + b * c`; same argument order as the non-FMA fallback.
+    return _mm256_fmadd_ps(b, c, a); // `_mm256_fmadd_ps(x, y, z)` yields `x * y + z`, so the addend `a` is passed last.
+}
+
+
 #[inline(always)]
 pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
     ((z << 6) | (y << 4) | (x << 2) | w) as i32