Skip to content

Commit

Permalink
Vertical f16, some bugfixes
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Jul 25, 2024
1 parent a4601ce commit 56cd389
Show file tree
Hide file tree
Showing 8 changed files with 314 additions and 56 deletions.
10 changes: 5 additions & 5 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,17 @@ fn main() {
scaler.set_threading_policy(ThreadingPolicy::Single);
let start_time = Instant::now();

let mut converted_bytes: Vec<f16> = bytes
let mut converted_bytes: Vec<f32> = bytes
.iter()
.map(|&x| f16::from_f32(x as f32 / 255f32))
.map(|&x| x as f32 / 255f32)
.collect();

let store = ImageStore::<f16, 4>::from_slice(
let store = ImageStore::<f32, 4>::from_slice(
&mut converted_bytes,
dimensions.0 as usize,
dimensions.1 as usize,
);
let resized = scaler.resize_rgba_f16(
let resized = scaler.resize_rgba_f32(
ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
store,
true,
Expand All @@ -58,7 +58,7 @@ fn main() {
let dst: Vec<u8> = resized
.as_bytes()
.iter()
.map(|&x| (x.to_f32() * 255f32) as u8)
.map(|&x| (x * 255f32) as u8)
.collect();

if resized.channels == 4 {
Expand Down
43 changes: 0 additions & 43 deletions src/acceleration_feature.rs

This file was deleted.

12 changes: 6 additions & 6 deletions src/avx2/alpha_f32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ pub fn avx_unpremultiply_alpha_rgba_f32(dst: &mut [f32], src: &[f32], width: usi
let pixel_offset = offset + px;
let src_ptr = src.as_ptr().add(pixel_offset);
let rgba0 = _mm256_loadu_ps(src_ptr);
let rgba1 = _mm256_loadu_ps(src_ptr.add(4));
let rgba2 = _mm256_loadu_ps(src_ptr.add(8));
let rgba3 = _mm256_loadu_ps(src_ptr.add(12));
let rgba1 = _mm256_loadu_ps(src_ptr.add(8));
let rgba2 = _mm256_loadu_ps(src_ptr.add(16));
let rgba3 = _mm256_loadu_ps(src_ptr.add(24));

let (rrr, ggg, bbb, aaa) = avx_deinterleave_rgba_ps(rgba0, rgba1, rgba2, rgba3);

Expand All @@ -71,9 +71,9 @@ pub fn avx_unpremultiply_alpha_rgba_f32(dst: &mut [f32], src: &[f32], width: usi

let dst_ptr = dst.as_mut_ptr().add(offset + px);
_mm256_storeu_ps(dst_ptr, rgba0);
_mm256_storeu_ps(dst_ptr.add(4), rgba1);
_mm256_storeu_ps(dst_ptr.add(8), rgba2);
_mm256_storeu_ps(dst_ptr.add(12), rgba3);
_mm256_storeu_ps(dst_ptr.add(8), rgba1);
_mm256_storeu_ps(dst_ptr.add(16), rgba2);
_mm256_storeu_ps(dst_ptr.add(24), rgba3);

_cx += 8;
}
Expand Down
3 changes: 3 additions & 0 deletions src/avx2/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ mod alpha_f16;
mod alpha_f32;
mod alpha_u8;
pub mod utils;
mod vertical_f16;

#[cfg(target_feature = "f16c")]
pub use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16};

Check warning on line 9 in src/avx2/mod.rs

View workflow job for this annotation

GitHub Actions / Build

unused imports: `avx_premultiply_alpha_rgba_f16` and `avx_unpremultiply_alpha_rgba_f16`
pub use alpha_f32::avx_premultiply_alpha_rgba_f32;
pub use alpha_f32::avx_unpremultiply_alpha_rgba_f32;
pub use alpha_u8::avx_premultiply_alpha_rgba;
pub use alpha_u8::avx_unpremultiply_alpha_rgba;
#[cfg(target_feature = "f16c")]
pub use vertical_f16::convolve_vertical_avx_row_f16;

Check warning on line 15 in src/avx2/mod.rs

View workflow job for this annotation

GitHub Actions / Build

unused import: `vertical_f16::convolve_vertical_avx_row_f16`
13 changes: 13 additions & 0 deletions src/avx2/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,19 @@ use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Multiply-accumulate over eight packed `f32` lanes: returns `a + b * c`.
///
/// This variant is compiled when the `fma` target feature is not enabled,
/// so the result is produced with a separate multiply followed by an add.
///
/// # Safety
///
/// Calls AVX intrinsics; the caller must ensure the executing CPU supports AVX.
#[cfg(not(target_feature = "fma"))]
#[inline]
pub unsafe fn _mm256_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    let product = _mm256_mul_ps(b, c);
    _mm256_add_ps(product, a)
}

/// Multiply-accumulate over eight packed `f32` lanes: returns `a + b * c`.
///
/// This variant is compiled only when the `fma` target feature is enabled
/// and forwards to the `_mm256_fmadd_ps` intrinsic, with the addend `a`
/// passed as the intrinsic's last operand.
///
/// # Safety
///
/// Calls an FMA intrinsic; the caller must ensure the executing CPU supports
/// the AVX and FMA instruction sets.
#[cfg(target_feature = "fma")]
#[inline]
pub unsafe fn _mm256_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    _mm256_fmadd_ps(b, c, a)
}


#[inline(always)]
pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
((z << 6) | (y << 4) | (x << 2) | w) as i32
Expand Down
Loading

0 comments on commit 56cd389

Please sign in to comment.