Skip to content

Commit

Permalink
Merge pull request #8 from awxkee/dev
Browse files — browse the repository at this point in the history
Alpha unpremultiplication fixes
  • Loading branch information
awxkee authored Nov 14, 2024
2 parents e155852 + ff25a01 commit 8c8da42
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 217 deletions.
127 changes: 3 additions & 124 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ workspace = { members = ["app", "wasm"] }

[package]
name = "pic-scale"
version = "0.3.3"
version = "0.3.4"
edition = "2021"
description = "High performance image scaling"
readme = "README.md"
Expand Down
2 changes: 1 addition & 1 deletion app/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021"

[dependencies]
image = { version = "0.25.5", features = ["default", "avif-native", "avif"] }
image = { version = "0.25.5", features = ["default"] }
#image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] }
pic-scale = { path = "..", features = ["half"], default-features = true }
fast_image_resize = { version = "5.0.0", features = [] }
Expand Down
2 changes: 1 addition & 1 deletion app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use pic_scale::{

fn main() {
// test_fast_image();
let img = ImageReader::open("./assets/test_5.avif")
let img = ImageReader::open("./assets/beach_horizon.jpg")
.unwrap()
.decode()
.unwrap();
Expand Down
69 changes: 33 additions & 36 deletions src/avx2/alpha_u16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -351,60 +351,57 @@ pub fn avx_unpremultiply_alpha_rgba_u16(
}
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn avx_unpremultiply_alpha_rgba_u16_row(in_place: &mut [u16], bit_depth: usize) {
let max_colors = (1 << bit_depth) - 1;

let v_scale_colors = unsafe { _mm256_set1_ps(max_colors as f32) };
let v_scale_colors = _mm256_set1_ps(max_colors as f32);

let mut rem = in_place;

unsafe {
for dst in rem.chunks_exact_mut(16 * 4) {
let src_ptr = dst.as_ptr();
let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i);
let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i);
let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i);

let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3);
for dst in rem.chunks_exact_mut(16 * 4) {
let src_ptr = dst.as_ptr();
let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i);
let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i);
let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i);

let is_zero_alpha_mask = _mm256_cmpeq_epi16(pixel.3, _mm256_setzero_si256());
let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3);

let mut low_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(
_mm256_castsi256_si128(pixel.3),
)));
let is_zero_alpha_mask = _mm256_cmpeq_epi16(pixel.3, _mm256_setzero_si256());

low_alpha = _mm256_mul_ps(low_alpha, v_scale_colors);
let mut low_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(
_mm256_castsi256_si128(pixel.3),
)));

let mut high_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(
_mm256_extracti128_si256::<1>(pixel.3),
)));
low_alpha = _mm256_mul_ps(low_alpha, v_scale_colors);

high_alpha = _mm256_mul_ps(high_alpha, v_scale_colors);
let mut high_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(
_mm256_extracti128_si256::<1>(pixel.3),
)));

let mut new_rrr = _mm256_scale_by_alpha(pixel.0, low_alpha, high_alpha);
new_rrr = _mm256_select_si256(is_zero_alpha_mask, pixel.0, new_rrr);
let mut new_ggg = _mm256_scale_by_alpha(pixel.1, low_alpha, high_alpha);
new_ggg = _mm256_select_si256(is_zero_alpha_mask, pixel.1, new_ggg);
let mut new_bbb = _mm256_scale_by_alpha(pixel.2, low_alpha, high_alpha);
new_bbb = _mm256_select_si256(is_zero_alpha_mask, pixel.2, new_bbb);
high_alpha = _mm256_mul_ps(high_alpha, v_scale_colors);

let dst_ptr = dst.as_mut_ptr();
let (d_lane0, d_lane1, d_lane2, d_lane3) =
avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3);
let mut new_rrr = _mm256_scale_by_alpha(pixel.0, low_alpha, high_alpha);
new_rrr = _mm256_select_si256(is_zero_alpha_mask, pixel.0, new_rrr);
let mut new_ggg = _mm256_scale_by_alpha(pixel.1, low_alpha, high_alpha);
new_ggg = _mm256_select_si256(is_zero_alpha_mask, pixel.1, new_ggg);
let mut new_bbb = _mm256_scale_by_alpha(pixel.2, low_alpha, high_alpha);
new_bbb = _mm256_select_si256(is_zero_alpha_mask, pixel.2, new_bbb);

_mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0);
_mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1);
_mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2);
_mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3);
}
let dst_ptr = dst.as_mut_ptr();
let (d_lane0, d_lane1, d_lane2, d_lane3) =
avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3);

rem = rem.chunks_exact_mut(16 * 4).into_remainder();
_mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0);
_mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1);
_mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2);
_mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3);
}

unpremultiply_alpha_rgba_row(rem, bit_depth as u32);
rem = rem.chunks_exact_mut(16 * 4).into_remainder();

unpremultiply_alpha_rgba_row(rem, max_colors);
}

#[inline]
Expand Down
13 changes: 2 additions & 11 deletions src/neon/alpha_u16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::alpha_handle_u16::premultiply_alpha_rgba_row;
use crate::alpha_handle_u16::{premultiply_alpha_rgba_row, unpremultiply_alpha_rgba_row};
use rayon::iter::{IndexedParallelIterator, ParallelIterator};
use rayon::prelude::{ParallelSlice, ParallelSliceMut};
use rayon::ThreadPool;
Expand Down Expand Up @@ -249,16 +249,7 @@ fn neon_unpremultiply_alpha_rgba_row_u16(in_place: &mut [u16], bit_depth: usize)
rem = rem.chunks_exact_mut(8 * 4).into_remainder();
}

for dst in rem.chunks_exact_mut(4) {
let a = dst[3] as u32;
if a != 0 {
let a_recip = 1. / a as f32;
dst[0] = ((dst[0] as u32 * max_colors) as f32 * a_recip) as u16;
dst[1] = ((dst[1] as u32 * max_colors) as f32 * a_recip) as u16;
dst[2] = ((dst[2] as u32 * max_colors) as f32 * a_recip) as u16;
dst[3] = ((a * max_colors) as f32 * a_recip) as u16;
}
}
unpremultiply_alpha_rgba_row(rem, max_colors);
}

pub fn neon_unpremultiply_alpha_rgba_u16(
Expand Down
Loading

0 comments on commit 8c8da42

Please sign in to comment.