Merge pull request #8 from awxkee/dev

Alpha unpremultiplication fixes
awxkee · Nov 14, 2024 · 8c8da42 · 8c8da42
2 parents e155852 + ff25a01
commit 8c8da42
Show file tree

Hide file tree

Showing 8 changed files with 59 additions and 217 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,7 +2,7 @@ workspace = { members = ["app", "wasm"] }
 
 [package]
 name = "pic-scale"
-version = "0.3.3"
+version = "0.3.4"
 edition = "2021"
 description = "High performance image scaling"
 readme = "README.md"

diff --git a/app/Cargo.toml b/app/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-image = { version = "0.25.5", features = ["default", "avif-native", "avif"] }
+image = { version = "0.25.5", features = ["default"] }
 #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] }
 pic-scale = { path = "..", features = ["half"], default-features = true }
 fast_image_resize = { version = "5.0.0", features = [] }

diff --git a/app/src/main.rs b/app/src/main.rs
@@ -17,7 +17,7 @@ use pic_scale::{
 
 fn main() {
     // test_fast_image();
-    let img = ImageReader::open("./assets/test_5.avif")
+    let img = ImageReader::open("./assets/beach_horizon.jpg")
         .unwrap()
         .decode()
         .unwrap();

diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs
@@ -351,60 +351,57 @@ pub fn avx_unpremultiply_alpha_rgba_u16(
     }
 }
 
-#[inline]
 #[target_feature(enable = "avx2")]
 unsafe fn avx_unpremultiply_alpha_rgba_u16_row(in_place: &mut [u16], bit_depth: usize) {
     let max_colors = (1 << bit_depth) - 1;
 
-    let v_scale_colors = unsafe { _mm256_set1_ps(max_colors as f32) };
+    let v_scale_colors = _mm256_set1_ps(max_colors as f32);
 
     let mut rem = in_place;
 
-    unsafe {
-        for dst in rem.chunks_exact_mut(16 * 4) {
-            let src_ptr = dst.as_ptr();
-            let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
-            let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i);
-            let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i);
-            let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i);
-
-            let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3);
+    for dst in rem.chunks_exact_mut(16 * 4) {
+        let src_ptr = dst.as_ptr();
+        let lane0 = _mm256_loadu_si256(src_ptr as *const __m256i);
+        let lane1 = _mm256_loadu_si256(src_ptr.add(16) as *const __m256i);
+        let lane2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i);
+        let lane3 = _mm256_loadu_si256(src_ptr.add(48) as *const __m256i);
 
-            let is_zero_alpha_mask = _mm256_cmpeq_epi16(pixel.3, _mm256_setzero_si256());
+        let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3);
 
-            let mut low_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(
-                _mm256_castsi256_si128(pixel.3),
-            )));
+        let is_zero_alpha_mask = _mm256_cmpeq_epi16(pixel.3, _mm256_setzero_si256());
 
-            low_alpha = _mm256_mul_ps(low_alpha, v_scale_colors);
+        let mut low_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(
+            _mm256_castsi256_si128(pixel.3),
+        )));
 
-            let mut high_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(
-                _mm256_extracti128_si256::<1>(pixel.3),
-            )));
+        low_alpha = _mm256_mul_ps(low_alpha, v_scale_colors);
 
-            high_alpha = _mm256_mul_ps(high_alpha, v_scale_colors);
+        let mut high_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(
+            _mm256_extracti128_si256::<1>(pixel.3),
+        )));
 
-            let mut new_rrr = _mm256_scale_by_alpha(pixel.0, low_alpha, high_alpha);
-            new_rrr = _mm256_select_si256(is_zero_alpha_mask, pixel.0, new_rrr);
-            let mut new_ggg = _mm256_scale_by_alpha(pixel.1, low_alpha, high_alpha);
-            new_ggg = _mm256_select_si256(is_zero_alpha_mask, pixel.1, new_ggg);
-            let mut new_bbb = _mm256_scale_by_alpha(pixel.2, low_alpha, high_alpha);
-            new_bbb = _mm256_select_si256(is_zero_alpha_mask, pixel.2, new_bbb);
+        high_alpha = _mm256_mul_ps(high_alpha, v_scale_colors);
 
-            let dst_ptr = dst.as_mut_ptr();
-            let (d_lane0, d_lane1, d_lane2, d_lane3) =
-                avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3);
+        let mut new_rrr = _mm256_scale_by_alpha(pixel.0, low_alpha, high_alpha);
+        new_rrr = _mm256_select_si256(is_zero_alpha_mask, pixel.0, new_rrr);
+        let mut new_ggg = _mm256_scale_by_alpha(pixel.1, low_alpha, high_alpha);
+        new_ggg = _mm256_select_si256(is_zero_alpha_mask, pixel.1, new_ggg);
+        let mut new_bbb = _mm256_scale_by_alpha(pixel.2, low_alpha, high_alpha);
+        new_bbb = _mm256_select_si256(is_zero_alpha_mask, pixel.2, new_bbb);
 
-            _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0);
-            _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1);
-            _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2);
-            _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3);
-        }
+        let dst_ptr = dst.as_mut_ptr();
+        let (d_lane0, d_lane1, d_lane2, d_lane3) =
+            avx_interleave_rgba_epi16(new_rrr, new_ggg, new_bbb, pixel.3);
 
-        rem = rem.chunks_exact_mut(16 * 4).into_remainder();
+        _mm256_storeu_si256(dst_ptr as *mut __m256i, d_lane0);
+        _mm256_storeu_si256(dst_ptr.add(16) as *mut __m256i, d_lane1);
+        _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, d_lane2);
+        _mm256_storeu_si256(dst_ptr.add(48) as *mut __m256i, d_lane3);
     }
 
-    unpremultiply_alpha_rgba_row(rem, bit_depth as u32);
+    rem = rem.chunks_exact_mut(16 * 4).into_remainder();
+
+    unpremultiply_alpha_rgba_row(rem, max_colors);
 }
 
 #[inline]

diff --git a/src/neon/alpha_u16.rs b/src/neon/alpha_u16.rs
@@ -26,7 +26,7 @@
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-use crate::alpha_handle_u16::premultiply_alpha_rgba_row;
+use crate::alpha_handle_u16::{premultiply_alpha_rgba_row, unpremultiply_alpha_rgba_row};
 use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 use rayon::prelude::{ParallelSlice, ParallelSliceMut};
 use rayon::ThreadPool;
@@ -249,16 +249,7 @@ fn neon_unpremultiply_alpha_rgba_row_u16(in_place: &mut [u16], bit_depth: usize)
         rem = rem.chunks_exact_mut(8 * 4).into_remainder();
     }
 
-    for dst in rem.chunks_exact_mut(4) {
-        let a = dst[3] as u32;
-        if a != 0 {
-            let a_recip = 1. / a as f32;
-            dst[0] = ((dst[0] as u32 * max_colors) as f32 * a_recip) as u16;
-            dst[1] = ((dst[1] as u32 * max_colors) as f32 * a_recip) as u16;
-            dst[2] = ((dst[2] as u32 * max_colors) as f32 * a_recip) as u16;
-            dst[3] = ((a * max_colors) as f32 * a_recip) as u16;
-        }
-    }
+    unpremultiply_alpha_rgba_row(rem, max_colors);
 }
 
 pub fn neon_unpremultiply_alpha_rgba_u16(