Skip to content

Commit

Permalink
Merge pull request #42 from awxkee/dev
Browse files: browse the repository at this point in the history
F16/F32 small improvements
  • Loading branch information
awxkee authored Jan 12, 2025
2 parents aa906ac + bce9b05 commit 9dd1283
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 38 deletions.
28 changes: 0 additions & 28 deletions src/neon/f16_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -579,34 +579,6 @@ pub(super) unsafe fn xvfmlaq_f16(
xreinterpretq_f16_u16(result)
}

// #[cfg(all(target_arch = "aarch64", target_feature = "fhm"))]
// #[inline]
// pub(super) unsafe fn p_xvmlaq_f16(
// a: x_float16x8_t,
// b: x_float16x8_t,
// c: x_float16x8_t,
// ) -> x_float16x8_t {
// xvfmlaq_f16(a, b, c)
// }

// #[inline]
// pub(super) unsafe fn xvmlaq_f16(
// a: x_float16x8_t,
// b: x_float16x8_t,
// c: x_float16x8_t,
// ) -> x_float16x8_t {
// xvaddq_f16(a, xvmulq_f16(b, c))
// }

// #[inline]
// pub(super) unsafe fn xvmla_f16(
// a: x_float16x4_t,
// b: x_float16x4_t,
// c: x_float16x4_t,
// ) -> x_float16x4_t {
// xvadd_f16(a, xvmul_f16(b, c))
// }

/// Floating-point Multiply (vector).
/// This instruction multiplies corresponding floating-point values in the vectors in the two
/// source SIMD&FP registers,
Expand Down
20 changes: 10 additions & 10 deletions src/neon/plane_f32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one(
}

let px = x;
let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr();
dest_ptr.write_unaligned(vaddvq_f32(store));
let dest_ptr = dst.get_unchecked_mut(px);
*dest_ptr = vaddvq_f32(store);

filter_offset += filter_weights.aligned_size;
}
Expand Down Expand Up @@ -279,17 +279,17 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4(
}

let px = x;
let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr();
dest_ptr.write_unaligned(vaddvq_f32(store_0));
let dest_ptr0 = dst.get_unchecked_mut(px);
*dest_ptr0 = vaddvq_f32(store_0);

let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr();
dest_ptr.write_unaligned(vaddvq_f32(store_1));
let dest_ptr1 = dst.get_unchecked_mut(px + dst_stride);
*dest_ptr1 = vaddvq_f32(store_1);

let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr();
dest_ptr.write_unaligned(vaddvq_f32(store_2));
let dest_ptr2 = dst.get_unchecked_mut(px + dst_stride * 2);
*dest_ptr2 = vaddvq_f32(store_2);

let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr();
dest_ptr.write_unaligned(vaddvq_f32(store_3));
let dest_ptr3 = dst.get_unchecked_mut(px + dst_stride * 3);
*dest_ptr3 = vaddvq_f32(store_3);

filter_offset += filter_weights.aligned_size;
}
Expand Down

0 comments on commit 9dd1283

Please sign in to comment.