Skip to content

Commit

Permalink
Horizontal convolution SSE fixes
Browse files — browse the repository at this point in the history
  • Loading branch information
awxkee committed Jun 20, 2024
1 parent 16793ca commit 5468125
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 39 deletions.
8 changes: 4 additions & 4 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ fn main() {
let dimensions = img.dimensions();
let mut bytes = Vec::from(img.as_bytes());

let mut scaler = XYZScaler::new(ResamplingFunction::Hann);
let mut scaler = XYZScaler::new(ResamplingFunction::Lanczos3);
scaler.set_threading_policy(ThreadingPolicy::Single);
// let store =
// ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize);
Expand All @@ -39,7 +39,7 @@ fn main() {
let resized = scaler.resize_rgba(
ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
store,
true,
false,
);

let elapsed_time = start_time.elapsed();
Expand Down Expand Up @@ -111,7 +111,7 @@ fn main() {
}

fn test_fast_image() {
let img = ImageReader::open("./assets/nasa-4928x3279.png")
let img = ImageReader::open("./assets/asset_5.png")
.unwrap()
.decode()
.unwrap();
Expand All @@ -121,7 +121,7 @@ fn test_fast_image() {

let start_time = Instant::now();

let pixel_type: PixelType = PixelType::U8x3;
let pixel_type: PixelType = PixelType::U8x4;

let src_image = Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap();

Expand Down
14 changes: 7 additions & 7 deletions src/scaler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ impl Scaler {
let base_size = (filter_base_size * filter_scale_cutoff).round() as usize;
// Kernel size must always be odd
let kernel_size = base_size * 2 + 1usize;
let filter_radius = base_size as i32;
let filter_radius = base_size as f32;
let filter_scale = 1f32 / filter_scale_cutoff;
let mut weights: Vec<f32> = vec![0f32; kernel_size * out_size];
let mut local_filters = vec![0f32; kernel_size];
Expand All @@ -85,9 +85,8 @@ impl Scaler {
let mut weights_sum: f32 = 0f32;
let mut local_filter_iteration = 0usize;

let start = (center_x - filter_radius as f32).floor().max(0f32) as usize;
let end = ((center_x + filter_radius as f32).ceil().min(in_size as f32) as usize)
.min(start + kernel_size);
let start = (center_x - filter_radius).floor().max(0f32) as usize;
let end = ((center_x + filter_radius).ceil().min(in_size as f32) as usize).min(start + kernel_size);

let center = center_x - 0.5f32;

Expand Down Expand Up @@ -115,6 +114,7 @@ impl Scaler {
};
weight = window * resampling_function(x_kernel_scaled);
} else {
let dx = dx.abs();
weight = resampling_function(dx * filter_scale);
}
weights_sum += weight;
Expand Down Expand Up @@ -161,7 +161,7 @@ impl Scaler {
kernel_size,
kernel_size,
out_size,
filter_radius,
filter_radius as i32,
bounds,
);
}
Expand Down Expand Up @@ -273,14 +273,14 @@ impl Scaling for Scaler {
allocated_store_vertical.resize(store.width * 4 * new_size.height, 0f32);
let mut new_image_vertical =
ImageStore::<f32, 4>::new(allocated_store_vertical, store.width, new_size.height);
let horizontal_filters = self.generate_weights(store.width, new_size.width);
let vertical_filters = self.generate_weights(store.height, new_image_vertical.height);
store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool);

let mut allocated_store_horizontal: Vec<f32> = vec![];
allocated_store_horizontal.resize(new_size.width * 4 * new_size.height, 0f32);
let mut new_image_horizontal =
ImageStore::<f32, 4>::new(allocated_store_horizontal, new_size.width, new_size.height);
let horizontal_filters = self.generate_weights(store.width, new_size.width);
new_image_vertical.convolve_horizontal(
horizontal_filters,
&mut new_image_horizontal,
Expand Down Expand Up @@ -326,11 +326,11 @@ impl Scaling for Scaler {
.get_pool(ImageSize::new(new_size.width, new_size.height));

let mut new_image_vertical = ImageStore::<u8, 4>::alloc(src_store.width, new_size.height);
let horizontal_filters = self.generate_weights(src_store.width, new_size.width);
let vertical_filters = self.generate_weights(src_store.height, new_image_vertical.height);
src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool);

let mut new_image_horizontal = ImageStore::<u8, 4>::alloc(new_size.width, new_size.height);
let horizontal_filters = self.generate_weights(src_store.width, new_size.width);
new_image_vertical.convolve_horizontal(
horizontal_filters,
&mut new_image_horizontal,
Expand Down
36 changes: 21 additions & 15 deletions src/sse/rgb_f32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,9 @@ pub fn convolve_horizontal_rgb_sse_row_one_f32(
let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned());
let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned());
let filter_start = jx + bounds.start;
store = convolve_horizontal_parts_4_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0,
weight0,
weight1,
Expand All @@ -128,8 +129,9 @@ pub fn convolve_horizontal_rgb_sse_row_one_f32(
let ptr = weights_ptr.add(jx + filter_offset);
let weight0 = _mm_set1_ps(ptr.read_unaligned());
let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
let filter_start = jx + bounds.start;
store = convolve_horizontal_parts_2_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0,
weight0,
weight1,
Expand All @@ -141,8 +143,9 @@ pub fn convolve_horizontal_rgb_sse_row_one_f32(
while jx < bounds.size {
let ptr = weights_ptr.add(jx + filter_offset);
let weight0 = _mm_set1_ps(ptr.read_unaligned());
let filter_start = jx + bounds.start;
store = convolve_horizontal_parts_one_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0,
weight0,
store,
Expand Down Expand Up @@ -194,8 +197,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned());
let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned());
let filter_start = jx + bounds.start;
store_0 = convolve_horizontal_parts_4_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0,
weight0,
weight1,
Expand All @@ -204,7 +208,7 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
store_0,
);
store_1 = convolve_horizontal_parts_4_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride),
weight0,
weight1,
Expand All @@ -213,7 +217,7 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
store_1,
);
store_2 = convolve_horizontal_parts_4_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 2),
weight0,
weight1,
Expand All @@ -222,7 +226,7 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
store_2,
);
store_3 = convolve_horizontal_parts_4_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 3),
weight0,
weight1,
Expand All @@ -237,29 +241,30 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
let ptr = weights_ptr.add(jx + filter_offset);
let weight0 = _mm_set1_ps(ptr.read_unaligned());
let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
let filter_start = jx + bounds.start;
store_0 = convolve_horizontal_parts_2_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0,
weight0,
weight1,
store_0,
);
store_1 = convolve_horizontal_parts_2_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride),
weight0,
weight1,
store_1,
);
store_2 = convolve_horizontal_parts_2_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 2),
weight0,
weight1,
store_2,
);
store_3 = convolve_horizontal_parts_2_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 3),
weight0,
weight1,
Expand All @@ -271,26 +276,27 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
while jx < bounds.size {
let ptr = weights_ptr.add(jx + filter_offset);
let weight0 = _mm_set1_ps(ptr.read_unaligned());
let filter_start = jx + bounds.start;
store_0 = convolve_horizontal_parts_one_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0,
weight0,
store_0,
);
store_1 = convolve_horizontal_parts_one_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride),
weight0,
store_1,
);
store_2 = convolve_horizontal_parts_one_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 2),
weight0,
store_2,
);
store_3 = convolve_horizontal_parts_one_rgb_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 3),
weight0,
store_3,
Expand Down
29 changes: 16 additions & 13 deletions src/sse/rgba_f32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,15 +147,16 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
let mut store_1 = zeros;
let mut store_2 = zeros;
let mut store_3 = zeros;

while jx + 4 < bounds.size {
let ptr = weights_ptr.add(jx + filter_offset);
let weight0 = _mm_set1_ps(ptr.read_unaligned());
let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned());
let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned());
let filter_start = jx + bounds.start;

store_0 = convolve_horizontal_parts_4_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0,
weight0,
weight1,
Expand All @@ -164,7 +165,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
store_0,
);
store_1 = convolve_horizontal_parts_4_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride),
weight0,
weight1,
Expand All @@ -173,7 +174,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
store_1,
);
store_2 = convolve_horizontal_parts_4_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 2),
weight0,
weight1,
Expand All @@ -182,7 +183,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
store_2,
);
store_3 = convolve_horizontal_parts_4_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 3),
weight0,
weight1,
Expand All @@ -197,29 +198,30 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
let ptr = weights_ptr.add(jx + filter_offset);
let weight0 = _mm_set1_ps(ptr.read_unaligned());
let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
let filter_start = jx + bounds.start;
store_0 = convolve_horizontal_parts_2_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0,
weight0,
weight1,
store_0,
);
store_1 = convolve_horizontal_parts_2_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride),
weight0,
weight1,
store_1,
);
store_2 = convolve_horizontal_parts_2_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 2),
weight0,
weight1,
store_2,
);
store_3 = convolve_horizontal_parts_2_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 3),
weight0,
weight1,
Expand All @@ -230,27 +232,28 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(

while jx < bounds.size {
let ptr = weights_ptr.add(jx + filter_offset);
let filter_start = jx + bounds.start;
let weight0 = _mm_set1_ps(ptr.read_unaligned());
store_0 = convolve_horizontal_parts_one_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0,
weight0,
store_0,
);
store_1 = convolve_horizontal_parts_one_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride),
weight0,
store_1,
);
store_2 = convolve_horizontal_parts_one_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 2),
weight0,
store_2,
);
store_3 = convolve_horizontal_parts_one_rgba_f32(
bounds.start,
filter_start,
unsafe_source_ptr_0.add(src_stride * 3),
weight0,
store_3,
Expand Down

0 comments on commit 5468125

Please sign in to comment.