use crate::data::GpuMemoryPointers;
use ocl;
use ocl::core::Int2;
use ocl::{Buffer, Context, Device, Kernel, Program, Queue};
/// Runs the two-pass segmented NCC (normalized cross-correlation) template
/// match on the GPU and returns the surviving match candidates.
///
/// Flow: upload the image integral tables and the `precision` scalar into the
/// persistent GPU buffers, enqueue the pre-built fast-pass kernel, read back
/// the number of candidate positions it kept, then build and enqueue a
/// slow-pass kernel sized to those candidates (one workgroup per candidate)
/// and read back its results.
///
/// # Arguments
/// * `v2_kernel_fast_pass` - pre-built fast-pass kernel; its args are assumed
///   to already be bound to `gpu_memory_pointers` buffers by the caller.
/// * `image_integral` / `squared_image_integral` - summed-area tables of the
///   search image and of its squared pixel values.
/// * `image_width`..`template_height` - dimensions forwarded to the kernel.
/// * `segments_sum_squared_deviation_slow` / `segments_mean_slow` - template
///   statistics for the slow (precise) segment set.
/// * `slow_expected_corr` - correlation threshold for the slow pass.
/// * `queue` / `program` - OpenCL queue and compiled program to build the
///   slow-pass kernel against.
/// * `gpu_memory_pointers` - persistent device buffers shared across calls.
/// * `slow_segment_count`..`workgroup_size` - slow-pass work partitioning.
/// * `precision` - scales the final acceptance threshold.
///
/// # Returns
/// `(x, y, correlation)` triples that passed `(slow_expected_corr - 0.01) *
/// precision`, sorted by correlation descending. Empty when either pass finds
/// no candidates.
///
/// # Errors
/// Propagates any OpenCL error from buffer transfers, kernel build, or enqueue.
pub fn gui_opencl_ncc_v2(
    v2_kernel_fast_pass: &Kernel,
    image_integral: &[u64],
    squared_image_integral: &[u64],
    image_width: u32,
    image_height: u32,
    template_width: u32,
    template_height: u32,
    segments_sum_squared_deviation_slow: f32,
    segments_mean_slow: f32,
    slow_expected_corr: f32,
    queue: &Queue,
    program: &Program,
    gpu_memory_pointers: &GpuMemoryPointers,
    slow_segment_count: i32,
    remainder_segments_slow: i32,
    segments_processed_by_thread_slow: i32,
    workgroup_size: i32,
    precision: f32,
) -> ocl::Result<Vec<(u32, u32, f32)>> {
    // Upload per-call inputs into the persistent device buffers.
    gpu_memory_pointers
        .buffer_image_integral
        .write(image_integral)
        .enq()?;
    gpu_memory_pointers
        .buffer_image_integral_squared
        .write(squared_image_integral)
        .enq()?;
    // Single scalar: a stack-backed slice avoids a needless heap allocation.
    gpu_memory_pointers
        .buffer_precision
        .write(&[precision][..])
        .enq()?;

    // SAFETY: the kernel's arguments were bound at build time; the caller
    // guarantees the backing buffers stay alive for the enqueue.
    unsafe {
        v2_kernel_fast_pass.enq()?;
    }

    // How many candidate positions survived the fast pass?
    let mut fast_count_host = [0i32; 1];
    gpu_memory_pointers
        .buffer_valid_corr_count_fast
        .read(&mut fast_count_host[..])
        .enq()?;
    let valid_corr_count = fast_count_host[0] as usize;
    if valid_corr_count == 0 {
        return Ok(Vec::new());
    }

    // One full workgroup per surviving candidate.
    let new_global_work_size = valid_corr_count * workgroup_size as usize;
    // NOTE: the argument order below must match the OpenCL kernel signature
    // of `v2_segmented_match_integral_slow_pass` exactly — do not reorder.
    let v2_kernel_slow_pass = Kernel::builder()
        .program(program)
        .name("v2_segmented_match_integral_slow_pass")
        .queue(queue.clone())
        .global_work_size(new_global_work_size)
        .arg(&gpu_memory_pointers.buffer_image_integral)
        .arg(&gpu_memory_pointers.buffer_image_integral_squared)
        .arg(&gpu_memory_pointers.segments_slow_buffer)
        .arg(&gpu_memory_pointers.segment_slow_values_buffer)
        .arg(&slow_segment_count)
        .arg(&segments_mean_slow)
        .arg(&segments_sum_squared_deviation_slow)
        .arg(&gpu_memory_pointers.buffer_results_slow_positions_v2)
        .arg(&gpu_memory_pointers.buffer_results_slow_corrs_v2)
        .arg(&(image_width as i32))
        .arg(&(image_height as i32))
        .arg(&(template_width as i32))
        .arg(&(template_height as i32))
        .arg(&slow_expected_corr)
        .arg(&remainder_segments_slow)
        .arg(&segments_processed_by_thread_slow)
        .arg(&workgroup_size)
        // Workgroup-local scratch: two u64 accumulator slots plus one u64
        // slot per thread in the workgroup.
        .arg_local::<u64>(1)
        .arg_local::<u64>(1)
        .arg_local::<u64>(workgroup_size as usize)
        .arg(&gpu_memory_pointers.buffer_valid_corr_count_slow)
        .arg(&gpu_memory_pointers.buffer_valid_corr_count_fast)
        .arg(&gpu_memory_pointers.buffer_results_fast_v2)
        .arg(&gpu_memory_pointers.buffer_precision)
        .build()?;

    // SAFETY: all buffer args bound above outlive the enqueue; local scratch
    // sizes match what the kernel expects for this workgroup size.
    unsafe {
        v2_kernel_slow_pass.enq()?;
    }

    // How many candidates survived the slow (precise) pass?
    let mut slow_count_host = [0i32; 1];
    gpu_memory_pointers
        .buffer_valid_corr_count_slow
        .read(&mut slow_count_host[..])
        .enq()?;
    let valid_corr_count_slow = slow_count_host[0] as usize;
    if valid_corr_count_slow == 0 {
        return Ok(Vec::new());
    }
    // NOTE(review): the valid-count buffers themselves are not zeroed here;
    // presumably the kernels reset them on entry — confirm, otherwise the
    // counters would accumulate across calls.

    // Read back only the entries the slow pass actually produced.
    let mut slow_pass_corrs = vec![0.0f32; valid_corr_count_slow];
    let mut slow_pass_positions = vec![Int2::zero(); valid_corr_count_slow];
    gpu_memory_pointers
        .buffer_results_slow_positions_v2
        .read(&mut slow_pass_positions)
        .enq()?;
    gpu_memory_pointers
        .buffer_results_slow_corrs_v2
        .read(&mut slow_pass_corrs)
        .enq()?;
    // Zero the consumed prefix of the result buffers so stale entries cannot
    // leak into the next invocation.
    gpu_memory_pointers
        .buffer_results_slow_corrs_v2
        .write(&vec![0.0f32; valid_corr_count_slow])
        .enq()?;
    gpu_memory_pointers
        .buffer_results_slow_positions_v2
        .write(&vec![Int2::zero(); valid_corr_count_slow])
        .enq()?;

    // Pair positions with correlations, apply the scaled acceptance
    // threshold, and sort best-first. `partial_cmp` can only fail on NaN,
    // which we treat as equal rather than panicking.
    let mut result_vec: Vec<(u32, u32, f32)> = slow_pass_positions
        .iter()
        .zip(slow_pass_corrs.iter())
        .map(|(pos, &corr)| (pos[0] as u32, pos[1] as u32, corr))
        .collect();
    result_vec.retain(|&(_, _, value)| value >= (slow_expected_corr - 0.01) * precision);
    result_vec
        .sort_unstable_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
    Ok(result_vec)
}