rustautogui 2.5.0

#[cfg(feature = "opencl")]
pub const OCL_KERNEL: &str = r#"
inline ulong sum_region(
    __global const ulong* integral,
    int x,
    int y,
    int width,
    int height,
    int image_width
) {
    int x2 = x + width - 1;
    int y2 = y + height - 1;

    ulong br = integral[y2 * image_width + x2];
    ulong bl = (x == 0) ? 0 : integral[y2 * image_width + (x - 1)];
    ulong tr = (y == 0) ? 0 : integral[(y - 1) * image_width + x2];
    ulong tl = (x == 0 || y == 0) ? 0 : integral[(y - 1) * image_width + (x - 1)];
    long sum = (long)br + (long)tl - (long)bl - (long)tr;


    return (ulong)sum;
}


inline ulong sum_region_squared(
    __global const ulong* integral_sq,
    int x,
    int y,
    int width,
    int height,
    int image_width
) {
    int x2 = x + width - 1;
    int y2 = y + height - 1;

    ulong br = integral_sq[y2 * image_width + x2];
    ulong bl = (x == 0) ? 0 : integral_sq[y2 * image_width + (x - 1)];
    ulong tr = (y == 0) ? 0 : integral_sq[(y - 1) * image_width + x2];
    ulong tl = (x == 0 || y == 0) ? 0 : integral_sq[(y - 1) * image_width + (x - 1)];
    long sum = (long)br + (long)tl - (long)bl - (long)tr;
    return (ulong)sum;
}


__kernel void segmented_match_integral(
    __global const ulong* integral,
    __global const ulong* integral_sq,
    __global const int4* segments,
    __global const int4* segments_slow,
    __global const float* segment_values,
    __global const float* segment_values_slow,
    const int num_segments,
    const int num_segments_slow,
    const float template_mean,
    const float template_mean_slow,
    const float template_sq_dev,
    const float template_sq_dev_slow,
    __global float* results,
    const int image_width,
    const int image_height,
    const int template_width,
    const int template_height,
    const float min_expected_corr,
    __global float* precision_buff
) {
    float precision = precision_buff[0];
    int idx = get_global_id(0);
    int result_width = image_width - template_width + 1;
    int result_height = image_height - template_height + 1;

    if (idx >= result_width * result_height) return;
    // results[idx] = 0.0;
    int x = idx % result_width;
    int y = idx / result_width;


    ulong patch_sum = sum_region(integral, x, y, template_width, template_height, image_width);
    ulong patch_sq_sum = sum_region_squared(integral_sq, x, y, template_width, template_height, image_width);
    


    float area = (float)(template_width * template_height);
    float mean_img = (float)(patch_sum) / area;
    float var_img = (float)(patch_sq_sum) - ((float)(patch_sum) * (float)(patch_sum)) / area;
    
    float nominator = 0.0f;
    for (int i = 0; i < num_segments; i++) {
        int4 seg = segments[i];
        float seg_val = segment_values[i];
        int seg_area = seg.z * seg.w;

        ulong region_sum = sum_region(integral, x + seg.x, y + seg.y, seg.z, seg.w, image_width);

        nominator += ((float)(region_sum) - mean_img * seg_area) * (seg_val - template_mean);
    }

    float denominator = sqrt(var_img * template_sq_dev);
    
    float corr = (denominator != 0.0f) ? (nominator / denominator) : -1.0f;



    if (corr < (min_expected_corr - 0.001)* precision) {
        results[idx] = corr;
        return;
    } else {
        float denominator_slow = sqrt(var_img * template_sq_dev_slow);
        float nominator_slow = 0.0f;
        for (int i = 0; i < num_segments_slow; i++) {
            int4 seg_slow = segments_slow[i];
            float seg_val_slow = segment_values_slow[i];
            int seg_area = seg_slow.z * seg_slow.w;

            ulong region_sum = sum_region(integral, x + seg_slow.x, y + seg_slow.y, seg_slow.z, seg_slow.w, image_width);

            nominator_slow += ((float)(region_sum) - mean_img * seg_area) * (seg_val_slow - template_mean);
        }
        float corr_slow = (denominator_slow != 0.0f) ? (nominator_slow / denominator_slow) : -1.0f;
        results[idx] = corr_slow;
    }    
}


__kernel void v2_segmented_match_integral_fast_pass(
    __global const ulong* integral,
    __global const ulong* integral_sq,
    __global const int4* segments,
    __global const float* segment_values,
    const int num_segments,
    const float template_mean,
    const float template_sq_dev,
    __global int2* results,
    const int image_width,
    const int image_height,
    const int template_width,
    const int template_height,
    const float min_expected_corr,
    const int remainder_segments_fast,
    const int segments_per_thread_fast,
    const int pixels_per_workgroup,
    const int workgroup_size,
    __local ulong* sum_template_region_buff,
    __local ulong* sum_sq_template_region_buff,
    __local float* thread_segment_sum_buff,
    __global int* valid_corr_count,
    __global float* precision_buff
) {
    int global_id = get_global_id(0);
    int local_id = get_local_id(0);
    int workgroup_id = get_group_id(0);
    int result_w = image_width - template_width;
    if (local_id == 3 && global_id == 2) {
        valid_corr_count[0] == 0;
    }


    // num_segments is also count of threads per pixel for fast img
    if (local_id * segments_per_thread_fast +  remainder_segments_fast >= num_segments * pixels_per_workgroup) return ; // this solves more segments per thread

    int pixel_pos = (workgroup_id * pixels_per_workgroup) + (local_id / num_segments) ;
    int image_x = pixel_pos % result_w;
    int image_y = pixel_pos / result_w;

    // first sum the region of template area for numerator calculations
    // we do it with first threads for each x,y position which workgroup processes
    // if there are 5 pixels processed, local_id 0-4 should process sum regions for each position, 5-9 for squared
    ulong patch_sum = 0;
    if (local_id < pixels_per_workgroup) {
        patch_sum = sum_region(integral, image_x, image_y, template_width, template_height, image_width);
        sum_template_region_buff[local_id] = patch_sum;
        
    }
    
    // there will never be less than 2 segments 
    // meaning pixels per workgroup is never greater than workgroup_size / 2 
    if (local_id >= pixels_per_workgroup && local_id < pixels_per_workgroup * 2) {
        ulong patch_sq_sum = sum_region_squared(integral_sq, image_x, image_y, template_width, template_height, image_width);
        sum_sq_template_region_buff[local_id % pixels_per_workgroup] = patch_sq_sum;
    }
    
    int result_width = image_width - template_width + 1;
    int result_height = image_height - template_height + 1;
    float area = (float)(template_width * template_height);

    // wait  for threads to complete writing sum_area
    barrier(CLK_LOCAL_MEM_FENCE);

    
    float mean_img = (float)(sum_template_region_buff[local_id / num_segments]) / area;


    // this is to cover if we have more than 1 segment per thread. This method 
    // with remainder allows us to keep all threads working
    int remainder_offset = 0;
    int remainder_addition = 0;
    if (remainder_segments_fast > 0) {
        if (local_id >= remainder_segments_fast) {
            remainder_offset = remainder_segments_fast;
        } else {
            remainder_offset = local_id;
            remainder_addition = 1; 
        }
    
    }

    
    
    // AUDIT - DOUBLE CHECK THIS LOGIC
    int thread_segment_start = (local_id * segments_per_thread_fast + remainder_offset ) % num_segments;
    int thread_segment_end = thread_segment_start +  segments_per_thread_fast + remainder_addition;

    float nominator = 0.0f;
    for (int i = thread_segment_start; i< thread_segment_end; i++) {
        
        int4 seg = segments[i];
        float seg_val = segment_values[i];
        int seg_area = seg.z* seg.w;
        ulong region_sum = sum_region(integral, image_x + seg.x, image_y + seg.y, seg.z, seg.w, image_width);
        

        nominator += ((float)(region_sum) - mean_img * seg_area) * (seg_val - template_mean);

    }
    
    thread_segment_sum_buff[local_id] = nominator;

    barrier(CLK_LOCAL_MEM_FENCE);


    
    if (local_id < pixels_per_workgroup) {
        float nominator_sum = 0.0f;
        int sum_start = local_id * num_segments;
        int sum_end = sum_start + (num_segments / segments_per_thread_fast ) - (remainder_segments_fast/segments_per_thread_fast);
        for (int i = sum_start; i< sum_end; i++) {
            nominator_sum = nominator_sum + thread_segment_sum_buff[i] ;
        }

        int pixel_pos_final = (workgroup_id * pixels_per_workgroup) + (local_id) ;
        int image_x = pixel_pos_final % result_w;
        int image_y = pixel_pos_final / result_w;

        float precision = precision_buff[0];
        ulong patch_sq_sum_extracted = sum_sq_template_region_buff[local_id];
        float var_img = (float)patch_sq_sum_extracted - ((float)patch_sum * (float)patch_sum)/ (float)area;
        float denominator = sqrt(var_img * (float)template_sq_dev);
        float corr = (denominator != 0.0f) ? (nominator_sum / denominator) : -1.0f;        
        
        if (corr >= (min_expected_corr - 0.01) * precision && corr < 2) {
        
            int index = atomic_add(valid_corr_count, 1);
            results[index] = (int2)(image_x, image_y);
            
        }
    } 
}



__kernel void v2_segmented_match_integral_slow_pass (
    __global const ulong* integral,
    __global const ulong* integral_sq,
    __global const int4* segments,
    __global const float* segment_values,
    const int num_segments,
    const float template_mean,
    const float template_sq_dev,
    __global int2* position_results,
    __global float* corr_results,
    const int image_width,
    const int image_height,
    const int template_width,
    const int template_height,
    const float min_expected_corr,
    const int remainder_segments_slow,
    const int segments_per_thread_slow,
    const int workgroup_size,
    __local ulong* sum_template_region_buff,
    __local ulong* sum_sq_template_region_buff,
    __local float* thread_segment_sum_buff,
    __global int* valid_corr_count_slow,
    __global int* valid_corr_count_fast,
    __global int2* fast_pass_results,
    __global float* precision_buff
) {
    
    int global_id = get_global_id(0);
    int local_id = get_local_id(0);
    int workgroup_id = get_group_id(0);

    
    
    

    int image_x = fast_pass_results[workgroup_id].x;
    int image_y = fast_pass_results[workgroup_id].y;

    int result_w = image_width - template_width;
    // num_segments is also count of threads per pixel for fast img
    if (local_id * segments_per_thread_slow +  remainder_segments_slow >= num_segments) return ; // this solves more segments per thread

    
    // first sum the region of template area for numerator calculations
    // we do it with first threads for each x,y position which workgroup processes
    // if there are 5 pixels processed, local_id 0-4 should process sum regions for each position, 5-9 for squared
    ulong patch_sum = 0;
    if (local_id == 0) {
        patch_sum = sum_region(integral, image_x, image_y, template_width, template_height, image_width);
        sum_template_region_buff[0] = patch_sum;
        
    }
    
    // there will never be less than 2 segments 
    // meaning pixels per workgroup is never greater than workgroup_size / 2 
    if (local_id == 1) {
        ulong patch_sq_sum = sum_region_squared(integral_sq, image_x, image_y, template_width, template_height, image_width);
        sum_sq_template_region_buff[0] = patch_sq_sum;
    }
    int result_width = image_width - template_width + 1;
    int result_height = image_height - template_height + 1;
    float area = (float)(template_width * template_height);
    // wait  for threads to complete writing sum_area
    barrier(CLK_LOCAL_MEM_FENCE);
    float mean_img = (float)(sum_template_region_buff[0]) / area;
    // this is to cover if we have more than 1 segment per thread. This method 
    

    // with remainder allows us to keep all threads working
    int remainder_offset = 0;
    int remainder_addition = 0;
    if (remainder_segments_slow > 0) {
        if (local_id >= remainder_segments_slow) {
            remainder_offset = remainder_segments_slow;
        } else {
            remainder_offset = local_id;
            remainder_addition = 1; 
        }
    
    }

    int thread_segment_start = (local_id * segments_per_thread_slow + remainder_offset ) % num_segments;
    int thread_segment_end = thread_segment_start +  segments_per_thread_slow + remainder_addition;


    float nominator = 0.0f;
    for (int i = thread_segment_start; i< thread_segment_end; i++) {
        
        int4 seg = segments[i];
        float seg_val = segment_values[i];
        int seg_area = seg.z* seg.w;
        ulong region_sum = sum_region(integral, image_x + seg.x, image_y + seg.y, seg.z, seg.w, image_width);
        

        nominator += ((float)(region_sum) - mean_img * seg_area) * (seg_val - template_mean);

    }
    
    thread_segment_sum_buff[local_id] = nominator;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (local_id == 0) {
        float nominator_sum = 0.0f;
        int sum_start = 0;
        int sum_end = sum_start + (num_segments / segments_per_thread_slow ) - (remainder_segments_slow/segments_per_thread_slow);
        for (int i = sum_start; i< sum_end; i++) {
            nominator_sum = nominator_sum + thread_segment_sum_buff[i] ;
        }

        


        ulong patch_sq_sum_extracted = sum_sq_template_region_buff[0];
        float var_img = (float)patch_sq_sum_extracted - ((float)patch_sum * (float)patch_sum)/ (float)area;
        float denominator = sqrt(var_img * (float)template_sq_dev);
        float corr = (denominator != 0.0f) ? (nominator_sum / denominator) : -1.0f;        
        float precision = precision_buff[0];
        
        if (corr >= (min_expected_corr - 0.001) * precision  && corr < 2) {
            int index = atomic_add(valid_corr_count_slow, 1);
            position_results[index] = (int2)(image_x, image_y);
            corr_results[index] = corr;
        }
    } 
}


"#;