wagahai_lut 0.1.0

/*
 * SPDX-FileCopyrightText: © 2026 Jinwoo Park (pmnxis@gmail.com)
 *
 * SPDX-License-Identifier: MIT
 */

//! 3D LUT image processing

use super::common::{rgb_to_u8, Rgb};
use super::Lut3D;
use crate::error::Result;
use image::{DynamicImage, GenericImageView, ImageBuffer, Pixel, Rgb as ImageRgb, Rgba};
use wide::f32x4;

/// Apply a 3D LUT to an image and return the graded copy.
///
/// 8-bit RGB input (`DynamicImage::ImageRgb8`) is processed straight from its
/// raw byte buffer and is returned as `ImageRgb8`. Every other variant is read
/// through the `GenericImageView` accessor of `DynamicImage`, which always
/// yields `Rgba<u8>` pixels, and is returned as `ImageRgba8` with the alpha
/// value copied through unchanged.
///
/// The storage format is taken from the image variant itself rather than from
/// a probe of pixel `(0, 0)`: that probe always reported four channels (so the
/// RGB path could never run) and panicked on zero-sized images.
///
/// # Errors
///
/// This function has no failing path of its own; the `Result` return type is
/// kept so existing callers keep compiling.
pub fn apply_to_image_3d(
    cube_lut: &crate::lut::CubeLut,
    image: &DynamicImage,
    lut: &Lut3D,
) -> Result<DynamicImage> {
    let (width, height) = image.dimensions();

    let domain_min = cube_lut.domain_min;
    let domain_max = cube_lut.domain_max;

    // Per-channel domain widths, computed once and shared by both paths.
    let domain_range_r = domain_max[0] - domain_min[0];
    let domain_range_g = domain_max[1] - domain_min[1];
    let domain_range_b = domain_max[2] - domain_min[2];

    let size = lut.size();
    let size_minus_1 = (size - 1) as f32;
    let size_sq = size * size;

    // The pointers are only handed to the sampling helpers below, while `lut`
    // is still borrowed by this function.
    // NOTE(review): the exact contract of `Lut3D::channel_pointers` is not
    // visible here — confirm each pointer covers `size^3` f32 values.
    let (r_ptr, g_ptr, b_ptr) = unsafe { lut.channel_pointers() };

    if let Some(rgb) = image.as_rgb8() {
        // Fast path: tightly packed 8-bit RGB, walked three bytes at a time.
        let mut output_img = image::RgbImage::new(width, height);

        let inv_255: f32 = 1.0 / 255.0;
        let inv_domain_range_r = 1.0 / domain_range_r;
        let inv_domain_range_g = 1.0 / domain_range_g;
        let inv_domain_range_b = 1.0 / domain_range_b;

        let src: &[u8] = rgb.as_raw();
        let dst: &mut [u8] = output_img.as_mut();

        // Iterating over chunks avoids computing `width * height` in u32,
        // which can overflow for very large images.
        for (src_px, dst_px) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
            let r = f32::from(src_px[0]) * inv_255;
            let g = f32::from(src_px[1]) * inv_255;
            let b = f32::from(src_px[2]) * inv_255;

            let normalized_r = (r - domain_min[0]) * inv_domain_range_r;
            let normalized_g = (g - domain_min[1]) * inv_domain_range_g;
            let normalized_b = (b - domain_min[2]) * inv_domain_range_b;

            let output_rgb = apply_3d_lut_soa_inline(
                r_ptr,
                g_ptr,
                b_ptr,
                normalized_r,
                normalized_g,
                normalized_b,
                size_minus_1,
                size,
                size_sq,
            );

            // Same float -> u8 conversion as the RGBA path so both paths of
            // this function quantize identically.
            let output_u8 = rgb_to_u8(output_rgb);
            dst_px[0] = output_u8[0];
            dst_px[1] = output_u8[1];
            dst_px[2] = output_u8[2];
        }

        return Ok(DynamicImage::ImageRgb8(output_img));
    }

    // Generic path: every other variant is viewed as RGBA8 by `get_pixel`.
    let mut output_img = image::RgbaImage::new(width, height);

    for y in 0..height {
        for x in 0..width {
            let pixel = image.get_pixel(x, y);
            let channels = pixel.channels();

            let input = [
                channels[0] as f32 / 255.0,
                channels[1] as f32 / 255.0,
                channels[2] as f32 / 255.0,
            ];

            let normalized_r = (input[0] - domain_min[0]) / domain_range_r;
            let normalized_g = (input[1] - domain_min[1]) / domain_range_g;
            let normalized_b = (input[2] - domain_min[2]) / domain_range_b;

            let output_rgb = apply_3d_lut_soa(
                r_ptr,
                g_ptr,
                b_ptr,
                normalized_r,
                normalized_g,
                normalized_b,
                size,
                size_minus_1,
                size_sq,
            );

            let output_u8 = rgb_to_u8(output_rgb);
            let output_pixel =
                image::Rgba([output_u8[0], output_u8[1], output_u8[2], channels[3]]);

            output_img.put_pixel(x, y, output_pixel);
        }
    }

    Ok(DynamicImage::ImageRgba8(output_img))
}

/// Apply 3D LUT with maximum cache efficiency
#[inline]
#[allow(clippy::too_many_arguments)]
#[allow(clippy::not_unsafe_ptr_arg_deref)]
pub(crate) fn apply_3d_lut_soa_inline(
    r_ptr: *const f32,
    g_ptr: *const f32,
    b_ptr: *const f32,
    r: f32,
    g: f32,
    b: f32,
    size_minus_1: f32,
    size: usize,
    size_sq: usize,
) -> Rgb {
    let r_scaled = r * size_minus_1;
    let g_scaled = g * size_minus_1;
    let b_scaled = b * size_minus_1;

    let r0 = r_scaled as usize;
    let g0 = g_scaled as usize;
    let b0 = b_scaled as usize;
    let r1 = (r0 + 1).min(size - 1);
    let g1 = (g0 + 1).min(size - 1);
    let b1 = (b0 + 1).min(size - 1);

    let r_frac = r_scaled - r0 as f32;
    let g_frac = g_scaled - g0 as f32;
    let b_frac = b_scaled - b0 as f32;

    unsafe {
        let c000_r = *r_ptr.add(r0 + size * g0 + size_sq * b0);
        let c100_r = *r_ptr.add(r1 + size * g0 + size_sq * b0);
        let c010_r = *r_ptr.add(r0 + size * g1 + size_sq * b0);
        let c110_r = *r_ptr.add(r1 + size * g1 + size_sq * b0);
        let c001_r = *r_ptr.add(r0 + size * g0 + size_sq * b1);
        let c101_r = *r_ptr.add(r1 + size * g0 + size_sq * b1);
        let c011_r = *r_ptr.add(r0 + size * g1 + size_sq * b1);
        let c111_r = *r_ptr.add(r1 + size * g1 + size_sq * b1);

        let c000_g = *g_ptr.add(r0 + size * g0 + size_sq * b0);
        let c100_g = *g_ptr.add(r1 + size * g0 + size_sq * b0);
        let c010_g = *g_ptr.add(r0 + size * g1 + size_sq * b0);
        let c110_g = *g_ptr.add(r1 + size * g1 + size_sq * b0);
        let c001_g = *g_ptr.add(r0 + size * g0 + size_sq * b1);
        let c101_g = *g_ptr.add(r1 + size * g0 + size_sq * b1);
        let c011_g = *g_ptr.add(r0 + size * g1 + size_sq * b1);
        let c111_g = *g_ptr.add(r1 + size * g1 + size_sq * b1);

        let c000_b = *b_ptr.add(r0 + size * g0 + size_sq * b0);
        let c100_b = *b_ptr.add(r1 + size * g0 + size_sq * b0);
        let c010_b = *b_ptr.add(r0 + size * g1 + size_sq * b0);
        let c110_b = *b_ptr.add(r1 + size * g1 + size_sq * b0);
        let c001_b = *b_ptr.add(r0 + size * g0 + size_sq * b1);
        let c101_b = *b_ptr.add(r1 + size * g0 + size_sq * b1);
        let c011_b = *b_ptr.add(r0 + size * g1 + size_sq * b1);
        let c111_b = *b_ptr.add(r1 + size * g1 + size_sq * b1);

        let c000 = f32x4::from([c000_r, c000_g, c000_b, 0.0]);
        let c100 = f32x4::from([c100_r, c100_g, c100_b, 0.0]);
        let c010 = f32x4::from([c010_r, c010_g, c010_b, 0.0]);
        let c110 = f32x4::from([c110_r, c110_g, c110_b, 0.0]);
        let c001 = f32x4::from([c001_r, c001_g, c001_b, 0.0]);
        let c101 = f32x4::from([c101_r, c101_g, c101_b, 0.0]);
        let c011 = f32x4::from([c011_r, c011_g, c011_b, 0.0]);
        let c111 = f32x4::from([c111_r, c111_g, c111_b, 0.0]);

        let result = trilinear_interpolate_simd(
            c000, c100, c010, c110, c001, c101, c011, c111, r_frac, g_frac, b_frac,
        );

        let result_array = result.to_array();
        [result_array[0], result_array[1], result_array[2]]
    }
}

/// Apply the 3D LUT to one normalized RGB sample.
///
/// Thin forwarding wrapper around [`apply_3d_lut_soa_inline`]; it performs no
/// work of its own. Note that the trailing parameters are ordered
/// `size, size_minus_1, size_sq` here, whereas the wrapped function takes
/// `size_minus_1, size, size_sq` — the call below reorders them accordingly.
///
/// Despite the historical "non-inline" label, this function also carries
/// `#[inline]`.
#[inline]
#[allow(clippy::too_many_arguments)]
pub(crate) fn apply_3d_lut_soa(
    r_ptr: *const f32,
    g_ptr: *const f32,
    b_ptr: *const f32,
    r: f32,
    g: f32,
    b: f32,
    size: usize,
    size_minus_1: f32,
    size_sq: usize,
) -> Rgb {
    // Forward with `size_minus_1` moved ahead of `size`.
    apply_3d_lut_soa_inline(r_ptr, g_ptr, b_ptr, r, g, b, size_minus_1, size, size_sq)
}

/// Grade an 8-bit RGB `ImageBuffer` through the 3D LUT, returning a new
/// buffer of the same dimensions.
///
/// Each source pixel is scaled to `0.0..=1.0`, normalized against the cube's
/// `domain_min` / `domain_max`, sampled via [`apply_3d_lut_soa`] and converted
/// back with `rgb_to_u8`. The input image is left untouched.
pub fn apply_to_image_buffer_rgb_unchecked(
    cube_lut: &crate::lut::CubeLut,
    image: &ImageBuffer<ImageRgb<u8>, Vec<u8>>,
    lut: &Lut3D,
) -> ImageBuffer<ImageRgb<u8>, Vec<u8>> {
    let (width, height) = image.dimensions();
    let mut graded: ImageBuffer<ImageRgb<u8>, Vec<u8>> = ImageBuffer::new(width, height);

    let lo = cube_lut.domain_min;
    let hi = cube_lut.domain_max;

    let edge = lut.size();
    let edge_minus_1 = (edge - 1) as f32;
    let edge_sq = edge * edge;
    let (r_ptr, g_ptr, b_ptr) = unsafe { lut.channel_pointers() };

    // Source and destination share dimensions, so their pixel iterators
    // visit matching coordinates in lockstep.
    for (dst, src) in graded.pixels_mut().zip(image.pixels()) {
        // Byte -> unit range -> position inside the LUT domain.
        let to_domain = |channel: usize| -> f32 {
            let unit = src[channel] as f32 / 255.0;
            (unit - lo[channel]) / (hi[channel] - lo[channel])
        };

        let sampled = apply_3d_lut_soa(
            r_ptr,
            g_ptr,
            b_ptr,
            to_domain(0),
            to_domain(1),
            to_domain(2),
            edge,
            edge_minus_1,
            edge_sq,
        );
        let bytes = rgb_to_u8(sampled);

        *dst = ImageRgb([bytes[0], bytes[1], bytes[2]]);
    }

    graded
}

/// Grade an 8-bit RGBA `ImageBuffer` through the 3D LUT, returning a new
/// buffer of the same dimensions.
///
/// Only the colour channels go through the LUT: each is scaled to
/// `0.0..=1.0`, normalized against the cube's `domain_min` / `domain_max`,
/// sampled via [`apply_3d_lut_soa`] and converted back with `rgb_to_u8`.
/// The alpha byte of every pixel is copied to the output as-is.
pub fn apply_to_image_buffer_rgba_unchecked(
    cube_lut: &crate::lut::CubeLut,
    image: &ImageBuffer<Rgba<u8>, Vec<u8>>,
    lut: &Lut3D,
) -> ImageBuffer<Rgba<u8>, Vec<u8>> {
    let (width, height) = image.dimensions();
    let mut graded: ImageBuffer<Rgba<u8>, Vec<u8>> = ImageBuffer::new(width, height);

    let lo = cube_lut.domain_min;
    let hi = cube_lut.domain_max;

    let edge = lut.size();
    let edge_minus_1 = (edge - 1) as f32;
    let edge_sq = edge * edge;
    let (r_ptr, g_ptr, b_ptr) = unsafe { lut.channel_pointers() };

    // Source and destination share dimensions, so their pixel iterators
    // visit matching coordinates in lockstep.
    for (dst, src) in graded.pixels_mut().zip(image.pixels()) {
        // Byte -> unit range -> position inside the LUT domain.
        let to_domain = |channel: usize| -> f32 {
            let unit = src[channel] as f32 / 255.0;
            (unit - lo[channel]) / (hi[channel] - lo[channel])
        };

        let sampled = apply_3d_lut_soa(
            r_ptr,
            g_ptr,
            b_ptr,
            to_domain(0),
            to_domain(1),
            to_domain(2),
            edge,
            edge_minus_1,
            edge_sq,
        );
        let bytes = rgb_to_u8(sampled);

        // Fourth component is the untouched source alpha.
        *dst = Rgba([bytes[0], bytes[1], bytes[2], src[3]]);
    }

    graded
}

/// Apply 3D LUT to ImageBuffer<Rgb<u8>, Vec<u8>> in-place.
///
/// No output buffer is created: every pixel is read from and written back to
/// the image's own byte storage.
/// - Pixels are visited in linear memory order, three bytes per step.
/// - Raw byte slice access is used instead of `get_pixel` / `put_pixel`.
/// - The inverse domain ranges and LUT dimensions are computed once, outside
///   the loop.
///
/// The loop walks the buffer in 3-byte chunks rather than counting
/// `width * height` pixels: that product was computed in `u32` and could
/// overflow for very large images, leaving part of the image unprocessed in
/// release builds.
#[inline]
pub fn apply_to_image_buffer_rgb_mut(
    cube_lut: &crate::lut::CubeLut,
    image: &mut ImageBuffer<ImageRgb<u8>, Vec<u8>>,
    lut: &Lut3D,
) {
    let domain_min = cube_lut.domain_min;
    let domain_max = cube_lut.domain_max;

    // Pre-calculate inverse domain ranges
    let inv_domain_range_r = 1.0 / (domain_max[0] - domain_min[0]);
    let inv_domain_range_g = 1.0 / (domain_max[1] - domain_min[1]);
    let inv_domain_range_b = 1.0 / (domain_max[2] - domain_min[2]);

    let size = lut.size();
    let size_minus_1 = (size - 1) as f32;
    let size_sq = size * size;
    // The pointers are only used by the sampling call below, while `lut` is
    // still borrowed by this function.
    let (r_ptr, g_ptr, b_ptr) = unsafe { lut.channel_pointers() };

    let inv_255: f32 = 1.0 / 255.0;

    // Get mutable access to raw pixel data
    let pixels: &mut [u8] = image.as_mut();

    for px in pixels.chunks_exact_mut(3) {
        // Read current pixel values
        let r = f32::from(px[0]) * inv_255;
        let g = f32::from(px[1]) * inv_255;
        let b = f32::from(px[2]) * inv_255;

        // Normalize
        let normalized_r = (r - domain_min[0]) * inv_domain_range_r;
        let normalized_g = (g - domain_min[1]) * inv_domain_range_g;
        let normalized_b = (b - domain_min[2]) * inv_domain_range_b;

        // Apply LUT
        let output_rgb = apply_3d_lut_soa_inline(
            r_ptr,
            g_ptr,
            b_ptr,
            normalized_r,
            normalized_g,
            normalized_b,
            size_minus_1,
            size,
            size_sq,
        );

        // Write back to the same three bytes. The `as u8` cast saturates to
        // 0..=255 and truncates the fraction.
        // NOTE(review): the allocating variants convert through `rgb_to_u8`
        // instead — confirm both paths are meant to quantize the same way.
        px[0] = (output_rgb[0] * 255.0) as u8;
        px[1] = (output_rgb[1] * 255.0) as u8;
        px[2] = (output_rgb[2] * 255.0) as u8;
    }
}

/// Apply 3D LUT to ImageBuffer<Rgba<u8>, Vec<u8>> in-place.
///
/// No output buffer is created: the colour bytes of every pixel are read from
/// and written back to the image's own byte storage, and the alpha byte is
/// never written.
///
/// The loop walks the buffer in 4-byte chunks rather than counting
/// `width * height` pixels: that product was computed in `u32` and could
/// overflow for very large images, leaving part of the image unprocessed in
/// release builds.
#[inline]
pub fn apply_to_image_buffer_rgba_mut(
    cube_lut: &crate::lut::CubeLut,
    image: &mut ImageBuffer<Rgba<u8>, Vec<u8>>,
    lut: &Lut3D,
) {
    let domain_min = cube_lut.domain_min;
    let domain_max = cube_lut.domain_max;

    // Pre-calculate inverse domain ranges
    let inv_domain_range_r = 1.0 / (domain_max[0] - domain_min[0]);
    let inv_domain_range_g = 1.0 / (domain_max[1] - domain_min[1]);
    let inv_domain_range_b = 1.0 / (domain_max[2] - domain_min[2]);

    let size = lut.size();
    let size_minus_1 = (size - 1) as f32;
    let size_sq = size * size;
    // The pointers are only used by the sampling call below, while `lut` is
    // still borrowed by this function.
    let (r_ptr, g_ptr, b_ptr) = unsafe { lut.channel_pointers() };

    let inv_255: f32 = 1.0 / 255.0;

    // Get mutable access to raw pixel data
    let pixels: &mut [u8] = image.as_mut();

    for px in pixels.chunks_exact_mut(4) {
        // Read colour bytes; px[3] (alpha) is neither read nor written.
        let r = f32::from(px[0]) * inv_255;
        let g = f32::from(px[1]) * inv_255;
        let b = f32::from(px[2]) * inv_255;

        // Normalize
        let normalized_r = (r - domain_min[0]) * inv_domain_range_r;
        let normalized_g = (g - domain_min[1]) * inv_domain_range_g;
        let normalized_b = (b - domain_min[2]) * inv_domain_range_b;

        // Apply LUT
        let output_rgb = apply_3d_lut_soa_inline(
            r_ptr,
            g_ptr,
            b_ptr,
            normalized_r,
            normalized_g,
            normalized_b,
            size_minus_1,
            size,
            size_sq,
        );

        // Write back to the same colour bytes. The `as u8` cast saturates to
        // 0..=255 and truncates the fraction.
        // NOTE(review): the allocating variants convert through `rgb_to_u8`
        // instead — confirm both paths are meant to quantize the same way.
        px[0] = (output_rgb[0] * 255.0) as u8;
        px[1] = (output_rgb[1] * 255.0) as u8;
        px[2] = (output_rgb[2] * 255.0) as u8;
    }
}

/// Trilinear blend of the eight corner samples of one LUT cell.
///
/// Corners are named `c<x><y><z>`, where each digit says whether the corner
/// sits at the low (`0`) or high (`1`) end of that axis. `x`, `y` and `z` are
/// the blend weights along the respective axes. All lanes of the vectors are
/// blended with the same weights.
#[inline]
#[allow(clippy::too_many_arguments)]
fn trilinear_interpolate_simd(
    c000: f32x4,
    c100: f32x4,
    c010: f32x4,
    c110: f32x4,
    c001: f32x4,
    c101: f32x4,
    c011: f32x4,
    c111: f32x4,
    x: f32,
    y: f32,
    z: f32,
) -> f32x4 {
    // One-dimensional blend: `low` weighted by (1 - t), `high` weighted by t.
    let mix = |low: f32x4, high: f32x4, t: f32| low * (1.0 - t) + high * t;

    // Collapse the x axis on each of the four edges, then the y axis on each
    // z face, and finally blend the two faces along z.
    let near_face = mix(mix(c000, c100, x), mix(c010, c110, x), y);
    let far_face = mix(mix(c001, c101, x), mix(c011, c111, x), y);

    mix(near_face, far_face, z)
}