wagahai_lut 0.1.0

/*
 * SPDX-FileCopyrightText: © 2026 Jinwoo Park (pmnxis@gmail.com)
 *
 * SPDX-License-Identifier: MIT
 */

//! 1D LUT image processing logic

use super::common::Rgb;
use super::lut_1d::Lut1D;
use crate::error::Result;
use image::{
    DynamicImage, GenericImageView, ImageBuffer, Luma, LumaA, Pixel, Rgb as ImageRgb, Rgba,
};

/// Apply 1D LUT to an image (optimized version without per-pixel branching)
/// Channel count is checked once, then different optimized loops are used
pub fn apply_to_image_1d(
    cube_lut: &crate::lut::CubeLut,
    image: &DynamicImage,
    lut: &Lut1D,
) -> Result<DynamicImage> {
    let (width, height) = image.dimensions();

    // Check channel count once at the beginning
    let pixel = image.get_pixel(0, 0);
    let num_channels = pixel.channels().len();

    let domain_min = cube_lut.domain_min;
    let domain_max = cube_lut.domain_max;

    // Use different optimized loops based on channel count
    match num_channels {
        1 => {
            // Grayscale (1 channel) - create new grayscale output
            let mut output_img = image::GrayImage::new(width, height);
            for y in 0..height {
                for x in 0..width {
                    let pixel = image.get_pixel(x, y);
                    let channels = pixel.channels();

                    let input = channels[0] as f32 / 255.0;
                    let normalized = (input - domain_min[0]) / (domain_max[0] - domain_min[0]);
                    let output_val = interpolate_1d_simd(lut, normalized, 0);

                    output_img.put_pixel(
                        x,
                        y,
                        image::Luma([(output_val.clamp(0.0, 1.0) * 255.0) as u8]),
                    );
                }
            }
            Ok(DynamicImage::ImageLuma8(output_img))
        }
        2 => {
            // 2-channel image - create new grayscale+alpha output
            let mut output_img = image::GrayAlphaImage::new(width, height);
            for y in 0..height {
                for x in 0..width {
                    let pixel = image.get_pixel(x, y);
                    let channels = pixel.channels();

                    let input0 = channels[0] as f32 / 255.0;
                    let input1 = channels[1] as f32 / 255.0;

                    let normalized0 = (input0 - domain_min[0]) / (domain_max[0] - domain_min[0]);
                    let normalized1 = (input1 - domain_min[1]) / (domain_max[1] - domain_min[1]);

                    let output0 = interpolate_1d_simd(lut, normalized0, 0);
                    let output1 = interpolate_1d_simd(lut, normalized1, 1);

                    output_img.put_pixel(
                        x,
                        y,
                        image::LumaA([
                            (output0.clamp(0.0, 1.0) * 255.0) as u8,
                            (output1.clamp(0.0, 1.0) * 255.0) as u8,
                        ]),
                    );
                }
            }
            Ok(DynamicImage::ImageLumaA8(output_img))
        }
        3 => {
            // RGB (3 channels) - most common case
            let mut output_img = image::RgbaImage::new(width, height);
            for y in 0..height {
                for x in 0..width {
                    let pixel = image.get_pixel(x, y);
                    let channels = pixel.channels();

                    let input = [
                        channels[0] as f32 / 255.0,
                        channels[1] as f32 / 255.0,
                        channels[2] as f32 / 255.0,
                    ];

                    // Normalize inline to avoid function call overhead
                    let normalized_r = (input[0] - domain_min[0]) / (domain_max[0] - domain_min[0]);
                    let normalized_g = (input[1] - domain_min[1]) / (domain_max[1] - domain_min[1]);
                    let normalized_b = (input[2] - domain_min[2]) / (domain_max[2] - domain_min[2]);

                    // Apply LUT directly using known 1D LUT
                    let output_rgb =
                        apply_1d_lut_simd(lut, normalized_r, normalized_g, normalized_b);

                    let output_pixel = image::Rgba([
                        (output_rgb[0].clamp(0.0, 1.0) * 255.0) as u8,
                        (output_rgb[1].clamp(0.0, 1.0) * 255.0) as u8,
                        (output_rgb[2].clamp(0.0, 1.0) * 255.0) as u8,
                        if channels.len() >= 4 {
                            channels[3]
                        } else {
                            255
                        },
                    ]);

                    output_img.put_pixel(x, y, output_pixel);
                }
            }
            Ok(DynamicImage::ImageRgba8(output_img))
        }
        4 => {
            // RGBA (4 channels) - apply LUT to all channels including alpha
            let mut output_img = image::RgbaImage::new(width, height);
            for y in 0..height {
                for x in 0..width {
                    let pixel = image.get_pixel(x, y);
                    let channels = pixel.channels();

                    let input = [
                        channels[0] as f32 / 255.0,
                        channels[1] as f32 / 255.0,
                        channels[2] as f32 / 255.0,
                        channels[3] as f32 / 255.0,
                    ];

                    // Normalize all 4 channels
                    let normalized0 = (input[0] - domain_min[0]) / (domain_max[0] - domain_min[0]);
                    let normalized1 = (input[1] - domain_min[1]) / (domain_max[1] - domain_min[1]);
                    let normalized2 = (input[2] - domain_min[2]) / (domain_max[2] - domain_min[2]);
                    let normalized3 = (input[3] - domain_min[0]) / (domain_max[0] - domain_min[0]);

                    // Apply LUT to all channels (reuse R channel for Alpha if needed)
                    let output0 = interpolate_1d_simd(lut, normalized0, 0);
                    let output1 = interpolate_1d_simd(lut, normalized1, 1);
                    let output2 = interpolate_1d_simd(lut, normalized2, 2);
                    let output3 = interpolate_1d_simd(lut, normalized3, 0);

                    let output_pixel = image::Rgba([
                        (output0.clamp(0.0, 1.0) * 255.0) as u8,
                        (output1.clamp(0.0, 1.0) * 255.0) as u8,
                        (output2.clamp(0.0, 1.0) * 255.0) as u8,
                        (output3.clamp(0.0, 1.0) * 255.0) as u8,
                    ]);

                    output_img.put_pixel(x, y, output_pixel);
                }
            }
            Ok(DynamicImage::ImageRgba8(output_img))
        }
        _ => {
            // Fallback for other channel counts - preserve all channels
            let mut output_img = image::RgbaImage::new(width, height);
            for y in 0..height {
                for x in 0..width {
                    let pixel = image.get_pixel(x, y);
                    let channels = pixel.channels();

                    let mut output_channels = channels.to_vec();
                    for c in 0..num_channels.min(3) {
                        let input = channels[c] as f32 / 255.0;
                        let channel_idx = c % 3; // Cycle through R, G, B for channels > 2
                        let normalized = (input - domain_min[channel_idx])
                            / (domain_max[channel_idx] - domain_min[channel_idx]);
                        let output_val = interpolate_1d_simd(lut, normalized, channel_idx);
                        output_channels[c] = (output_val.clamp(0.0, 1.0) * 255.0) as u8;
                    }

                    // Preserve alpha if present and beyond 3 channels
                    if num_channels >= 4 {
                        output_channels[3] = channels[3];
                    }

                    // Reconstruct pixel with correct format
                    match num_channels {
                        1 => output_img.put_pixel(
                            x,
                            y,
                            image::Rgba([
                                output_channels[0],
                                output_channels[0],
                                output_channels[0],
                                255,
                            ]),
                        ),
                        2 => output_img.put_pixel(
                            x,
                            y,
                            image::Rgba([output_channels[0], output_channels[1], 0, 255]),
                        ),
                        3 => output_img.put_pixel(
                            x,
                            y,
                            image::Rgba([
                                output_channels[0],
                                output_channels[1],
                                output_channels[2],
                                255,
                            ]),
                        ),
                        _ => output_img.put_pixel(
                            x,
                            y,
                            image::Rgba([
                                output_channels[0],
                                output_channels[1],
                                output_channels[2],
                                output_channels[3],
                            ]),
                        ),
                    }
                }
            }
            Ok(DynamicImage::ImageRgba8(output_img))
        }
    }
}

/// Map an RGB triple through a 1D LUT, one channel at a time.
///
/// `r`, `g` and `b` are looked up in LUT channels 0, 1 and 2 respectively via
/// [`interpolate_1d_simd`], and the three results are returned in the same order.
pub fn apply_1d_lut_simd(lut: &Lut1D, r: f32, g: f32, b: f32) -> Rgb {
    [
        interpolate_1d_simd(lut, r, 0),
        interpolate_1d_simd(lut, g, 1),
        interpolate_1d_simd(lut, b, 2),
    ]
}

/// Linearly interpolate one channel of a 1D LUT.
///
/// `value` is clamped to `[0, 1]`, scaled onto the LUT's index range `0..=size - 1`, and the
/// two neighbouring entries of `channel` are blended by the fractional part of that position.
/// The entries are read with unchecked access, so the index and channel invariants are
/// enforced up front.
///
/// # Panics
///
/// Panics if the LUT reports a size of zero or if `channel` is not 0, 1 or 2. Both conditions
/// would otherwise lead to an out-of-bounds unchecked read.
#[inline]
pub fn interpolate_1d_simd(lut: &Lut1D, value: f32, channel: usize) -> f32 {
    let size = lut.size();
    // This is a safe public function, so the preconditions of the unchecked reads below must
    // hold for every caller. With a literal `channel` the second check folds away once inlined.
    assert!(size > 0, "1D LUT must contain at least one entry");
    assert!(channel < 3, "1D LUT channel index must be 0, 1 or 2");
    let last = size - 1;

    // Branchless clamping using min/max
    let value = value.clamp(0.0, 1.0);

    // Calculate index and fractional part. `idx0` is clamped as well because `last as f32`
    // may round upwards for very large tables, which would push the floor past the end.
    let float_idx = value * last as f32;
    let idx0 = (float_idx.floor() as usize).min(last);
    let idx1 = (idx0 + 1).min(last);
    let fraction = float_idx - idx0 as f32;

    // Get values from LUT using unchecked access
    // SAFETY:
    // - `size > 0` was asserted, so `last` is a valid index
    // - `idx0` and `idx1` are both clamped to `last`
    // - `channel < 3` was asserted above
    let val0 = unsafe { lut.get_unchecked(idx0, channel) };
    let val1 = unsafe { lut.get_unchecked(idx1, channel) };

    // FMA-friendly form: val0 + (val1 - val0) * fraction
    val0 + (val1 - val0) * fraction
}

/// Test-only reference: maps an RGB triple through the LUT with the checked scalar
/// interpolation, using LUT channels 0, 1 and 2 for `r`, `g` and `b`.
#[cfg(test)]
pub fn __normal_test_apply_1d_lut(lut: &Lut1D, r: f32, g: f32, b: f32) -> Rgb {
    [
        __normal_test_interpolate_1d(lut, r, 0),
        __normal_test_interpolate_1d(lut, g, 1),
        __normal_test_interpolate_1d(lut, b, 2),
    ]
}

/// Test-only reference: scalar linear interpolation of one LUT channel using checked access.
///
/// `value` is clamped to `[0, 1]` and scaled onto the index range `0..=size - 1`; the two
/// neighbouring entries are blended by the fractional position. An entry that `lut.get` cannot
/// provide is treated as `0.0`.
#[cfg(test)]
#[inline]
pub fn __normal_test_interpolate_1d(lut: &Lut1D, value: f32, channel: usize) -> f32 {
    let size = lut.size();

    // Position of the clamped input on the LUT's index axis.
    let position = value.clamp(0.0, 1.0) * (size - 1) as f32;
    let lower = position.floor() as usize;
    let upper = usize::min(lower + 1, size - 1);
    let weight = position - lower as f32;

    // Checked lookup; missing entries fall back to zero.
    let entry = |idx: usize| lut.get(idx, channel).unwrap_or(0.0);
    let below = entry(lower);
    let above = entry(upper);

    // Weighted blend of the two neighbours.
    below * (1.0 - weight) + above * weight
}

/// Apply a 1D LUT to an 8-bit grayscale buffer and return the result as a new buffer.
///
/// Every luma sample is scaled to `[0, 1]`, normalised with channel 0 of the cube LUT's
/// domain, mapped through LUT channel 0, then clamped and quantised back to 8 bits.
pub fn apply_to_image_buffer_luma(
    cube_lut: &crate::lut::CubeLut,
    image: &ImageBuffer<Luma<u8>, Vec<u8>>,
    lut: &Lut1D,
) -> ImageBuffer<Luma<u8>, Vec<u8>> {
    let lo = cube_lut.domain_min;
    let hi = cube_lut.domain_max;
    let (width, height) = image.dimensions();

    // Build the output pixel by pixel from the matching source pixel.
    ImageBuffer::from_fn(width, height, |x, y| {
        let level = image.get_pixel(x, y)[0] as f32 / 255.0;
        let position = (level - lo[0]) / (hi[0] - lo[0]);
        let mapped = interpolate_1d_simd(lut, position, 0);
        Luma([(mapped.clamp(0.0, 1.0) * 255.0) as u8])
    })
}

/// Apply a 1D LUT to an 8-bit grayscale + alpha buffer and return the result as a new buffer.
///
/// Both samples of each pixel are mapped: the first through LUT channel 0 and the second
/// through LUT channel 1, each normalised with the matching channel of the cube LUT's domain.
pub fn apply_to_image_buffer_luma_a(
    cube_lut: &crate::lut::CubeLut,
    image: &ImageBuffer<LumaA<u8>, Vec<u8>>,
    lut: &Lut1D,
) -> ImageBuffer<LumaA<u8>, Vec<u8>> {
    let lo = cube_lut.domain_min;
    let hi = cube_lut.domain_max;

    // Scale a sample to [0, 1], normalise it to the channel's domain, look it up and quantise.
    let remap = |sample: u8, channel: usize| -> u8 {
        let unit = sample as f32 / 255.0;
        let position = (unit - lo[channel]) / (hi[channel] - lo[channel]);
        let mapped = interpolate_1d_simd(lut, position, channel);
        (mapped.clamp(0.0, 1.0) * 255.0) as u8
    };

    let (width, height) = image.dimensions();
    let mut shaded: ImageBuffer<LumaA<u8>, Vec<u8>> = ImageBuffer::new(width, height);
    for (x, y, src) in image.enumerate_pixels() {
        shaded.put_pixel(x, y, LumaA([remap(src[0], 0), remap(src[1], 1)]));
    }
    shaded
}

/// Apply a 1D LUT to an 8-bit RGB buffer and return the result as a new buffer.
///
/// Each of R, G and B is scaled to `[0, 1]`, normalised with its own channel of the cube LUT's
/// domain, mapped through [`apply_1d_lut_simd`], then clamped and quantised back to 8 bits.
pub fn apply_to_image_buffer_rgb(
    cube_lut: &crate::lut::CubeLut,
    image: &ImageBuffer<ImageRgb<u8>, Vec<u8>>,
    lut: &Lut1D,
) -> ImageBuffer<ImageRgb<u8>, Vec<u8>> {
    let lo = cube_lut.domain_min;
    let hi = cube_lut.domain_max;

    // Clamp a LUT output to [0, 1] and quantise it to 8 bits.
    let to_byte = |level: f32| (level.clamp(0.0, 1.0) * 255.0) as u8;

    let (width, height) = image.dimensions();
    ImageBuffer::from_fn(width, height, |x, y| {
        let src = image.get_pixel(x, y);

        // Domain-normalised position of channel `c` of the current pixel.
        let position = |c: usize| ((src[c] as f32 / 255.0) - lo[c]) / (hi[c] - lo[c]);

        let mapped = apply_1d_lut_simd(lut, position(0), position(1), position(2));
        ImageRgb([to_byte(mapped[0]), to_byte(mapped[1]), to_byte(mapped[2])])
    })
}

/// Apply a 1D LUT to an 8-bit RGBA buffer and return the result as a new buffer.
///
/// R, G and B are normalised with their own channel of the cube LUT's domain and mapped
/// through [`apply_1d_lut_simd`]; the alpha sample is copied to the output unchanged.
pub fn apply_to_image_buffer_rgba(
    cube_lut: &crate::lut::CubeLut,
    image: &ImageBuffer<Rgba<u8>, Vec<u8>>,
    lut: &Lut1D,
) -> ImageBuffer<Rgba<u8>, Vec<u8>> {
    let lo = cube_lut.domain_min;
    let hi = cube_lut.domain_max;

    // Clamp a LUT output to [0, 1] and quantise it to 8 bits.
    let to_byte = |level: f32| (level.clamp(0.0, 1.0) * 255.0) as u8;

    let (width, height) = image.dimensions();
    let mut shaded: ImageBuffer<Rgba<u8>, Vec<u8>> = ImageBuffer::new(width, height);

    for (x, y, src) in image.enumerate_pixels() {
        let red = ((src[0] as f32 / 255.0) - lo[0]) / (hi[0] - lo[0]);
        let green = ((src[1] as f32 / 255.0) - lo[1]) / (hi[1] - lo[1]);
        let blue = ((src[2] as f32 / 255.0) - lo[2]) / (hi[2] - lo[2]);

        let mapped = apply_1d_lut_simd(lut, red, green, blue);

        // Alpha is carried over from the source pixel untouched.
        let out = Rgba([
            to_byte(mapped[0]),
            to_byte(mapped[1]),
            to_byte(mapped[2]),
            src[3],
        ]);
        shaded.put_pixel(x, y, out);
    }

    shaded
}

/// Apply a 1D LUT to an 8-bit RGB buffer in place, without allocating an output buffer.
///
/// The raw sample slice is walked three bytes at a time in memory order. Each of R, G and B is
/// scaled to `[0, 1]`, normalised with its own channel of the cube LUT's domain, mapped through
/// [`apply_1d_lut_simd`], then clamped, quantised and written back over the source sample.
/// The reciprocals used for scaling and normalisation are computed once before the loop.
#[inline]
pub fn apply_to_image_buffer_rgb_mut(
    cube_lut: &crate::lut::CubeLut,
    image: &mut ImageBuffer<ImageRgb<u8>, Vec<u8>>,
    lut: &Lut1D,
) {
    let (width, height) = image.dimensions();
    // Widen before multiplying: `width * height` evaluated in `u32` overflows for images with
    // more than `u32::MAX` pixels (panic in debug builds, silent wrap-around in release that
    // would leave part of the image unprocessed).
    let total_pixels = width as usize * height as usize;

    let domain_min = cube_lut.domain_min;
    let domain_max = cube_lut.domain_max;

    // Pre-calculate inverse domain ranges
    let inv_domain_range_r = 1.0 / (domain_max[0] - domain_min[0]);
    let inv_domain_range_g = 1.0 / (domain_max[1] - domain_min[1]);
    let inv_domain_range_b = 1.0 / (domain_max[2] - domain_min[2]);

    let inv_255 = 1.0 / 255.0;

    // Get mutable access to raw pixel data
    let samples: &mut [u8] = image.as_mut();

    // One exact 3-byte chunk per pixel; `take` limits the walk to the pixels the image
    // actually declares.
    for px in samples.chunks_exact_mut(3).take(total_pixels) {
        // Read current pixel values
        let r = px[0] as f32 * inv_255;
        let g = px[1] as f32 * inv_255;
        let b = px[2] as f32 * inv_255;

        // Normalize
        let normalized_r = (r - domain_min[0]) * inv_domain_range_r;
        let normalized_g = (g - domain_min[1]) * inv_domain_range_g;
        let normalized_b = (b - domain_min[2]) * inv_domain_range_b;

        // Apply LUT
        let output_rgb = apply_1d_lut_simd(lut, normalized_r, normalized_g, normalized_b);

        // Write back directly to same memory location
        px[0] = (output_rgb[0].clamp(0.0, 1.0) * 255.0) as u8;
        px[1] = (output_rgb[1].clamp(0.0, 1.0) * 255.0) as u8;
        px[2] = (output_rgb[2].clamp(0.0, 1.0) * 255.0) as u8;
    }
}

/// Apply a 1D LUT to an 8-bit RGBA buffer in place, without allocating an output buffer.
///
/// The raw sample slice is walked four bytes at a time in memory order. R, G and B are scaled
/// to `[0, 1]`, normalised with their own channel of the cube LUT's domain, mapped through
/// [`apply_1d_lut_simd`], then clamped, quantised and written back. The alpha byte of every
/// pixel is neither read nor written, so it is preserved.
#[inline]
pub fn apply_to_image_buffer_rgba_mut(
    cube_lut: &crate::lut::CubeLut,
    image: &mut ImageBuffer<Rgba<u8>, Vec<u8>>,
    lut: &Lut1D,
) {
    let (width, height) = image.dimensions();
    // Widen before multiplying: `width * height` evaluated in `u32` overflows for images with
    // more than `u32::MAX` pixels (panic in debug builds, silent wrap-around in release that
    // would leave part of the image unprocessed).
    let total_pixels = width as usize * height as usize;

    let domain_min = cube_lut.domain_min;
    let domain_max = cube_lut.domain_max;

    // Pre-calculate inverse domain ranges
    let inv_domain_range_r = 1.0 / (domain_max[0] - domain_min[0]);
    let inv_domain_range_g = 1.0 / (domain_max[1] - domain_min[1]);
    let inv_domain_range_b = 1.0 / (domain_max[2] - domain_min[2]);

    let inv_255 = 1.0 / 255.0;

    // Get mutable access to raw pixel data
    let samples: &mut [u8] = image.as_mut();

    // One exact 4-byte chunk per pixel; `take` limits the walk to the pixels the image
    // actually declares.
    for px in samples.chunks_exact_mut(4).take(total_pixels) {
        // Read current pixel values (alpha at px[3] is skipped)
        let r = px[0] as f32 * inv_255;
        let g = px[1] as f32 * inv_255;
        let b = px[2] as f32 * inv_255;

        // Normalize
        let normalized_r = (r - domain_min[0]) * inv_domain_range_r;
        let normalized_g = (g - domain_min[1]) * inv_domain_range_g;
        let normalized_b = (b - domain_min[2]) * inv_domain_range_b;

        // Apply LUT
        let output_rgb = apply_1d_lut_simd(lut, normalized_r, normalized_g, normalized_b);

        // Write back directly to same memory location; px[3] (alpha) stays unchanged
        px[0] = (output_rgb[0].clamp(0.0, 1.0) * 255.0) as u8;
        px[1] = (output_rgb[1].clamp(0.0, 1.0) * 255.0) as u8;
        px[2] = (output_rgb[2].clamp(0.0, 1.0) * 255.0) as u8;
    }
}