oximedia-accel 0.1.0

//! CPU fallback implementations for hardware acceleration operations.

use crate::error::{AccelError, AccelResult};
use crate::traits::{HardwareAccel, ScaleFilter};
use oximedia_core::PixelFormat;
use rayon::prelude::*;

/// CPU-based fallback acceleration backend.
///
/// This backend uses multi-threaded CPU implementations via rayon
/// when GPU acceleration is unavailable.
pub struct CpuAccel;

impl CpuAccel {
    /// Creates a new CPU acceleration backend.
    #[must_use]
    pub fn new() -> Self {
        Self
    }
}

impl Default for CpuAccel {
    fn default() -> Self {
        Self::new()
    }
}

impl HardwareAccel for CpuAccel {
    fn scale_image(
        &self,
        input: &[u8],
        src_width: u32,
        src_height: u32,
        dst_width: u32,
        dst_height: u32,
        format: PixelFormat,
        filter: ScaleFilter,
    ) -> AccelResult<Vec<u8>> {
        let channels = match format {
            PixelFormat::Rgb24 => 3,
            PixelFormat::Rgba32 => 4,
            PixelFormat::Gray8 => 1,
            _ => {
                return Err(AccelError::InvalidFormat(format!(
                    "Unsupported format: {format:?}"
                )))
            }
        };

        let expected_size = (src_width * src_height * channels) as usize;
        if input.len() != expected_size {
            return Err(AccelError::BufferSizeMismatch {
                expected: expected_size,
                actual: input.len(),
            });
        }

        let output_size = (dst_width * dst_height * channels) as usize;
        let mut output = vec![0u8; output_size];

        match filter {
            ScaleFilter::Nearest => {
                scale_nearest(
                    input,
                    &mut output,
                    src_width,
                    src_height,
                    dst_width,
                    dst_height,
                    channels,
                );
            }
            ScaleFilter::Bilinear | ScaleFilter::Bicubic | ScaleFilter::Lanczos => {
                scale_bilinear(
                    input,
                    &mut output,
                    src_width,
                    src_height,
                    dst_width,
                    dst_height,
                    channels,
                );
            }
        }

        Ok(output)
    }

    fn convert_color(
        &self,
        input: &[u8],
        width: u32,
        height: u32,
        src_format: PixelFormat,
        dst_format: PixelFormat,
    ) -> AccelResult<Vec<u8>> {
        match (src_format, dst_format) {
            (PixelFormat::Rgb24, PixelFormat::Yuv420p) => rgb_to_yuv420p(input, width, height),
            (PixelFormat::Yuv420p, PixelFormat::Rgb24) => yuv420p_to_rgb(input, width, height),
            _ => Err(AccelError::Unsupported(format!(
                "Color conversion from {src_format:?} to {dst_format:?} not implemented"
            ))),
        }
    }

    fn motion_estimation(
        &self,
        reference: &[u8],
        current: &[u8],
        width: u32,
        height: u32,
        block_size: u32,
    ) -> AccelResult<Vec<(i16, i16)>> {
        let expected_size = (width * height) as usize;
        if reference.len() != expected_size || current.len() != expected_size {
            return Err(AccelError::BufferSizeMismatch {
                expected: expected_size,
                actual: reference.len().min(current.len()),
            });
        }

        Ok(block_motion_estimation(
            reference, current, width, height, block_size,
        ))
    }
}

/// Nearest neighbor scaling (CPU implementation).
#[allow(clippy::cast_possible_truncation)]
#[allow(clippy::cast_sign_loss)]
fn scale_nearest(
    input: &[u8],
    output: &mut [u8],
    src_width: u32,
    src_height: u32,
    dst_width: u32,
    dst_height: u32,
    channels: u32,
) {
    output
        .par_chunks_exact_mut((dst_width * channels) as usize)
        .enumerate()
        .for_each(|(y, row)| {
            let src_y = (y as u32 * src_height) / dst_height;
            for x in 0..dst_width {
                let src_x = (x * src_width) / dst_width;
                let src_idx = ((src_y * src_width + src_x) * channels) as usize;
                let dst_idx = (x * channels) as usize;

                let channels_usize = channels as usize;
                row[dst_idx..dst_idx + channels_usize]
                    .copy_from_slice(&input[src_idx..src_idx + channels_usize]);
            }
        });
}

/// Bilinear scaling (CPU implementation).
#[allow(clippy::cast_possible_truncation)]
#[allow(clippy::cast_sign_loss)]
#[allow(clippy::cast_precision_loss)]
fn scale_bilinear(
    input: &[u8],
    output: &mut [u8],
    src_width: u32,
    src_height: u32,
    dst_width: u32,
    dst_height: u32,
    channels: u32,
) {
    let x_ratio = (src_width - 1) as f32 / dst_width as f32;
    let y_ratio = (src_height - 1) as f32 / dst_height as f32;

    output
        .par_chunks_exact_mut((dst_width * channels) as usize)
        .enumerate()
        .for_each(|(y, row)| {
            let src_y = y as f32 * y_ratio;
            let y1 = src_y.floor() as u32;
            let y2 = (y1 + 1).min(src_height - 1);
            let y_frac = src_y - y1 as f32;

            for x in 0..dst_width {
                let src_x = x as f32 * x_ratio;
                let x1 = src_x.floor() as u32;
                let x2 = (x1 + 1).min(src_width - 1);
                let x_frac = src_x - x1 as f32;

                let dst_idx = (x * channels) as usize;

                for c in 0..channels as usize {
                    let p11 = f32::from(input[((y1 * src_width + x1) * channels) as usize + c]);
                    let p12 = f32::from(input[((y2 * src_width + x1) * channels) as usize + c]);
                    let p21 = f32::from(input[((y1 * src_width + x2) * channels) as usize + c]);
                    let p22 = f32::from(input[((y2 * src_width + x2) * channels) as usize + c]);

                    let p1 = p11 * (1.0 - x_frac) + p21 * x_frac;
                    let p2 = p12 * (1.0 - x_frac) + p22 * x_frac;
                    let result = p1 * (1.0 - y_frac) + p2 * y_frac;

                    row[dst_idx + c] = result.clamp(0.0, 255.0) as u8;
                }
            }
        });
}

/// RGB to `YUV420p` conversion (CPU implementation).
#[allow(clippy::cast_possible_truncation)]
#[allow(clippy::cast_sign_loss)]
#[allow(clippy::cast_precision_loss)]
fn rgb_to_yuv420p(input: &[u8], width: u32, height: u32) -> AccelResult<Vec<u8>> {
    let expected_size = (width * height * 3) as usize;
    if input.len() != expected_size {
        return Err(AccelError::BufferSizeMismatch {
            expected: expected_size,
            actual: input.len(),
        });
    }

    let y_size = (width * height) as usize;
    let uv_size = (width * height / 4) as usize;
    let mut output = vec![0u8; y_size + uv_size * 2];

    let (y_plane, uv_planes) = output.split_at_mut(y_size);
    let (u_plane, v_plane) = uv_planes.split_at_mut(uv_size);

    // Convert Y plane
    y_plane.par_iter_mut().enumerate().for_each(|(i, y)| {
        let rgb_idx = i * 3;
        let r = f32::from(input[rgb_idx]);
        let g = f32::from(input[rgb_idx + 1]);
        let b = f32::from(input[rgb_idx + 2]);

        *y = (0.299 * r + 0.587 * g + 0.114 * b).clamp(0.0, 255.0) as u8;
    });

    // Convert U and V planes (subsampled 2x2)
    u_plane
        .par_iter_mut()
        .zip(v_plane.par_iter_mut())
        .enumerate()
        .for_each(|(i, (u, v))| {
            let uv_x = (i as u32 % (width / 2)) * 2;
            let uv_y = (i as u32 / (width / 2)) * 2;
            let rgb_idx = ((uv_y * width + uv_x) * 3) as usize;

            let r = f32::from(input[rgb_idx]);
            let g = f32::from(input[rgb_idx + 1]);
            let b = f32::from(input[rgb_idx + 2]);

            *u = (-0.169 * r - 0.331 * g + 0.500 * b + 128.0).clamp(0.0, 255.0) as u8;
            *v = (0.500 * r - 0.419 * g - 0.081 * b + 128.0).clamp(0.0, 255.0) as u8;
        });

    Ok(output)
}

/// `YUV420p` to RGB conversion (CPU implementation).
#[allow(clippy::cast_possible_truncation)]
#[allow(clippy::cast_sign_loss)]
#[allow(clippy::cast_precision_loss)]
fn yuv420p_to_rgb(input: &[u8], width: u32, height: u32) -> AccelResult<Vec<u8>> {
    let y_size = (width * height) as usize;
    let uv_size = (width * height / 4) as usize;
    let expected_size = y_size + uv_size * 2;

    if input.len() != expected_size {
        return Err(AccelError::BufferSizeMismatch {
            expected: expected_size,
            actual: input.len(),
        });
    }

    let y_plane = &input[..y_size];
    let u_plane = &input[y_size..y_size + uv_size];
    let v_plane = &input[y_size + uv_size..];

    let output_size = (width * height * 3) as usize;
    let mut output = vec![0u8; output_size];

    output
        .par_chunks_exact_mut(3)
        .enumerate()
        .for_each(|(i, pixel)| {
            let pixel_x = i as u32 % width;
            let pixel_y = i as u32 / width;
            let uv_idx = ((pixel_y / 2) * (width / 2) + (pixel_x / 2)) as usize;

            let y_val = f32::from(y_plane[i]);
            let u_val = f32::from(u_plane[uv_idx]) - 128.0;
            let v_val = f32::from(v_plane[uv_idx]) - 128.0;

            let red = (y_val + 1.402 * v_val).clamp(0.0, 255.0) as u8;
            let green = (y_val - 0.344 * u_val - 0.714 * v_val).clamp(0.0, 255.0) as u8;
            let blue = (y_val + 1.772 * u_val).clamp(0.0, 255.0) as u8;

            pixel[0] = red;
            pixel[1] = green;
            pixel[2] = blue;
        });

    Ok(output)
}

/// Block-based motion estimation (CPU implementation).
#[allow(clippy::cast_possible_truncation)]
#[allow(clippy::cast_possible_wrap)]
fn block_motion_estimation(
    reference: &[u8],
    current: &[u8],
    width: u32,
    height: u32,
    block_size: u32,
) -> Vec<(i16, i16)> {
    let blocks_wide = width.div_ceil(block_size);
    let blocks_high = height.div_ceil(block_size);
    let search_range = 8i32;

    let motion_vectors: Vec<(i16, i16)> = (0..blocks_high)
        .into_par_iter()
        .flat_map(|block_y| {
            (0..blocks_wide).into_par_iter().map(move |block_x| {
                let cur_x = block_x * block_size;
                let cur_y = block_y * block_size;

                let mut best_sad = u32::MAX;
                let mut best_delta_x = 0i16;
                let mut best_delta_y = 0i16;

                for dy in -search_range..=search_range {
                    for dx in -search_range..=search_range {
                        let ref_x = cur_x as i32 + dx;
                        let ref_y = cur_y as i32 + dy;

                        if ref_x < 0
                            || ref_y < 0
                            || ref_x + block_size as i32 > width as i32
                            || ref_y + block_size as i32 > height as i32
                        {
                            continue;
                        }

                        let mut sad = 0u32;
                        for by in 0..block_size {
                            for bx in 0..block_size {
                                #[allow(clippy::cast_sign_loss)]
                                let rx = (ref_x + bx as i32) as u32;
                                #[allow(clippy::cast_sign_loss)]
                                let ry = (ref_y + by as i32) as u32;
                                let cx = cur_x + bx;
                                let cy = cur_y + by;

                                if rx >= width || ry >= height || cx >= width || cy >= height {
                                    continue;
                                }

                                let ref_idx = (ry * width + rx) as usize;
                                let cur_idx = (cy * width + cx) as usize;

                                sad += u32::from(reference[ref_idx].abs_diff(current[cur_idx]));
                            }
                        }

                        if sad < best_sad {
                            best_sad = sad;
                            best_delta_x = dx as i16;
                            best_delta_y = dy as i16;
                        }
                    }
                }

                (best_delta_x, best_delta_y)
            })
        })
        .collect();

    motion_vectors
}