rav1d-safe 0.5.7

//! Safe SIMD implementations of intra prediction functions
#![allow(deprecated)] // FFI wrappers need to forge tokens
#![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
//!
//! Replaces hand-written assembly with safe Rust intrinsics.
//!
//! Implemented so far:
//! - DC_128 prediction (constant fill with mid-value)
//! - Vertical prediction (copy top row)
//! - Horizontal prediction (fill from left pixels)

#![allow(unused)]

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
use archmage::X64V4xToken;
use archmage::{Desktop64, Server64, SimdToken, arcane};
use std::ffi::c_int;
#[allow(non_camel_case_types)]
type ptrdiff_t = isize;

#[cfg(target_arch = "x86_64")]
use super::partial_simd;
#[cfg(target_arch = "x86_64")]
use crate::src::safe_simd::pixel_access::{
    Flex, loadu_128, loadu_256, loadu_512, storeu_128, storeu_256, storeu_512,
};

use crate::include::common::bitdepth::DynPixel;
use crate::include::dav1d::picture::PicOffset;
use crate::src::ffi_safe::FFISafe;

// ============================================================================
// DC_128 Prediction (fill with mid-value)
// ============================================================================

/// DC_128 prediction: fill block with 128 (or 1 << (bitdepth - 1))
///
/// For 8bpc, fills with 128. This is the simplest prediction mode.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let fill_val = _mm256_set1_epi8(128u8 as i8);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let row = &mut dst[row_off..][..width];

        // Fill row with 128
        let mut x = 0;
        while x + 32 <= width {
            storeu_256!((&mut row[x..x + 32]), [u8; 32], fill_val);
            x += 32;
        }
        while x + 16 <= width {
            storeu_128!(
                &mut row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(fill_val)
            );
            x += 16;
        }
        while x < width {
            row[x] = 128;
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_128_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    _topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    ipred_dc_128_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        width as usize,
        height as usize,
    );
}

/// DC_128 prediction using AVX-512 (64-byte stores)
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let fill_val = _mm512_set1_epi8(128u8 as i8);
    let fill_256 = _mm256_set1_epi8(128u8 as i8);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let row = &mut dst[row_off..][..width];

        let mut x = 0;
        while x + 64 <= width {
            storeu_512!((&mut row[x..x + 64]), [u8; 64], fill_val);
            x += 64;
        }
        while x + 32 <= width {
            storeu_256!((&mut row[x..x + 32]), [u8; 32], fill_256);
            x += 32;
        }
        while x + 16 <= width {
            storeu_128!(
                &mut row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            x += 16;
        }
        while x < width {
            row[x] = 128;
            x += 1;
        }
    }
}

/// Vertical prediction using AVX-512 (64-byte loads/stores)
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let top_off = tl_off + 1;

    match width {
        4 => {
            let top_val = _mm_cvtsi32_si128(i32::from_ne_bytes(
                topleft[top_off..top_off + 4].try_into().unwrap(),
            ));
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                dst[row_off..row_off + 4]
                    .copy_from_slice(&_mm_cvtsi128_si32(top_val).to_ne_bytes());
            }
        }
        8 => {
            let top_val = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&topleft[top_off..top_off + 8]).try_into().unwrap(),
            );
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                partial_simd::mm_storel_epi64::<[u8; 8]>(
                    (&mut dst[row_off..row_off + 8]).try_into().unwrap(),
                    top_val,
                );
            }
        }
        16 => {
            let top_val = loadu_128!((&topleft[top_off..top_off + 16]), [u8; 16]);
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                storeu_128!((&mut dst[row_off..row_off + 16]), [u8; 16], top_val);
            }
        }
        32 => {
            let top_val = loadu_256!((&topleft[top_off..top_off + 32]), [u8; 32]);
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                storeu_256!((&mut dst[row_off..row_off + 32]), [u8; 32], top_val);
            }
        }
        64 => {
            // Single 512-bit load instead of 2x 256-bit
            let top_val = loadu_512!((&topleft[top_off..top_off + 64]), [u8; 64]);
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                storeu_512!((&mut dst[row_off..row_off + 64]), [u8; 64], top_val);
            }
        }
        _ => {
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                dst[row_off..row_off + width].copy_from_slice(&topleft[top_off..top_off + width]);
            }
        }
    }
}

// ============================================================================
// Vertical Prediction (copy top row)
// ============================================================================

/// Vertical prediction: copy the top row to all rows in the block
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Top pixels start at topleft + 1
    let top_off = tl_off + 1;

    // Load top row into register(s)
    match width {
        4 => {
            let top_val = _mm_cvtsi32_si128(i32::from_ne_bytes(
                topleft[top_off..top_off + 4].try_into().unwrap(),
            ));
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                dst[row_off..row_off + 4]
                    .copy_from_slice(&_mm_cvtsi128_si32(top_val).to_ne_bytes());
            }
        }
        8 => {
            let top_val = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&topleft[top_off..top_off + 8]).try_into().unwrap(),
            );
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                partial_simd::mm_storel_epi64::<[u8; 8]>(
                    (&mut dst[row_off..row_off + 8]).try_into().unwrap(),
                    top_val,
                );
            }
        }
        16 => {
            let top_val = loadu_128!((&topleft[top_off..top_off + 16]), [u8; 16]);
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                storeu_128!((&mut dst[row_off..row_off + 16]), [u8; 16], top_val);
            }
        }
        32 => {
            let top_val = loadu_256!((&topleft[top_off..top_off + 32]), [u8; 32]);
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                storeu_256!((&mut dst[row_off..row_off + 32]), [u8; 32], top_val);
            }
        }
        64 => {
            let top_val0 = loadu_256!((&topleft[top_off..top_off + 32]), [u8; 32]);
            let top_val1 = loadu_256!((&topleft[top_off + 32..top_off + 64]), [u8; 32]);
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                storeu_256!((&mut dst[row_off..row_off + 32]), [u8; 32], top_val0);
                storeu_256!((&mut dst[row_off + 32..row_off + 64]), [u8; 32], top_val1);
            }
        }
        _ => {
            // General case
            for y in 0..height {
                let row_off = (dst_base as isize + y as isize * stride) as usize;
                dst[row_off..row_off + width].copy_from_slice(&topleft[top_off..top_off + width]);
            }
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_v_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_v_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

// ============================================================================
// Horizontal Prediction (fill from left pixels)
// ============================================================================

/// Horizontal prediction: fill each row with the left pixel value
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let row = &mut dst[row_off..][..width];
        // Left pixels are at topleft - y - 1
        let left_pixel = topleft[tl_off - y - 1];

        // Broadcast pixel value
        let fill_val = _mm256_set1_epi8(left_pixel as i8);

        let mut x = 0;
        while x + 32 <= width {
            storeu_256!((&mut row[x..x + 32]), [u8; 32], fill_val);
            x += 32;
        }
        while x + 16 <= width {
            storeu_128!(
                &mut row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(fill_val)
            );
            x += 16;
        }
        while x < width {
            row[x] = left_pixel;
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_h_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_h_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// Horizontal prediction using AVX-512 (64-byte fills)
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let row = &mut dst[row_off..][..width];
        let left_pixel = topleft[tl_off - y - 1];
        let fill_512 = _mm512_set1_epi8(left_pixel as i8);
        let fill_256 = _mm256_set1_epi8(left_pixel as i8);

        let mut x = 0;
        while x + 64 <= width {
            storeu_512!((&mut row[x..x + 64]), [u8; 64], fill_512);
            x += 64;
        }
        while x + 32 <= width {
            storeu_256!((&mut row[x..x + 32]), [u8; 32], fill_256);
            x += 32;
        }
        while x + 16 <= width {
            storeu_128!(
                &mut row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            x += 16;
        }
        while x < width {
            row[x] = left_pixel;
            x += 1;
        }
    }
}

// ============================================================================
// DC Prediction AVX-512 variants (8bpc)
// ============================================================================

/// DC prediction using AVX-512 (64-byte stores)
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let mut sum: u32 = 0;
    for x in 0..width {
        sum += topleft[tl_off + 1 + x] as u32;
    }
    for y in 0..height {
        sum += topleft[tl_off - y - 1] as u32;
    }
    let total = width + height;
    let dc_val = ((sum + (total as u32 >> 1)) / total as u32) as u8;

    let fill_512 = _mm512_set1_epi8(dc_val as i8);
    let fill_256 = _mm256_set1_epi8(dc_val as i8);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let row = &mut dst[row_off..][..width];
        let mut x = 0;
        while x + 64 <= width {
            storeu_512!((&mut row[x..x + 64]), [u8; 64], fill_512);
            x += 64;
        }
        while x + 32 <= width {
            storeu_256!((&mut row[x..x + 32]), [u8; 32], fill_256);
            x += 32;
        }
        while x + 16 <= width {
            storeu_128!(
                &mut row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            x += 16;
        }
        while x < width {
            row[x] = dc_val;
            x += 1;
        }
    }
}

/// DC_TOP prediction using AVX-512 (64-byte stores)
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let mut sum: u32 = 0;
    for x in 0..width {
        sum += topleft[tl_off + 1 + x] as u32;
    }
    let dc_val = ((sum + (width as u32 >> 1)) / width as u32) as u8;

    let fill_512 = _mm512_set1_epi8(dc_val as i8);
    let fill_256 = _mm256_set1_epi8(dc_val as i8);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let row = &mut dst[row_off..][..width];
        let mut x = 0;
        while x + 64 <= width {
            storeu_512!((&mut row[x..x + 64]), [u8; 64], fill_512);
            x += 64;
        }
        while x + 32 <= width {
            storeu_256!((&mut row[x..x + 32]), [u8; 32], fill_256);
            x += 32;
        }
        while x + 16 <= width {
            storeu_128!(
                &mut row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            x += 16;
        }
        while x < width {
            row[x] = dc_val;
            x += 1;
        }
    }
}

/// DC_LEFT prediction using AVX-512 (64-byte stores)
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let mut sum: u32 = 0;
    for y in 0..height {
        sum += topleft[tl_off - y - 1] as u32;
    }
    let dc_val = ((sum + (height as u32 >> 1)) / height as u32) as u8;

    let fill_512 = _mm512_set1_epi8(dc_val as i8);
    let fill_256 = _mm256_set1_epi8(dc_val as i8);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let row = &mut dst[row_off..][..width];
        let mut x = 0;
        while x + 64 <= width {
            storeu_512!((&mut row[x..x + 64]), [u8; 64], fill_512);
            x += 64;
        }
        while x + 32 <= width {
            storeu_256!((&mut row[x..x + 32]), [u8; 32], fill_256);
            x += 32;
        }
        while x + 16 <= width {
            storeu_128!(
                &mut row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            x += 16;
        }
        while x < width {
            row[x] = dc_val;
            x += 1;
        }
    }
}

// ============================================================================
// DC Prediction (average of top and left)
// ============================================================================

/// DC prediction: fill block with average of top and left edge pixels
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Sum top pixels
    let mut sum: u32 = 0;
    for x in 0..width {
        sum += topleft[tl_off + 1 + x] as u32;
    }
    // Sum left pixels
    for y in 0..height {
        sum += topleft[tl_off - y - 1] as u32;
    }

    // Calculate average (rounded)
    let total = width + height;
    let dc_val = ((sum + (total as u32 >> 1)) / total as u32) as u8;

    // Fill block
    let fill_val = _mm256_set1_epi8(dc_val as i8);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let row = &mut dst[row_off..][..width];

        let mut x = 0;
        while x + 32 <= width {
            storeu_256!((&mut row[x..x + 32]), [u8; 32], fill_val);
            x += 32;
        }
        while x + 16 <= width {
            storeu_128!(
                &mut row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(fill_val)
            );
            x += 16;
        }
        while x < width {
            row[x] = dc_val;
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_dc_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// DC_TOP prediction: fill block with average of top edge only
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Sum top pixels
    let mut sum: u32 = 0;
    for x in 0..width {
        sum += topleft[tl_off + 1 + x] as u32;
    }

    // Calculate average (rounded)
    let dc_val = ((sum + (width as u32 >> 1)) / width as u32) as u8;

    // Fill block
    let fill_val = _mm256_set1_epi8(dc_val as i8);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let row = &mut dst[row_off..][..width];

        let mut x = 0;
        while x + 32 <= width {
            storeu_256!((&mut row[x..x + 32]), [u8; 32], fill_val);
            x += 32;
        }
        while x + 16 <= width {
            storeu_128!(
                &mut row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(fill_val)
            );
            x += 16;
        }
        while x < width {
            row[x] = dc_val;
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_top_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_dc_top_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// DC_LEFT prediction: fill block with average of left edge only
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Sum left pixels
    let mut sum: u32 = 0;
    for y in 0..height {
        sum += topleft[tl_off - y - 1] as u32;
    }

    // Calculate average (rounded)
    let dc_val = ((sum + (height as u32 >> 1)) / height as u32) as u8;

    // Fill block
    let fill_val = _mm256_set1_epi8(dc_val as i8);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let row = &mut dst[row_off..][..width];

        let mut x = 0;
        while x + 32 <= width {
            storeu_256!((&mut row[x..x + 32]), [u8; 32], fill_val);
            x += 32;
        }
        while x + 16 <= width {
            storeu_128!(
                &mut row[x..x + 16],
                [u8; 16],
                _mm256_castsi256_si128(fill_val)
            );
            x += 16;
        }
        while x < width {
            row[x] = dc_val;
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_left_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_dc_left_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

// ============================================================================
// PAETH Prediction AVX-512
// ============================================================================

/// PAETH prediction 8bpc using AVX-512 — 16 pixels/iter with mask-based blending.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let topleft_val = topleft[tl_off] as i32;
    let topleft_vec = _mm512_set1_epi32(topleft_val);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_val = topleft[tl_off - y - 1] as i32;
        let left_vec = _mm512_set1_epi32(left_val);

        let mut x = 0;
        while x + 16 <= width {
            // Load 16 top pixels → i32
            let top_bytes = loadu_128!(&topleft[tl_off + 1 + x..tl_off + 1 + x + 16], [u8; 16]);
            let top = _mm512_cvtepu8_epi32(top_bytes);

            // base = left + top - topleft
            let base = _mm512_sub_epi32(_mm512_add_epi32(left_vec, top), topleft_vec);

            let ldiff = _mm512_abs_epi32(_mm512_sub_epi32(left_vec, base));
            let tdiff = _mm512_abs_epi32(_mm512_sub_epi32(top, base));
            let tldiff = _mm512_abs_epi32(_mm512_sub_epi32(topleft_vec, base));

            // AVX-512 mask comparisons: cmpgt returns __mmask16
            // ldiff <= tdiff: !(ldiff > tdiff) = ~(cmpgt(ldiff, tdiff))
            let ld_le_td = !_mm512_cmpgt_epi32_mask(ldiff, tdiff);
            let ld_le_tld = !_mm512_cmpgt_epi32_mask(ldiff, tldiff);
            let td_le_tld = !_mm512_cmpgt_epi32_mask(tdiff, tldiff);

            // use_left = ldiff <= tdiff && ldiff <= tldiff
            let use_left = ld_le_td & ld_le_tld;
            // use_top = !use_left && tdiff <= tldiff
            let use_top = !use_left & td_le_tld;

            // Start with topleft, overlay top where use_top, overlay left where use_left
            let result = _mm512_mask_blend_epi32(
                use_left,
                _mm512_mask_blend_epi32(use_top, topleft_vec, top),
                left_vec,
            );

            // Pack i32→u8 directly (values are 0..255, clamping is safe)
            let clamped = _mm512_max_epi32(result, _mm512_setzero_si512());
            let result_u8: __m128i = _mm512_cvtusepi32_epi8(clamped);
            storeu_128!(&mut dst[row_off + x..row_off + x + 16], [u8; 16], result_u8);

            x += 16;
        }

        // Scalar fallback
        let row = &mut dst[row_off..][..width];
        while x < width {
            let top_val = topleft[tl_off + 1 + x] as i32;
            let base = left_val + top_val - topleft_val;
            let ldiff = (left_val - base).abs();
            let tdiff = (top_val - base).abs();
            let tldiff = (topleft_val - base).abs();
            let result = if ldiff <= tdiff && ldiff <= tldiff {
                left_val
            } else if tdiff <= tldiff {
                top_val
            } else {
                topleft_val
            };
            row[x] = result as u8;
            x += 1;
        }
    }
}

// ============================================================================
// SMOOTH Prediction AVX-512 (8bpc)
// ============================================================================

/// Smooth prediction 8bpc using AVX-512 — 16 pixels/iter.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let right_val = topleft[tl_off + width] as i32;
    let bottom_val = topleft[tl_off - height] as i32;
    let right_vec = _mm512_set1_epi32(right_val);
    let bottom_vec = _mm512_set1_epi32(bottom_val);
    let rounding = _mm512_set1_epi32(256);
    let c256 = _mm512_set1_epi32(256);
    let zero_512 = _mm512_setzero_si512();

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_val = topleft[tl_off - y - 1] as i32;
        let left_vec = _mm512_set1_epi32(left_val);
        let w_v = weights_ver[y] as i32;
        let w_v_vec = _mm512_set1_epi32(w_v);
        let w_v_inv = _mm512_sub_epi32(c256, w_v_vec);

        let mut x = 0;
        while x + 16 <= width {
            // Load 16 top pixels → i32
            let top_bytes = loadu_128!(&topleft[tl_off + 1 + x..tl_off + 1 + x + 16], [u8; 16]);
            let top = _mm512_cvtepu8_epi32(top_bytes);

            // Load 16 horizontal weights → i32
            let wh_bytes = loadu_128!(&weights_hor[x..x + 16], [u8; 16]);
            let w_h = _mm512_cvtepu8_epi32(wh_bytes);
            let w_h_inv = _mm512_sub_epi32(c256, w_h);

            let vert = _mm512_add_epi32(
                _mm512_mullo_epi32(w_v_vec, top),
                _mm512_mullo_epi32(w_v_inv, bottom_vec),
            );
            let hor = _mm512_add_epi32(
                _mm512_mullo_epi32(w_h, left_vec),
                _mm512_mullo_epi32(w_h_inv, right_vec),
            );

            let pred = _mm512_add_epi32(vert, hor);
            let result = _mm512_srai_epi32::<9>(_mm512_add_epi32(pred, rounding));

            let clamped = _mm512_max_epi32(result, zero_512);
            let result_u8: __m128i = _mm512_cvtusepi32_epi8(clamped);
            storeu_128!(&mut dst[row_off + x..row_off + x + 16], [u8; 16], result_u8);

            x += 16;
        }

        // Scalar fallback
        let row = &mut dst[row_off..][..width];
        while x < width {
            let top_val = topleft[tl_off + 1 + x] as i32;
            let w_h = weights_hor[x] as i32;
            let pred =
                w_v * top_val + (256 - w_v) * bottom_val + w_h * left_val + (256 - w_h) * right_val;
            row[x] = ((pred + 256) >> 9) as u8;
            x += 1;
        }
    }
}

/// Smooth_V prediction 8bpc using AVX-512 — 16 pixels/iter.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let bottom_val = topleft[tl_off - height] as i32;
    let bottom_vec = _mm512_set1_epi32(bottom_val);
    let rounding = _mm512_set1_epi32(128);
    let c256 = _mm512_set1_epi32(256);
    let zero_512 = _mm512_setzero_si512();

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let w_v = weights_ver[y] as i32;
        let w_v_vec = _mm512_set1_epi32(w_v);
        let w_v_inv = _mm512_sub_epi32(c256, w_v_vec);

        let mut x = 0;
        while x + 16 <= width {
            let top_bytes = loadu_128!(&topleft[tl_off + 1 + x..tl_off + 1 + x + 16], [u8; 16]);
            let top = _mm512_cvtepu8_epi32(top_bytes);

            let pred = _mm512_add_epi32(
                _mm512_mullo_epi32(w_v_vec, top),
                _mm512_mullo_epi32(w_v_inv, bottom_vec),
            );
            let result = _mm512_srai_epi32::<8>(_mm512_add_epi32(pred, rounding));

            let clamped = _mm512_max_epi32(result, zero_512);
            let result_u8: __m128i = _mm512_cvtusepi32_epi8(clamped);
            storeu_128!(&mut dst[row_off + x..row_off + x + 16], [u8; 16], result_u8);

            x += 16;
        }

        let row = &mut dst[row_off..][..width];
        while x < width {
            let top_val = topleft[tl_off + 1 + x] as i32;
            let pred = w_v * top_val + (256 - w_v) * bottom_val;
            row[x] = ((pred + 128) >> 8) as u8;
            x += 1;
        }
    }
}

/// Smooth_H prediction 8bpc using AVX-512 — 16 pixels/iter.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_8bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let right_val = topleft[tl_off + width] as i32;
    let right_vec = _mm512_set1_epi32(right_val);
    let rounding = _mm512_set1_epi32(128);
    let c256 = _mm512_set1_epi32(256);
    let zero_512 = _mm512_setzero_si512();

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_val = topleft[tl_off - y - 1] as i32;
        let left_vec = _mm512_set1_epi32(left_val);

        let mut x = 0;
        while x + 16 <= width {
            let wh_bytes = loadu_128!(&weights_hor[x..x + 16], [u8; 16]);
            let w_h = _mm512_cvtepu8_epi32(wh_bytes);
            let w_h_inv = _mm512_sub_epi32(c256, w_h);

            let pred = _mm512_add_epi32(
                _mm512_mullo_epi32(w_h, left_vec),
                _mm512_mullo_epi32(w_h_inv, right_vec),
            );
            let result = _mm512_srai_epi32::<8>(_mm512_add_epi32(pred, rounding));

            let clamped = _mm512_max_epi32(result, zero_512);
            let result_u8: __m128i = _mm512_cvtusepi32_epi8(clamped);
            storeu_128!(&mut dst[row_off + x..row_off + x + 16], [u8; 16], result_u8);

            x += 16;
        }

        let row = &mut dst[row_off..][..width];
        while x < width {
            let w_h = weights_hor[x] as i32;
            let pred = w_h * left_val + (256 - w_h) * right_val;
            row[x] = ((pred + 128) >> 8) as u8;
            x += 1;
        }
    }
}

// ============================================================================
// PAETH Prediction
// ============================================================================

/// PAETH prediction: each pixel is closest of left, top, or topleft to (left + top - topleft)
///
/// For each pixel at (x, y):
///   base = left + top - topleft
///   ldiff = |left - base|
///   tdiff = |top - base|
///   tldiff = |topleft - base|
///   pick whichever of left/top/topleft has smallest diff
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let topleft_val = topleft[tl_off] as i32;
    let topleft_vec = _mm256_set1_epi32(topleft_val);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_val = topleft[tl_off - y - 1] as i32;
        let left_vec = _mm256_set1_epi32(left_val);

        // Process 8 pixels at a time with AVX2
        let mut x = 0;
        while x + 8 <= width {
            // Load 8 top pixels and zero-extend to 32-bit
            let top_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&topleft[tl_off + 1 + x..tl_off + 1 + x + 8])
                    .try_into()
                    .unwrap(),
            );
            let top_lo = _mm256_cvtepu8_epi32(top_bytes);

            // base = left + top - topleft
            let base = _mm256_sub_epi32(_mm256_add_epi32(left_vec, top_lo), topleft_vec);

            // ldiff = |left - base|
            let ldiff = _mm256_abs_epi32(_mm256_sub_epi32(left_vec, base));
            // tdiff = |top - base|
            let tdiff = _mm256_abs_epi32(_mm256_sub_epi32(top_lo, base));
            // tldiff = |topleft - base|
            let tldiff = _mm256_abs_epi32(_mm256_sub_epi32(topleft_vec, base));

            // Comparison: ldiff <= tdiff
            let ld_le_td = _mm256_or_si256(
                _mm256_cmpgt_epi32(tdiff, ldiff),
                _mm256_cmpeq_epi32(ldiff, tdiff),
            );
            // Comparison: ldiff <= tldiff
            let ld_le_tld = _mm256_or_si256(
                _mm256_cmpgt_epi32(tldiff, ldiff),
                _mm256_cmpeq_epi32(ldiff, tldiff),
            );
            // Comparison: tdiff <= tldiff
            let td_le_tld = _mm256_or_si256(
                _mm256_cmpgt_epi32(tldiff, tdiff),
                _mm256_cmpeq_epi32(tdiff, tldiff),
            );

            // if ldiff <= tdiff && ldiff <= tldiff: left
            // else if tdiff <= tldiff: top
            // else: topleft
            let use_left = _mm256_and_si256(ld_le_td, ld_le_tld);
            let use_top = _mm256_andnot_si256(use_left, td_le_tld);

            // Select: start with topleft, blend top if use_top, blend left if use_left
            let result = _mm256_blendv_epi8(
                _mm256_blendv_epi8(topleft_vec, top_lo, use_top),
                left_vec,
                use_left,
            );

            // Pack 32-bit to 8-bit
            let packed = _mm256_shuffle_epi8(
                result,
                _mm256_setr_epi8(
                    0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1,
                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                ),
            );
            let lo = _mm256_castsi256_si128(packed);
            let hi = _mm256_extracti128_si256::<1>(packed);
            let combined = _mm_unpacklo_epi32(lo, hi);
            partial_simd::mm_storel_epi64::<[u8; 8]>(
                (&mut dst[row_off + x..row_off + x + 8]).try_into().unwrap(),
                combined,
            );

            x += 8;
        }

        // Scalar fallback for remaining pixels
        let row = &mut dst[row_off..][..width];
        while x < width {
            let top_val = topleft[tl_off + 1 + x] as i32;
            let base = left_val + top_val - topleft_val;
            let ldiff = (left_val - base).abs();
            let tdiff = (top_val - base).abs();
            let tldiff = (topleft_val - base).abs();

            let result = if ldiff <= tdiff && ldiff <= tldiff {
                left_val
            } else if tdiff <= tldiff {
                top_val
            } else {
                topleft_val
            };
            row[x] = result as u8;
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_paeth_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_paeth_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

// ============================================================================
// SMOOTH Predictions (using weight tables)
// ============================================================================

use crate::src::tables::dav1d_sm_weights;

/// SMOOTH prediction: weighted blend of top/bottom and left/right edges
///
/// pred = w_v[y] * top + (256 - w_v[y]) * bottom + w_h[x] * left + (256 - w_h[x]) * right
/// dst = (pred + 256) >> 9
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let right_val = topleft[tl_off + width] as i32;
    let bottom_val = topleft[tl_off - height] as i32;
    let right_vec = _mm256_set1_epi32(right_val);
    let bottom_vec = _mm256_set1_epi32(bottom_val);
    let rounding = _mm256_set1_epi32(256);
    let c256 = _mm256_set1_epi32(256);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_val = topleft[tl_off - y - 1] as i32;
        let left_vec = _mm256_set1_epi32(left_val);
        let w_v = weights_ver[y] as i32;
        let w_v_vec = _mm256_set1_epi32(w_v);
        let w_v_inv = _mm256_sub_epi32(c256, w_v_vec);

        let mut x = 0;
        while x + 8 <= width {
            // Load 8 top pixels
            let top_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&topleft[tl_off + 1 + x..tl_off + 1 + x + 8])
                    .try_into()
                    .unwrap(),
            );
            let top = _mm256_cvtepu8_epi32(top_bytes);

            // Load 8 horizontal weights
            let w_h_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&weights_hor[x..x + 8]).try_into().unwrap(),
            );
            let w_h = _mm256_cvtepu8_epi32(w_h_bytes);
            let w_h_inv = _mm256_sub_epi32(c256, w_h);

            // Vertical component: w_v * top + (256 - w_v) * bottom
            let vert = _mm256_add_epi32(
                _mm256_mullo_epi32(w_v_vec, top),
                _mm256_mullo_epi32(w_v_inv, bottom_vec),
            );

            // Horizontal component: w_h * left + (256 - w_h) * right
            let hor = _mm256_add_epi32(
                _mm256_mullo_epi32(w_h, left_vec),
                _mm256_mullo_epi32(w_h_inv, right_vec),
            );

            // pred = vert + hor, result = (pred + 256) >> 9
            let pred = _mm256_add_epi32(vert, hor);
            let result = _mm256_srai_epi32::<9>(_mm256_add_epi32(pred, rounding));

            // Pack to 8-bit
            let packed = _mm256_shuffle_epi8(
                result,
                _mm256_setr_epi8(
                    0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1,
                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                ),
            );
            let lo = _mm256_castsi256_si128(packed);
            let hi = _mm256_extracti128_si256::<1>(packed);
            let combined = _mm_unpacklo_epi32(lo, hi);
            partial_simd::mm_storel_epi64::<[u8; 8]>(
                (&mut dst[row_off + x..row_off + x + 8]).try_into().unwrap(),
                combined,
            );

            x += 8;
        }

        // Scalar fallback
        let row = &mut dst[row_off..][..width];
        while x < width {
            let top_val = topleft[tl_off + 1 + x] as i32;
            let w_h = weights_hor[x] as i32;
            let pred =
                w_v * top_val + (256 - w_v) * bottom_val + w_h * left_val + (256 - w_h) * right_val;
            row[x] = ((pred + 256) >> 9) as u8;
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_smooth_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// SMOOTH_V prediction: vertical-only weighted blend (top/bottom)
///
/// pred = w_v[y] * top + (256 - w_v[y]) * bottom
/// dst = (pred + 128) >> 8
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let bottom_val = topleft[tl_off - height] as i32;
    let bottom_vec = _mm256_set1_epi32(bottom_val);
    let rounding = _mm256_set1_epi32(128);
    let c256 = _mm256_set1_epi32(256);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let w_v = weights_ver[y] as i32;
        let w_v_vec = _mm256_set1_epi32(w_v);
        let w_v_inv = _mm256_sub_epi32(c256, w_v_vec);

        let mut x = 0;
        while x + 8 <= width {
            // Load 8 top pixels
            let top_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&topleft[tl_off + 1 + x..tl_off + 1 + x + 8])
                    .try_into()
                    .unwrap(),
            );
            let top = _mm256_cvtepu8_epi32(top_bytes);

            // pred = w_v * top + (256 - w_v) * bottom
            let pred = _mm256_add_epi32(
                _mm256_mullo_epi32(w_v_vec, top),
                _mm256_mullo_epi32(w_v_inv, bottom_vec),
            );

            // result = (pred + 128) >> 8
            let result = _mm256_srai_epi32::<8>(_mm256_add_epi32(pred, rounding));

            // Pack to 8-bit
            let packed = _mm256_shuffle_epi8(
                result,
                _mm256_setr_epi8(
                    0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1,
                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                ),
            );
            let lo = _mm256_castsi256_si128(packed);
            let hi = _mm256_extracti128_si256::<1>(packed);
            let combined = _mm_unpacklo_epi32(lo, hi);
            partial_simd::mm_storel_epi64::<[u8; 8]>(
                (&mut dst[row_off + x..row_off + x + 8]).try_into().unwrap(),
                combined,
            );

            x += 8;
        }

        // Scalar fallback
        let row = &mut dst[row_off..][..width];
        while x < width {
            let top_val = topleft[tl_off + 1 + x] as i32;
            let pred = w_v * top_val + (256 - w_v) * bottom_val;
            row[x] = ((pred + 128) >> 8) as u8;
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_v_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_smooth_v_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// SMOOTH_H prediction: horizontal-only weighted blend (left/right)
///
/// pred = w_h[x] * left + (256 - w_h[x]) * right
/// dst = (pred + 128) >> 8
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let right_val = topleft[tl_off + width] as i32;
    let right_vec = _mm256_set1_epi32(right_val);
    let rounding = _mm256_set1_epi32(128);
    let c256 = _mm256_set1_epi32(256);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_val = topleft[tl_off - y - 1] as i32;
        let left_vec = _mm256_set1_epi32(left_val);

        let mut x = 0;
        while x + 8 <= width {
            // Load 8 horizontal weights
            let w_h_bytes = partial_simd::mm_loadl_epi64::<[u8; 8]>(
                (&weights_hor[x..x + 8]).try_into().unwrap(),
            );
            let w_h = _mm256_cvtepu8_epi32(w_h_bytes);
            let w_h_inv = _mm256_sub_epi32(c256, w_h);

            // pred = w_h * left + (256 - w_h) * right
            let pred = _mm256_add_epi32(
                _mm256_mullo_epi32(w_h, left_vec),
                _mm256_mullo_epi32(w_h_inv, right_vec),
            );

            // result = (pred + 128) >> 8
            let result = _mm256_srai_epi32::<8>(_mm256_add_epi32(pred, rounding));

            // Pack to 8-bit
            let packed = _mm256_shuffle_epi8(
                result,
                _mm256_setr_epi8(
                    0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1,
                    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                ),
            );
            let lo = _mm256_castsi256_si128(packed);
            let hi = _mm256_extracti128_si256::<1>(packed);
            let combined = _mm_unpacklo_epi32(lo, hi);
            partial_simd::mm_storel_epi64::<[u8; 8]>(
                (&mut dst[row_off + x..row_off + x + 8]).try_into().unwrap(),
                combined,
            );

            x += 8;
        }

        // Scalar fallback
        let row = &mut dst[row_off..][..width];
        while x < width {
            let w_h = weights_hor[x] as i32;
            let pred = w_h * left_val + (256 - w_h) * right_val;
            row[x] = ((pred + 128) >> 8) as u8;
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_h_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_smooth_h_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

// ============================================================================
// FILTER Prediction (filter intra)
// ============================================================================

use crate::src::tables::{FLT_INCR, dav1d_dr_intra_derivative, dav1d_filter_intra_taps, filter_fn};

/// FILTER prediction: uses directional filter taps on 4x2 blocks
///
/// Processes in 4x2 blocks. Each output pixel is:
/// sum = sum(filter[i] * p[i] for i in 0..7)
/// out = (sum + 8) >> 4
///
/// Input pixels:
/// p0 = topleft, p1-p4 = top row (4 pixels), p5-p6 = left column (2 pixels)
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_filter_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    filt_idx: i32,
    topleft_off: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let width = (width / 4) * 4; // Round down to multiple of 4
    let filt_idx = (filt_idx as usize) & 511;

    let filter = &dav1d_filter_intra_taps[filt_idx];

    // Process in 4x2 blocks
    for y in (0..height).step_by(2) {
        let cur_tl_off = topleft_off - y;
        let mut tl_pixel = topleft[tl_off.wrapping_add(cur_tl_off)] as i32;

        let row0_off = (dst_base as isize + y as isize * stride) as usize;
        let row1_off = (dst_base as isize + (y + 1) as isize * stride) as usize;

        for x in (0..width).step_by(4) {
            // Get top 4 pixels (p1-p4)
            // y=0: from topleft buffer; y>=2: from previously-written output row y-1
            let (p1, p2, p3, p4) = if y == 0 {
                let top_base = tl_off.wrapping_add(topleft_off + 1 + x);
                (
                    topleft[top_base] as i32,
                    topleft[top_base + 1] as i32,
                    topleft[top_base + 2] as i32,
                    topleft[top_base + 3] as i32,
                )
            } else {
                let top_row = (dst_base as isize + (y as isize - 1) * stride) as usize;
                (
                    dst[top_row + x] as i32,
                    dst[top_row + x + 1] as i32,
                    dst[top_row + x + 2] as i32,
                    dst[top_row + x + 3] as i32,
                )
            };

            // Get left 2 pixels (p5, p6)
            let (p5, p6) = if x == 0 {
                // From original topleft buffer
                let left_base = tl_off.wrapping_add(cur_tl_off.wrapping_sub(1));
                (
                    topleft[left_base] as i32,
                    topleft[left_base.wrapping_sub(1)] as i32,
                )
            } else {
                // From previously computed output
                (dst[row0_off + x - 1] as i32, dst[row1_off + x - 1] as i32)
            };

            let p0 = tl_pixel;
            let p = [p0, p1, p2, p3, p4, p5, p6];

            // Process 4x2 = 8 output pixels using filter taps
            let flt = filter.as_slice();
            let mut flt_offset = 0;

            // Row 0 (4 pixels)
            for xx in 0..4 {
                let acc = filter_fn(&flt[flt_offset..], p);
                let val = ((acc + 8) >> 4).clamp(0, 255) as u8;
                dst[row0_off + x + xx] = val;
                flt_offset += FLT_INCR;
            }

            // Row 1 (4 pixels)
            for xx in 0..4 {
                let acc = filter_fn(&flt[flt_offset..], p);
                let val = ((acc + 8) >> 4).clamp(0, 255) as u8;
                dst[row1_off + x + xx] = val;
                flt_offset += FLT_INCR;
            }

            // Update topleft for next 4x2 block (8bpc)
            tl_pixel = p4;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_filter_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    filt_idx: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_filter_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
        filt_idx as i32,
        topleft_off,
    );
}

// ============================================================================
// Z1 Prediction (angular prediction for angles < 90)
// ============================================================================

/// Z1 prediction: directional prediction using top edge only (angles < 90°)
///
/// For each pixel (x, y):
///   xpos = (y + 1) * dx
///   base = (xpos >> 6) + base_inc * x
///   frac = xpos & 0x3e
///   out = (top[base] * (64 - frac) + top[base+1] * frac + 32) >> 6
/// Z1 intra prediction SIMD inner for 8bpc.
/// Builds preprocessed top edge array internally, handles all cases.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z1_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;

    // Extract angle flags
    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;

    let mut dx = dav1d_dr_intra_derivative[(angle >> 1) as usize] as i32;

    let upsample_above = enable_intra_edge_filter
        && (90 - angle) < 40
        && ((width_i + height_i) as usize) <= (16 >> is_sm as usize);

    // Build preprocessed top edge array
    let mut top_out = [0u8; 64 + 64];
    let (top, max_base_x, base_inc);

    if upsample_above {
        upsample_edge_8bpc(
            &mut top_out,
            width_i + height_i,
            topleft,
            tl_off + 1,
            -1,
            width_i + std::cmp::min(width_i, height_i),
        );
        dx <<= 1;
        top = top_out.as_slice();
        max_base_x = (2 * (width_i + height_i) - 2) as usize;
        base_inc = 2usize;
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, 90 - angle, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            filter_edge_8bpc(
                &mut top_out,
                width_i + height_i,
                0,
                width_i + height_i,
                topleft,
                tl_off + 1,
                -1,
                width_i + std::cmp::min(width_i, height_i),
                filter_strength,
            );
            top = top_out.as_slice();
            max_base_x = (width_i + height_i - 1) as usize;
        } else {
            // No preprocessing needed — use topleft directly
            top = &topleft[tl_off + 1..];
            max_base_x = width + std::cmp::min(width, height) - 1;
        }
        base_inc = 1;
    };

    let top = top.flex();

    let rounding = _mm256_set1_epi16(32);

    for y in 0..height_i {
        let xpos = (y + 1) * dx;
        let frac = (xpos & 0x3e) as i16;
        let inv_frac = (64 - frac) as i16;

        let frac_vec = _mm256_set1_epi16(frac);
        let inv_frac_vec = _mm256_set1_epi16(inv_frac);

        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let base0 = (xpos >> 6) as usize;

        let mut x = 0usize;

        // SIMD path - 16 pixels at a time (non-upsampled consecutive access)
        if base_inc == 1 {
            while x + 16 <= width && base0 + x + 16 < max_base_x {
                let base = base0 + x;

                let t0 = loadu_128!((&top[base..base + 16]), [u8; 16]);
                let t1 = loadu_128!((&top[base + 1..base + 17]), [u8; 16]);

                let t0_w = _mm256_cvtepu8_epi16(t0);
                let t1_w = _mm256_cvtepu8_epi16(t1);

                let prod0 = _mm256_mullo_epi16(t0_w, inv_frac_vec);
                let prod1 = _mm256_mullo_epi16(t1_w, frac_vec);
                let sum = _mm256_add_epi16(_mm256_add_epi16(prod0, prod1), rounding);
                let result = _mm256_srai_epi16::<6>(sum);

                let packed = _mm256_packus_epi16(result, result);
                let lo = _mm256_castsi256_si128(packed);
                let hi = _mm256_extracti128_si256::<1>(packed);
                let combined = _mm_unpacklo_epi64(lo, hi);
                storeu_128!(
                    (&mut dst[row_off + x..row_off + x + 16]),
                    [u8; 16],
                    combined
                );

                x += 16;
            }
        }

        // Scalar remainder (also handles upsampled stride-2 access)
        while x < width {
            let base = base0 + base_inc * x;
            if base < max_base_x {
                let t0 = top[base] as i32;
                let t1 = top[base + 1] as i32;
                let v = t0 * inv_frac as i32 + t1 * frac as i32;
                dst[row_off + x] = ((v + 32) >> 6) as u8;
            } else {
                let fill_val = top[max_base_x];
                for xx in x..width {
                    dst[row_off + xx] = fill_val;
                }
                break;
            }
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z1_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_z1_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
        angle as i32,
    );
}

/// Z1 intra prediction AVX-512ICL (v4x) inner for 8bpc.
///
/// Identical edge preparation to [`ipred_z1_8bpc_inner`]; the per-row sample
/// blend keeps the (up to 128-byte) edge resident in two ZMM registers and
/// uses `vpermi2b` (`_mm512_permutex2var_epi8`) to gather the per-pixel base
/// samples instead of issuing per-chunk memory loads.
///
/// Bit-exactness: the fill case (`base >= max_base_x`) is reproduced by
/// clamping both gather indices to `max_base_x`. Since `inv_frac + frac == 64`,
/// the blend then yields `(top[max_base_x] * 64 + 32) >> 6 == top[max_base_x]`,
/// matching the scalar/AVX2 flat fill exactly.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z1_8bpc_v4x_inner(
    _token: X64V4xToken,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;

    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;

    let mut dx = dav1d_dr_intra_derivative[(angle >> 1) as usize] as i32;

    let upsample_above = enable_intra_edge_filter
        && (90 - angle) < 40
        && ((width_i + height_i) as usize) <= (16 >> is_sm as usize);

    let mut top_out = [0u8; 64 + 64];
    let (top, max_base_x, base_inc);

    if upsample_above {
        upsample_edge_8bpc(
            &mut top_out,
            width_i + height_i,
            topleft,
            tl_off + 1,
            -1,
            width_i + std::cmp::min(width_i, height_i),
        );
        dx <<= 1;
        top = top_out.as_slice();
        max_base_x = (2 * (width_i + height_i) - 2) as usize;
        base_inc = 2usize;
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, 90 - angle, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            filter_edge_8bpc(
                &mut top_out,
                width_i + height_i,
                0,
                width_i + height_i,
                topleft,
                tl_off + 1,
                -1,
                width_i + std::cmp::min(width_i, height_i),
                filter_strength,
            );
            top = top_out.as_slice();
            max_base_x = (width_i + height_i - 1) as usize;
        } else {
            top = &topleft[tl_off + 1..];
            max_base_x = width + std::cmp::min(width, height) - 1;
        }
        base_inc = 1;
    };

    // Copy the live edge region into a 128-byte register-resident buffer.
    // We only need samples up to `max_base_x` (inclusive); the tail is reached
    // only via clamped indices, so pad it with the fill value to make any
    // clamped lane correct.
    let edge_len = (max_base_x + 1).min(128);
    let mut ebuf = [0u8; 128];
    let top_f = top.flex();
    for i in 0..edge_len {
        ebuf[i] = top_f[i];
    }
    let fill_val = top_f[max_base_x.min(127)];
    for b in ebuf.iter_mut().skip(edge_len) {
        *b = fill_val;
    }

    let edge_lo = loadu_512!((&ebuf[0..64]), [u8; 64]);
    let edge_hi = loadu_512!((&ebuf[64..128]), [u8; 64]);

    // vpermi2b clamps indices to the low 7 bits (0..127), exactly our edge span.
    let max_idx8 = _mm512_set1_epi8((max_base_x.min(127)) as i8);
    let rounding = _mm512_set1_epi16(32);

    // Per-lane offset 0..63 for the 64-lane gather.
    let lane_off: [u8; 64] = core::array::from_fn(|i| i as u8);
    let lane_off_v = loadu_512!((&lane_off), [u8; 64]);
    let one8 = _mm512_set1_epi8(1);

    for y in 0..height_i {
        let xpos = (y + 1) * dx;
        let frac = (xpos & 0x3e) as i16;
        let inv_frac = (64 - frac) as i16;

        let frac_vec = _mm512_set1_epi16(frac);
        let inv_frac_vec = _mm512_set1_epi16(inv_frac);

        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let base0 = (xpos >> 6) as usize;

        if base_inc == 1 {
            let base0_v = _mm512_set1_epi8(base0.min(127) as i8);
            let mut x = 0usize;
            while x < width {
                // idx0[lane] = base0 + x + lane ; idx1 = idx0 + 1, both clamped.
                let xbase = _mm512_set1_epi8(x.min(127) as i8);
                let idx0 = _mm512_adds_epu8(_mm512_adds_epu8(base0_v, xbase), lane_off_v);
                let idx0 = _mm512_min_epu8(idx0, max_idx8);
                let idx1 = _mm512_min_epu8(_mm512_adds_epu8(idx0, one8), max_idx8);

                let t0 = _mm512_permutex2var_epi8(edge_lo, idx0, edge_hi);
                let t1 = _mm512_permutex2var_epi8(edge_lo, idx1, edge_hi);

                // Low 32 lanes -> i16 blend (the 32 pixels for this x window).
                let t0_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(t0));
                let t1_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(t1));
                let p0 = _mm512_mullo_epi16(t0_lo, inv_frac_vec);
                let p1 = _mm512_mullo_epi16(t1_lo, frac_vec);
                let sblend = _mm512_add_epi16(_mm512_add_epi16(p0, p1), rounding);
                let r = _mm512_srai_epi16::<6>(sblend);
                // Saturating unsigned narrow 32xu16 -> 32xu8, lane-order preserving.
                let out32 = _mm512_cvtusepi16_epi8(r);

                let n = (width - x).min(32);
                let mut tmp = [0u8; 32];
                storeu_256!((&mut tmp), [u8; 32], out32);
                dst[row_off + x..row_off + x + n].copy_from_slice(&tmp[..n]);
                x += 32;
            }
        } else {
            // Upsampled (base_inc == 2): scalar, matching the reference exactly.
            let mut x = 0usize;
            while x < width {
                let base = base0 + base_inc * x;
                if base < max_base_x {
                    let t0 = top_f[base] as i32;
                    let t1 = top_f[base + 1] as i32;
                    let v = t0 * inv_frac as i32 + t1 * frac as i32;
                    dst[row_off + x] = ((v + 32) >> 6) as u8;
                } else {
                    let fv = top_f[max_base_x];
                    for xx in x..width {
                        dst[row_off + xx] = fv;
                    }
                    break;
                }
                x += 1;
            }
        }
    }
}

/// Helper: get filter strength (simplified version)
#[inline]
fn get_filter_strength_simple(wh: i32, angle: i32, is_sm: bool) -> i32 {
    if is_sm {
        match (wh, angle) {
            (..=8, 64..) => 2,
            (..=8, 40..) => 1,
            (..=8, ..) => 0,
            (..=16, 48..) => 2,
            (..=16, 20..) => 1,
            (..=16, ..) => 0,
            (..=24, 4..) => 3,
            (..=24, ..) => 0,
            (..) => 3,
        }
    } else {
        match (wh, angle) {
            (..=8, 56..) => 1,
            (..=8, ..) => 0,
            (..=16, 40..) => 1,
            (..=16, ..) => 0,
            (..=24, 32..) => 3,
            (..=24, 16..) => 2,
            (..=24, 8..) => 1,
            (..=24, ..) => 0,
            (..=32, 32..) => 3,
            (..=32, 4..) => 2,
            (..=32, ..) => 1,
            (..) => 3,
        }
    }
}

// ============================================================================
// Z2 Prediction (angular prediction for angles 90-180)
// ============================================================================

/// Filter edge pixels for Z2 prediction (8bpc version of filter_edge from ipred.rs).
///
/// Z2 prediction: directional prediction using both top AND left edges (angles 90-180).
/// Unlike Z1 (top only) and Z3 (left only), Z2 blends between edges:
/// - When base_x >= 0: interpolate from top edge
/// - When base_x < 0: interpolate from left edge
fn filter_edge_8bpc(
    out: &mut [u8],
    sz: i32,
    lim_from: i32,
    lim_to: i32,
    inp: &[u8],
    in_off: usize,
    from: i32,
    to: i32,
    strength: i32,
) {
    static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
    let mut i = 0;
    while i < std::cmp::min(sz, lim_from) {
        out[i as usize] = inp[in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize)];
        i += 1;
    }
    while i < std::cmp::min(lim_to, sz) {
        let mut s = 0i32;
        for j in 0..5i32 {
            s += inp[in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize)] as i32
                * KERNEL[(strength - 1) as usize][j as usize] as i32;
        }
        out[i as usize] = ((s + 8) >> 4) as u8;
        i += 1;
    }
    while i < sz {
        out[i as usize] = inp[in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize)];
        i += 1;
    }
}

/// Upsample edge pixels for Z2 prediction (8bpc version of upsample_edge from ipred.rs).
fn upsample_edge_8bpc(out: &mut [u8], hsz: i32, inp: &[u8], in_off: usize, from: i32, to: i32) {
    let kernel: [i8; 4] = [-1, 9, 9, -1];
    for i in 0..hsz - 1 {
        out[(i * 2) as usize] = inp[in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize)];
        let mut s = 0i32;
        for j in 0..4i32 {
            s += inp[in_off.wrapping_add_signed((i + j - 1).clamp(from, to - 1) as isize)] as i32
                * kernel[j as usize] as i32;
        }
        out[(i * 2 + 1) as usize] = ((s + 8) >> 4).clamp(0, 255) as u8;
    }
    let i = hsz - 1;
    out[(i * 2) as usize] = inp[in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize)];
}

/// Z2 intra prediction SIMD inner for 8bpc.
/// Builds preprocessed edge array internally, handles all cases (filter/upsample/plain).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z2_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
    max_width: i32,
    max_height: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;

    // Extract angle flags
    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;

    let mut dy = dav1d_dr_intra_derivative[((angle - 90) >> 1) as usize] as i32;
    let mut dx = dav1d_dr_intra_derivative[((180 - angle) >> 1) as usize] as i32;

    // Determine upsampling
    let upsample_left = enable_intra_edge_filter
        && (180 - angle) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);
    let upsample_above = enable_intra_edge_filter
        && (angle - 90) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);

    // Build preprocessed edge array (same as scalar ipred_z2_rust)
    let mut edge = [0u8; 64 + 64 + 1];
    let edge_tl = 64usize;

    // Top edge preprocessing
    if upsample_above {
        upsample_edge_8bpc(
            &mut edge[edge_tl..],
            width_i + 1,
            topleft,
            tl_off,
            0,
            width_i + 1,
        );
        dx <<= 1;
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, angle - 90, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            filter_edge_8bpc(
                &mut edge[edge_tl + 1..],
                width_i,
                0,
                max_width,
                topleft,
                tl_off + 1,
                -1,
                width_i,
                filter_strength,
            );
        } else {
            edge[edge_tl + 1..edge_tl + 1 + width]
                .copy_from_slice(&topleft[tl_off + 1..tl_off + 1 + width]);
        }
    }

    // Left edge preprocessing
    if upsample_left {
        upsample_edge_8bpc(
            &mut edge[edge_tl - height * 2..],
            height_i + 1,
            topleft,
            tl_off.wrapping_sub(height),
            0,
            height_i + 1,
        );
        dy <<= 1;
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, 180 - angle, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            filter_edge_8bpc(
                &mut edge[edge_tl - height..],
                height_i,
                height_i - max_height,
                height_i,
                topleft,
                tl_off.wrapping_sub(height),
                0,
                height_i + 1,
                filter_strength,
            );
        } else {
            edge[edge_tl - height..edge_tl].copy_from_slice(&topleft[tl_off - height..tl_off]);
        }
    }

    // Corner pixel
    edge[edge_tl] = topleft[tl_off];

    let edge = edge.as_slice().flex();

    let base_inc_x = 1 + upsample_above as usize;
    let left = edge_tl - (1 + upsample_left as usize);

    let rounding = _mm256_set1_epi16(32);

    for y in 0..height_i {
        let xpos = ((1 + upsample_above as i32) << 6) - dx * (y + 1);
        let base_x0 = xpos >> 6;
        let frac_x = (xpos & 0x3e) as i16;
        let inv_frac_x = (64 - frac_x) as i16;

        let row_off = (dst_base as isize + y as isize * stride) as usize;

        // left_count = number of left-edge pixels (where base_x0 + base_inc_x * x < 0)
        let left_count = if base_x0 >= 0 {
            0usize
        } else {
            let needed = (-base_x0) as usize;
            needed.div_ceil(base_inc_x).min(width)
        };

        // First: process pixels using left edge (x < left_count)
        let mut x = 0usize;
        while x < left_count {
            let ypos = (y << (6 + upsample_left as i32)) - dy * (x as i32 + 1);
            let base_y = ypos >> 6;
            let frac_y = ypos & 0x3e;
            let inv_frac_y = 64 - frac_y;

            let l0_idx = left.wrapping_add_signed(-base_y as isize);
            let l1_idx = left.wrapping_add_signed(-(base_y + 1) as isize);
            let l0 = edge[l0_idx] as i32;
            let l1 = edge[l1_idx] as i32;
            let v = l0 * inv_frac_y + l1 * frac_y;
            dst[row_off + x] = ((v + 32) >> 6) as u8;
            x += 1;
        }

        // Then: process pixels using top edge (x >= left_count, base_x >= 0)
        if base_inc_x == 1 {
            while x + 16 <= width {
                let base_x = (base_x0 + x as i32) as usize;
                let idx = edge_tl + base_x;
                if idx + 17 > edge.len() {
                    break;
                }

                let t0 = loadu_128!((&edge[idx..idx + 16]), [u8; 16]);
                let t1 = loadu_128!((&edge[idx + 1..idx + 17]), [u8; 16]);

                let t0_w = _mm256_cvtepu8_epi16(t0);
                let t1_w = _mm256_cvtepu8_epi16(t1);

                let frac_vec = _mm256_set1_epi16(frac_x);
                let inv_frac_vec = _mm256_set1_epi16(inv_frac_x);

                let prod0 = _mm256_mullo_epi16(t0_w, inv_frac_vec);
                let prod1 = _mm256_mullo_epi16(t1_w, frac_vec);
                let sum = _mm256_add_epi16(_mm256_add_epi16(prod0, prod1), rounding);
                let result = _mm256_srai_epi16::<6>(sum);

                let packed = _mm256_packus_epi16(result, result);
                let lo = _mm256_castsi256_si128(packed);
                let hi = _mm256_extracti128_si256::<1>(packed);
                let combined = _mm_unpacklo_epi64(lo, hi);
                storeu_128!(
                    (&mut dst[row_off + x..row_off + x + 16]),
                    [u8; 16],
                    combined
                );

                x += 16;
            }
        }

        // Scalar remainder (also handles upsampled case with stride-2 edge access)
        while x < width {
            let base_x = (base_x0 + (base_inc_x * x) as i32) as usize;
            let idx = edge_tl + base_x;
            if idx + 2 > edge.len() {
                break;
            }
            let t0 = edge[idx] as i32;
            let t1 = edge[idx + 1] as i32;
            let v = t0 * inv_frac_x as i32 + t1 * frac_x as i32;
            dst[row_off + x] = ((v + 32) >> 6) as u8;
            x += 1;
        }
    }
}

/// Z2 intra prediction AVX-512ICL (v4x) inner for 8bpc.
///
/// z2 blends top and left edges. The left-edge portion (per-pixel varying
/// frac_y) and the scalar tail are kept identical to the reference; only the
/// ascending top-edge portion is upgraded to a 32-wide `vpermi2b` gather over a
/// register-resident copy of `edge[edge_tl..]`. Because every pixel uses the
/// same blend formula in either path, vectorizing the top portion up to the
/// point where both `edge[idx]` and `edge[idx+1]` are in range is byte-exact;
/// the scalar remainder (with the identical `idx + 2 > edge.len()` bound)
/// covers the rest. Non-unit `base_inc_x` (upsampled above) is handled by the
/// scalar remainder exactly as in the reference.
#[cfg(target_arch = "x86_64")]
#[arcane]
#[allow(clippy::too_many_arguments)]
fn ipred_z2_8bpc_v4x_inner(
    _token: X64V4xToken,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
    max_width: i32,
    max_height: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;

    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;

    let mut dy = dav1d_dr_intra_derivative[((angle - 90) >> 1) as usize] as i32;
    let mut dx = dav1d_dr_intra_derivative[((180 - angle) >> 1) as usize] as i32;

    let upsample_left = enable_intra_edge_filter
        && (180 - angle) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);
    let upsample_above = enable_intra_edge_filter
        && (angle - 90) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);

    let mut edge = [0u8; 64 + 64 + 1];
    let edge_tl = 64usize;

    if upsample_above {
        upsample_edge_8bpc(
            &mut edge[edge_tl..],
            width_i + 1,
            topleft,
            tl_off,
            0,
            width_i + 1,
        );
        dx <<= 1;
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, angle - 90, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            filter_edge_8bpc(
                &mut edge[edge_tl + 1..],
                width_i,
                0,
                max_width,
                topleft,
                tl_off + 1,
                -1,
                width_i,
                filter_strength,
            );
        } else {
            edge[edge_tl + 1..edge_tl + 1 + width]
                .copy_from_slice(&topleft[tl_off + 1..tl_off + 1 + width]);
        }
    }

    if upsample_left {
        upsample_edge_8bpc(
            &mut edge[edge_tl - height * 2..],
            height_i + 1,
            topleft,
            tl_off.wrapping_sub(height),
            0,
            height_i + 1,
        );
        dy <<= 1;
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, 180 - angle, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            filter_edge_8bpc(
                &mut edge[edge_tl - height..],
                height_i,
                height_i - max_height,
                height_i,
                topleft,
                tl_off.wrapping_sub(height),
                0,
                height_i + 1,
                filter_strength,
            );
        } else {
            edge[edge_tl - height..edge_tl].copy_from_slice(&topleft[tl_off - height..tl_off]);
        }
    }

    edge[edge_tl] = topleft[tl_off];

    let edge_len = edge.len();
    let edge = edge.as_slice().flex();

    let base_inc_x = 1 + upsample_above as usize;
    let left = edge_tl - (1 + upsample_left as usize);

    // Register-resident copy of the ascending top edge: tbuf[k] = edge[edge_tl+k].
    // The top portion reaches at most edge index edge_len-1, so k spans
    // 0..=(edge_len-1-edge_tl) which is <= 64; pad the rest so clamped lanes
    // are harmless (they are never stored — bounded by `top_k_max`).
    let top_k_max = edge_len - 1 - edge_tl; // = 64
    let mut tbuf = [0u8; 128];
    for k in 0..=top_k_max {
        tbuf[k] = edge[edge_tl + k];
    }
    let top_lo = loadu_512!((&tbuf[0..64]), [u8; 64]);
    let top_hi = loadu_512!((&tbuf[64..128]), [u8; 64]);
    let lane_off: [u8; 64] = core::array::from_fn(|i| i as u8);
    let lane_off_v = loadu_512!((&lane_off), [u8; 64]);
    let one8 = _mm512_set1_epi8(1);
    let rounding512 = _mm512_set1_epi16(32);

    for y in 0..height_i {
        let xpos = ((1 + upsample_above as i32) << 6) - dx * (y + 1);
        let base_x0 = xpos >> 6;
        let frac_x = (xpos & 0x3e) as i16;
        let inv_frac_x = (64 - frac_x) as i16;

        let row_off = (dst_base as isize + y as isize * stride) as usize;

        let left_count = if base_x0 >= 0 {
            0usize
        } else {
            let needed = (-base_x0) as usize;
            needed.div_ceil(base_inc_x).min(width)
        };

        // Left-edge portion — identical to the reference (scalar).
        let mut x = 0usize;
        while x < left_count {
            let ypos = (y << (6 + upsample_left as i32)) - dy * (x as i32 + 1);
            let base_y = ypos >> 6;
            let frac_y = ypos & 0x3e;
            let inv_frac_y = 64 - frac_y;

            let l0_idx = left.wrapping_add_signed(-base_y as isize);
            let l1_idx = left.wrapping_add_signed(-(base_y + 1) as isize);
            let l0 = edge[l0_idx] as i32;
            let l1 = edge[l1_idx] as i32;
            let v = l0 * inv_frac_y + l1 * frac_y;
            dst[row_off + x] = ((v + 32) >> 6) as u8;
            x += 1;
        }

        // Top-edge portion — 32-wide vpermi2b gather (only for base_inc_x == 1).
        if base_inc_x == 1 {
            let frac_vec = _mm512_set1_epi16(frac_x);
            let inv_frac_vec = _mm512_set1_epi16(inv_frac_x);
            // Vectorize while a full 32-lane window stays in range:
            //   idx       = edge_tl + base_x0 + x + lane         (need <= edge_len-1)
            //   idx + 1   <= edge_len-1  =>  base_x0 + x + 31 + 1 <= top_k_max
            while x + 32 <= width {
                let base_x = base_x0 + x as i32; // >= 0 here (x >= left_count)
                // last lane uses k = base_x + 31, and reads k+1; require k+1 <= top_k_max
                if (base_x as usize) + 31 + 1 > top_k_max {
                    break;
                }
                let k0 = _mm512_set1_epi8((base_x as usize).min(127) as i8);
                let idx0 = _mm512_adds_epu8(k0, lane_off_v);
                let idx1 = _mm512_adds_epu8(idx0, one8);

                let t0 = _mm512_permutex2var_epi8(top_lo, idx0, top_hi);
                let t1 = _mm512_permutex2var_epi8(top_lo, idx1, top_hi);

                let t0_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(t0));
                let t1_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(t1));
                let p0 = _mm512_mullo_epi16(t0_lo, inv_frac_vec);
                let p1 = _mm512_mullo_epi16(t1_lo, frac_vec);
                let sblend = _mm512_add_epi16(_mm512_add_epi16(p0, p1), rounding512);
                let r = _mm512_srai_epi16::<6>(sblend);
                let out32 = _mm512_cvtusepi16_epi8(r);

                let mut tmp = [0u8; 32];
                storeu_256!((&mut tmp), [u8; 32], out32);
                dst[row_off + x..row_off + x + 32].copy_from_slice(&tmp);
                x += 32;
            }
        }

        // Scalar remainder — identical to the reference (covers tail, the
        // 16<chunk<32 leftover, and the upsampled base_inc_x == 2 case).
        while x < width {
            let base_x = (base_x0 + (base_inc_x * x) as i32) as usize;
            let idx = edge_tl + base_x;
            if idx + 2 > edge_len {
                break;
            }
            let t0 = edge[idx] as i32;
            let t1 = edge[idx + 1] as i32;
            let v = t0 * inv_frac_x as i32 + t1 * frac_x as i32;
            dst[row_off + x] = ((v + 32) >> 6) as u8;
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z2_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    max_width: c_int,
    max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_z2_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
        angle as i32,
        max_width as i32,
        max_height as i32,
    );
}

// ============================================================================
// Z3 Prediction (angular prediction for angles > 180)
// ============================================================================

/// Z3 prediction: directional prediction using left edge only (angles > 180°)
///
/// Z3 is the mirror of Z1, using the left edge instead of top.
/// Builds preprocessed left edge array internally, handles all cases.
/// Loop order is column-major (outer x, inner y).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z3_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;

    // Extract angle flags
    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;

    let mut dy = dav1d_dr_intra_derivative[((270 - angle) >> 1) as usize] as usize;

    let upsample_left = enable_intra_edge_filter
        && (angle - 180) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);

    // Build preprocessed left edge array
    // Scalar uses: left[left_off - base] to index from corner outward
    let mut left_out = [0u8; 64 + 64];
    let (left, left_off, max_base_y, base_inc);

    if upsample_left {
        upsample_edge_8bpc(
            &mut left_out,
            width_i + height_i,
            topleft,
            tl_off - (width + height),
            std::cmp::max(width_i - height_i, 0),
            width_i + height_i + 1,
        );
        left_off = (2 * (width_i + height_i) - 2) as usize;
        max_base_y = left_off;
        dy <<= 1;
        base_inc = 2usize;
        left = left_out.as_slice();
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, angle - 180, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            filter_edge_8bpc(
                &mut left_out,
                width_i + height_i,
                0,
                width_i + height_i,
                topleft,
                tl_off - (width + height),
                std::cmp::max(width_i - height_i, 0),
                width_i + height_i + 1,
                filter_strength,
            );
            left_off = (width_i + height_i - 1) as usize;
            max_base_y = left_off;
            left = left_out.as_slice();
        } else {
            // No preprocessing — use topleft directly
            left = topleft;
            left_off = tl_off - 1;
            max_base_y = height + std::cmp::min(width, height) - 1;
        }
        base_inc = 1;
    };

    let left = left.flex();

    // Column-major access pattern
    for x in 0..width {
        let ypos = dy * (x + 1);
        let frac = (ypos & 0x3e) as i32;
        let inv_frac = 64 - frac;

        for y in 0..height_i {
            let base = (ypos >> 6) + base_inc * y as usize;

            if base < max_base_y {
                let l0 = left[left_off - base] as i32;
                let l1 = left[left_off - base - 1] as i32;
                let v = l0 * inv_frac + l1 * frac;
                let pixel_off = (dst_base as isize + y as isize * stride) as usize + x;
                dst[pixel_off] = ((v + 32) >> 6) as u8;
            } else {
                let fill_val = left[left_off - max_base_y];
                for yy in y..height_i {
                    let pixel_off = (dst_base as isize + yy as isize * stride) as usize + x;
                    dst[pixel_off] = fill_val;
                }
                break;
            }
        }
    }
}

/// Z3 intra prediction AVX-512ICL (v4x) inner for 8bpc.
///
/// The reference (`ipred_z3_8bpc_inner`) is fully scalar: column-major, with a
/// *descending* edge access `left[left_off - base]`. This kernel reverses the
/// live edge into an ascending, register-resident buffer (`lbuf[k] =
/// left[left_off - k]`) so the per-column blend can gather samples with
/// `vpermi2b` (`_mm512_permutex2var_epi8`) over 32 y-values at once, then
/// narrows with `_mm512_cvtusepi16_epi8`. dst stores stay strided (one byte per
/// y), matching the column-major output layout.
///
/// Bit-exactness: the flat-fill case (`base >= max_base_y`, which writes
/// `left[left_off - max_base_y]`) is reproduced by padding `lbuf[k > max_base_y]
/// = left[left_off - max_base_y]` and clamping *both* gather indices to
/// `max_base_y`. With `inv_frac + frac == 64` the blend then yields the fill
/// value exactly, matching the reference.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z3_8bpc_v4x_inner(
    _token: X64V4xToken,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;

    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;

    let mut dy = dav1d_dr_intra_derivative[((270 - angle) >> 1) as usize] as usize;

    let upsample_left = enable_intra_edge_filter
        && (angle - 180) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);

    let mut left_out = [0u8; 64 + 64];
    let (left, left_off, max_base_y, base_inc);

    if upsample_left {
        upsample_edge_8bpc(
            &mut left_out,
            width_i + height_i,
            topleft,
            tl_off - (width + height),
            std::cmp::max(width_i - height_i, 0),
            width_i + height_i + 1,
        );
        left_off = (2 * (width_i + height_i) - 2) as usize;
        max_base_y = left_off;
        dy <<= 1;
        base_inc = 2usize;
        left = left_out.as_slice();
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, angle - 180, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            filter_edge_8bpc(
                &mut left_out,
                width_i + height_i,
                0,
                width_i + height_i,
                topleft,
                tl_off - (width + height),
                std::cmp::max(width_i - height_i, 0),
                width_i + height_i + 1,
                filter_strength,
            );
            left_off = (width_i + height_i - 1) as usize;
            max_base_y = left_off;
            left = left_out.as_slice();
        } else {
            left = topleft;
            left_off = tl_off - 1;
            max_base_y = height + std::cmp::min(width, height) - 1;
        }
        base_inc = 1;
    };

    let left_f = left.flex();

    // Reverse the live edge into an ascending, register-resident buffer:
    // lbuf[k] = left[left_off - k] for k in 0..=max_base_y; pad the tail with
    // the fill value (lbuf[max_base_y]) so clamped lanes read the fill exactly.
    let last = max_base_y.min(127);
    let mut lbuf = [0u8; 128];
    for k in 0..=last {
        lbuf[k] = left_f[left_off - k];
    }
    let fill_val = lbuf[last];
    for b in lbuf.iter_mut().skip(last + 1) {
        *b = fill_val;
    }

    let edge_lo = loadu_512!((&lbuf[0..64]), [u8; 64]);
    let edge_hi = loadu_512!((&lbuf[64..128]), [u8; 64]);

    let max_idx8 = _mm512_set1_epi8(last as i8);
    let rounding = _mm512_set1_epi16(32);
    let lane_off: [u8; 64] = core::array::from_fn(|i| i as u8);
    let lane_off_v = loadu_512!((&lane_off), [u8; 64]);
    let one8 = _mm512_set1_epi8(1);

    if base_inc == 1 {
        for x in 0..width {
            let ypos = dy * (x + 1);
            let frac = (ypos & 0x3e) as i16;
            let inv_frac = (64 - frac) as i16;
            let frac_vec = _mm512_set1_epi16(frac);
            let inv_frac_vec = _mm512_set1_epi16(inv_frac);
            let base0 = ypos >> 6;
            let base0_v = _mm512_set1_epi8(base0.min(127) as i8);

            let mut y = 0usize;
            while y < height {
                // idx0[lane] = base0 + y + lane ; idx1 = idx0 + 1, both clamped.
                let ybase = _mm512_set1_epi8(y.min(127) as i8);
                let idx0 = _mm512_adds_epu8(_mm512_adds_epu8(base0_v, ybase), lane_off_v);
                let idx0 = _mm512_min_epu8(idx0, max_idx8);
                let idx1 = _mm512_min_epu8(_mm512_adds_epu8(idx0, one8), max_idx8);

                let l0 = _mm512_permutex2var_epi8(edge_lo, idx0, edge_hi);
                let l1 = _mm512_permutex2var_epi8(edge_lo, idx1, edge_hi);

                let l0_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(l0));
                let l1_lo = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(l1));
                let p0 = _mm512_mullo_epi16(l0_lo, inv_frac_vec);
                let p1 = _mm512_mullo_epi16(l1_lo, frac_vec);
                let sblend = _mm512_add_epi16(_mm512_add_epi16(p0, p1), rounding);
                let r = _mm512_srai_epi16::<6>(sblend);
                let out32 = _mm512_cvtusepi16_epi8(r);

                let n = (height - y).min(32);
                let mut tmp = [0u8; 32];
                storeu_256!((&mut tmp), [u8; 32], out32);
                // Strided byte store, matching column-major output.
                for k in 0..n {
                    let off = (dst_base as isize + (y + k) as isize * stride) as usize + x;
                    dst[off] = tmp[k];
                }
                y += 32;
            }
        }
    } else {
        // Upsampled (base_inc == 2): scalar, matching the reference exactly.
        for x in 0..width {
            let ypos = dy * (x + 1);
            let frac = (ypos & 0x3e) as i32;
            let inv_frac = 64 - frac;
            for y in 0..height_i {
                let base = (ypos >> 6) + base_inc * y as usize;
                if base < max_base_y {
                    let l0 = left_f[left_off - base] as i32;
                    let l1 = left_f[left_off - base - 1] as i32;
                    let v = l0 * inv_frac + l1 * frac;
                    let pixel_off = (dst_base as isize + y as isize * stride) as usize + x;
                    dst[pixel_off] = ((v + 32) >> 6) as u8;
                } else {
                    let fv = left_f[left_off - max_base_y];
                    for yy in y..height_i {
                        let pixel_off = (dst_base as isize + yy as isize * stride) as usize + x;
                        dst[pixel_off] = fv;
                    }
                    break;
                }
            }
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z3_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) =
        compute_topleft_slice(topleft as *const u8, width as usize, height as usize);
    ipred_z3_8bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
        angle as i32,
    );
}

/// Compute a conservative buffer length for ipred dst buffers.
#[cfg(target_arch = "x86_64")]
fn compute_ipred_buf_len(stride: isize, width: usize, height: usize) -> usize {
    height.saturating_sub(1) * stride.unsigned_abs() + width
}

/// Construct a topleft slice + offset from a raw pointer.
///
/// The topleft pointer points to the center pixel of a scratch edge buffer.
/// We need both positive offsets (top row) and negative offsets (left column).
/// Returns (slice, offset_of_center_in_slice).
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
unsafe fn compute_topleft_slice<'a>(
    tl_ptr: *const u8,
    width: usize,
    height: usize,
) -> (&'a [u8], usize) {
    // Conservative bounds: need up to height+2 below and width+height+2 above
    let neg_reach = height + 2;
    let pos_reach = width + height + 2;
    let total = neg_reach + pos_reach;
    let base = unsafe { tl_ptr.sub(neg_reach) };
    (
        unsafe { std::slice::from_raw_parts(base, total) },
        neg_reach,
    )
}

// ============================================================================
// 16bpc IMPLEMENTATIONS
// ============================================================================

/// DC_128 prediction for 16bpc: fill block with mid-value
///
/// For 10bpc: fill with 512 (1 << 9)
/// For 12bpc: fill with 2048 (1 << 11)
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    // Mid-value is (bitdepth_max + 1) / 2
    let mid_val = ((bitdepth_max + 1) / 2) as u16;
    let fill_val = _mm256_set1_epi16(mid_val as i16);
    let width_bytes = width * 2;

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let mut x = 0usize;

        // Process 16 pixels at a time (256-bit / 16-bit = 16 pixels)
        while x + 16 <= width {
            let off = row_off + x * 2;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_val);
            x += 16;
        }

        // Process 8 pixels at a time
        while x + 8 <= width {
            let off = row_off + x * 2;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_val)
            );
            x += 8;
        }

        // Remaining pixels
        while x < width {
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&mid_val.to_ne_bytes());
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_128_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    _topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    ipred_dc_128_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        width as usize,
        height as usize,
        bitdepth_max as i32,
    );
}

/// Vertical prediction for 16bpc: copy top row to all rows
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Top pixels start at topleft + 1 pixel = tl_off + 2 bytes
    let top_off = tl_off + 2;

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let mut x = 0usize;

        // Process 16 pixels at a time
        while x + 16 <= width {
            let load_off = top_off + x * 2;
            let top_vals = loadu_256!((&topleft[load_off..load_off + 32]), [u8; 32]);
            let store_off = row_off + x * 2;
            storeu_256!((&mut dst[store_off..store_off + 32]), [u8; 32], top_vals);
            x += 16;
        }

        // Process 8 pixels at a time
        while x + 8 <= width {
            let load_off = top_off + x * 2;
            let top_vals = loadu_128!((&topleft[load_off..load_off + 16]), [u8; 16]);
            let store_off = row_off + x * 2;
            storeu_128!((&mut dst[store_off..store_off + 16]), [u8; 16], top_vals);
            x += 8;
        }

        // Remaining pixels
        while x < width {
            let load_off = top_off + x * 2;
            let store_off = row_off + x * 2;
            dst[store_off..store_off + 2].copy_from_slice(&topleft[load_off..load_off + 2]);
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_v_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_v_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// Horizontal prediction for 16bpc: fill each row with its left pixel
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        // Left pixel for this row: topleft[-(y+1)] in u16 units = tl_off - (y+1)*2 in bytes
        let left_byte_off = tl_off - (y + 1) * 2;
        let left_val = u16::from_ne_bytes(
            topleft[left_byte_off..left_byte_off + 2]
                .try_into()
                .unwrap(),
        );
        let fill_val = _mm256_set1_epi16(left_val as i16);

        let mut x = 0usize;

        // Process 16 pixels at a time
        while x + 16 <= width {
            let off = row_off + x * 2;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_val);
            x += 16;
        }

        // Process 8 pixels at a time
        while x + 8 <= width {
            let off = row_off + x * 2;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_val)
            );
            x += 8;
        }

        // Remaining pixels
        while x < width {
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&left_val.to_ne_bytes());
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_h_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_h_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// DC_128 prediction for 16bpc using AVX-512
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_128_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    width: usize,
    height: usize,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let mid_val = ((bitdepth_max + 1) / 2) as u16;
    let fill_512 = _mm512_set1_epi16(mid_val as i16);
    let fill_256 = _mm256_set1_epi16(mid_val as i16);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let mut x = 0usize;

        // 32 pixels at a time (512-bit / 16-bit = 32 pixels)
        while x + 32 <= width {
            let off = row_off + x * 2;
            storeu_512!((&mut dst[off..off + 64]), [u8; 64], fill_512);
            x += 32;
        }
        // 16 pixels at a time
        while x + 16 <= width {
            let off = row_off + x * 2;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
            x += 16;
        }
        // 8 pixels at a time
        while x + 8 <= width {
            let off = row_off + x * 2;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            x += 8;
        }
        while x < width {
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&mid_val.to_ne_bytes());
            x += 1;
        }
    }
}

/// Vertical prediction for 16bpc using AVX-512
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_v_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let top_off = tl_off + 2; // +1 pixel = +2 bytes for u16

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let mut x = 0usize;

        // 32 pixels at a time (64 bytes)
        while x + 32 <= width {
            let load_off = top_off + x * 2;
            let top_vals = loadu_512!((&topleft[load_off..load_off + 64]), [u8; 64]);
            let store_off = row_off + x * 2;
            storeu_512!((&mut dst[store_off..store_off + 64]), [u8; 64], top_vals);
            x += 32;
        }
        // 16 pixels at a time (32 bytes)
        while x + 16 <= width {
            let load_off = top_off + x * 2;
            let top_vals = loadu_256!((&topleft[load_off..load_off + 32]), [u8; 32]);
            let store_off = row_off + x * 2;
            storeu_256!((&mut dst[store_off..store_off + 32]), [u8; 32], top_vals);
            x += 16;
        }
        // 8 pixels at a time
        while x + 8 <= width {
            let load_off = top_off + x * 2;
            let top_vals = loadu_128!((&topleft[load_off..load_off + 16]), [u8; 16]);
            let store_off = row_off + x * 2;
            storeu_128!((&mut dst[store_off..store_off + 16]), [u8; 16], top_vals);
            x += 8;
        }
        while x < width {
            let load_off = top_off + x * 2;
            let store_off = row_off + x * 2;
            dst[store_off..store_off + 2].copy_from_slice(&topleft[load_off..load_off + 2]);
            x += 1;
        }
    }
}

/// Horizontal prediction for 16bpc using AVX-512
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_h_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_byte_off = tl_off - (y + 1) * 2;
        let left_val = u16::from_ne_bytes(
            topleft[left_byte_off..left_byte_off + 2]
                .try_into()
                .unwrap(),
        );
        let fill_512 = _mm512_set1_epi16(left_val as i16);
        let fill_256 = _mm256_set1_epi16(left_val as i16);

        let mut x = 0usize;
        while x + 32 <= width {
            let off = row_off + x * 2;
            storeu_512!((&mut dst[off..off + 64]), [u8; 64], fill_512);
            x += 32;
        }
        while x + 16 <= width {
            let off = row_off + x * 2;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
            x += 16;
        }
        while x + 8 <= width {
            let off = row_off + x * 2;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            x += 8;
        }
        while x < width {
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&left_val.to_ne_bytes());
            x += 1;
        }
    }
}

// ============================================================================
// DC Prediction AVX-512 variants (16bpc)
// ============================================================================

/// DC prediction for 16bpc using AVX-512
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let mut sum = 0u32;
    for i in 1..=width {
        let off = tl_off + i * 2;
        sum += u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32;
    }
    for i in 1..=height {
        let off = tl_off - i * 2;
        sum += u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32;
    }
    let count = (width + height) as u32;
    let avg = ((sum + count / 2) / count) as u16;

    let fill_512 = _mm512_set1_epi16(avg as i16);
    let fill_256 = _mm256_set1_epi16(avg as i16);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let mut x = 0usize;
        while x + 32 <= width {
            let off = row_off + x * 2;
            storeu_512!((&mut dst[off..off + 64]), [u8; 64], fill_512);
            x += 32;
        }
        while x + 16 <= width {
            let off = row_off + x * 2;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
            x += 16;
        }
        while x + 8 <= width {
            let off = row_off + x * 2;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            x += 8;
        }
        while x < width {
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&avg.to_ne_bytes());
            x += 1;
        }
    }
}

/// DC_TOP prediction for 16bpc using AVX-512
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let mut sum = 0u32;
    for i in 1..=width {
        let off = tl_off + i * 2;
        sum += u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32;
    }
    let avg = ((sum + width as u32 / 2) / width as u32) as u16;

    let fill_512 = _mm512_set1_epi16(avg as i16);
    let fill_256 = _mm256_set1_epi16(avg as i16);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let mut x = 0usize;
        while x + 32 <= width {
            let off = row_off + x * 2;
            storeu_512!((&mut dst[off..off + 64]), [u8; 64], fill_512);
            x += 32;
        }
        while x + 16 <= width {
            let off = row_off + x * 2;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
            x += 16;
        }
        while x + 8 <= width {
            let off = row_off + x * 2;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            x += 8;
        }
        while x < width {
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&avg.to_ne_bytes());
            x += 1;
        }
    }
}

/// DC_LEFT prediction for 16bpc using AVX-512
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let mut sum = 0u32;
    for i in 1..=height {
        let off = tl_off - i * 2;
        sum += u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32;
    }
    let avg = ((sum + height as u32 / 2) / height as u32) as u16;

    let fill_512 = _mm512_set1_epi16(avg as i16);
    let fill_256 = _mm256_set1_epi16(avg as i16);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let mut x = 0usize;
        while x + 32 <= width {
            let off = row_off + x * 2;
            storeu_512!((&mut dst[off..off + 64]), [u8; 64], fill_512);
            x += 32;
        }
        while x + 16 <= width {
            let off = row_off + x * 2;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_256);
            x += 16;
        }
        while x + 8 <= width {
            let off = row_off + x * 2;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_256)
            );
            x += 8;
        }
        while x < width {
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&avg.to_ne_bytes());
            x += 1;
        }
    }
}

/// DC prediction for 16bpc: average of top and left edge pixels
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Calculate average of top row and left column
    let mut sum = 0u32;

    // Sum top row: tl[1..=width] in pixel units = tl_off + 2..tl_off + 2 + width*2 in bytes
    for i in 1..=width {
        let off = tl_off + i * 2;
        sum += u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32;
    }

    // Sum left column: tl[-1..-height] in pixel units
    for i in 1..=height {
        let off = tl_off - i * 2;
        sum += u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32;
    }

    // Average with rounding
    let count = (width + height) as u32;
    let avg = ((sum + count / 2) / count) as u16;

    let fill_val = _mm256_set1_epi16(avg as i16);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let mut x = 0usize;

        while x + 16 <= width {
            let off = row_off + x * 2;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_val);
            x += 16;
        }

        while x + 8 <= width {
            let off = row_off + x * 2;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_val)
            );
            x += 8;
        }

        while x < width {
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&avg.to_ne_bytes());
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_dc_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// DC_TOP prediction for 16bpc: average of top edge pixels
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_top_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Calculate average of top row
    let mut sum = 0u32;
    for i in 1..=width {
        let off = tl_off + i * 2;
        sum += u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32;
    }
    let avg = ((sum + width as u32 / 2) / width as u32) as u16;

    let fill_val = _mm256_set1_epi16(avg as i16);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let mut x = 0usize;

        while x + 16 <= width {
            let off = row_off + x * 2;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_val);
            x += 16;
        }

        while x + 8 <= width {
            let off = row_off + x * 2;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_val)
            );
            x += 8;
        }

        while x < width {
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&avg.to_ne_bytes());
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_top_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_dc_top_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// DC_LEFT prediction for 16bpc: average of left edge pixels
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_dc_left_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    // Calculate average of left column
    let mut sum = 0u32;
    for i in 1..=height {
        let off = tl_off - i * 2;
        sum += u16::from_ne_bytes(topleft[off..off + 2].try_into().unwrap()) as u32;
    }
    let avg = ((sum + height as u32 / 2) / height as u32) as u16;

    let fill_val = _mm256_set1_epi16(avg as i16);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let mut x = 0usize;

        while x + 16 <= width {
            let off = row_off + x * 2;
            storeu_256!((&mut dst[off..off + 32]), [u8; 32], fill_val);
            x += 16;
        }

        while x + 8 <= width {
            let off = row_off + x * 2;
            storeu_128!(
                (&mut dst[off..off + 16]),
                [u8; 16],
                _mm256_castsi256_si128(fill_val)
            );
            x += 8;
        }

        while x < width {
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&avg.to_ne_bytes());
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_dc_left_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_dc_left_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

// ============================================================================
// PAETH/SMOOTH 16bpc AVX-512
// ============================================================================

/// PAETH prediction 16bpc using AVX-512 — 16 pixels/iter with mask blending.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let topleft_val = u16::from_ne_bytes(topleft[tl_off..tl_off + 2].try_into().unwrap()) as i32;
    let topleft_vec = _mm512_set1_epi32(topleft_val);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_byte_off = tl_off - (y + 1) * 2;
        let left_val = u16::from_ne_bytes(
            topleft[left_byte_off..left_byte_off + 2]
                .try_into()
                .unwrap(),
        ) as i32;
        let left_vec = _mm512_set1_epi32(left_val);

        let mut x = 0;
        while x + 16 <= width {
            let top_byte_off = tl_off + (x + 1) * 2;
            let top_u16 = loadu_256!(&topleft[top_byte_off..top_byte_off + 32], [u8; 32]);
            let top = _mm512_cvtepu16_epi32(top_u16);

            let base = _mm512_sub_epi32(_mm512_add_epi32(left_vec, top), topleft_vec);
            let ldiff = _mm512_abs_epi32(_mm512_sub_epi32(left_vec, base));
            let tdiff = _mm512_abs_epi32(_mm512_sub_epi32(top, base));
            let tldiff = _mm512_abs_epi32(_mm512_sub_epi32(topleft_vec, base));

            let ld_le_td = !_mm512_cmpgt_epi32_mask(ldiff, tdiff);
            let ld_le_tld = !_mm512_cmpgt_epi32_mask(ldiff, tldiff);
            let td_le_tld = !_mm512_cmpgt_epi32_mask(tdiff, tldiff);

            let use_left = ld_le_td & ld_le_tld;
            let use_top = !use_left & td_le_tld;

            let result = _mm512_mask_blend_epi32(
                use_left,
                _mm512_mask_blend_epi32(use_top, topleft_vec, top),
                left_vec,
            );

            // Pack i32→u16 (values are 0..bitdepth_max, unsigned saturation is fine)
            let clamped = _mm512_max_epi32(result, _mm512_setzero_si512());
            let result_u16: __m256i = _mm512_cvtusepi32_epi16(clamped);
            let off = row_off + x * 2;
            storeu_256!(&mut dst[off..off + 32], [u8; 32], result_u16);

            x += 16;
        }

        // Scalar fallback
        while x < width {
            let top_byte_off = tl_off + (x + 1) * 2;
            let top_val =
                u16::from_ne_bytes(topleft[top_byte_off..top_byte_off + 2].try_into().unwrap())
                    as i32;
            let base = left_val + top_val - topleft_val;
            let l_diff = (left_val - base).abs();
            let t_diff = (top_val - base).abs();
            let tl_diff = (topleft_val - base).abs();
            let pred = if l_diff <= t_diff && l_diff <= tl_diff {
                left_val
            } else if t_diff <= tl_diff {
                top_val
            } else {
                topleft_val
            };
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(pred as u16).to_ne_bytes());
            x += 1;
        }
    }
}

/// SMOOTH prediction 16bpc using AVX-512 — 16 pixels/iter.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let right_off = tl_off + width * 2;
    let right_val =
        u16::from_ne_bytes(topleft[right_off..right_off + 2].try_into().unwrap()) as i32;
    let bottom_off = tl_off - height * 2;
    let bottom_val =
        u16::from_ne_bytes(topleft[bottom_off..bottom_off + 2].try_into().unwrap()) as i32;
    let right_vec = _mm512_set1_epi32(right_val);
    let bottom_vec = _mm512_set1_epi32(bottom_val);
    let rounding = _mm512_set1_epi32(256);
    let c256 = _mm512_set1_epi32(256);
    let zero_512 = _mm512_setzero_si512();

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_byte_off = tl_off - (y + 1) * 2;
        let left_val = u16::from_ne_bytes(
            topleft[left_byte_off..left_byte_off + 2]
                .try_into()
                .unwrap(),
        ) as i32;
        let left_vec = _mm512_set1_epi32(left_val);
        let w_v = weights_ver[y] as i32;
        let w_v_vec = _mm512_set1_epi32(w_v);
        let w_v_inv = _mm512_sub_epi32(c256, w_v_vec);

        let mut x = 0;
        while x + 16 <= width {
            let top_byte_off = tl_off + (x + 1) * 2;
            let top_u16 = loadu_256!(&topleft[top_byte_off..top_byte_off + 32], [u8; 32]);
            let top = _mm512_cvtepu16_epi32(top_u16);

            let wh_bytes = loadu_128!(&weights_hor[x..x + 16], [u8; 16]);
            let w_h = _mm512_cvtepu8_epi32(wh_bytes);
            let w_h_inv = _mm512_sub_epi32(c256, w_h);

            let vert = _mm512_add_epi32(
                _mm512_mullo_epi32(w_v_vec, top),
                _mm512_mullo_epi32(w_v_inv, bottom_vec),
            );
            let hor = _mm512_add_epi32(
                _mm512_mullo_epi32(w_h, left_vec),
                _mm512_mullo_epi32(w_h_inv, right_vec),
            );

            let pred = _mm512_add_epi32(vert, hor);
            let result = _mm512_srai_epi32::<9>(_mm512_add_epi32(pred, rounding));

            let clamped = _mm512_max_epi32(result, zero_512);
            let result_u16: __m256i = _mm512_cvtusepi32_epi16(clamped);
            let off = row_off + x * 2;
            storeu_256!(&mut dst[off..off + 32], [u8; 32], result_u16);

            x += 16;
        }

        while x < width {
            let top_byte_off = tl_off + (1 + x) * 2;
            let top_val =
                u16::from_ne_bytes(topleft[top_byte_off..top_byte_off + 2].try_into().unwrap())
                    as i32;
            let w_h = weights_hor[x] as i32;
            let pred =
                w_v * top_val + (256 - w_v) * bottom_val + w_h * left_val + (256 - w_h) * right_val;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(((pred + 256) >> 9) as u16).to_ne_bytes());
            x += 1;
        }
    }
}

/// SMOOTH_V prediction 16bpc using AVX-512 — 16 pixels/iter.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let bottom_off = tl_off - height * 2;
    let bottom_val =
        u16::from_ne_bytes(topleft[bottom_off..bottom_off + 2].try_into().unwrap()) as i32;
    let bottom_vec = _mm512_set1_epi32(bottom_val);
    let rounding = _mm512_set1_epi32(128);
    let c256 = _mm512_set1_epi32(256);
    let zero_512 = _mm512_setzero_si512();

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let w_v = weights_ver[y] as i32;
        let w_v_vec = _mm512_set1_epi32(w_v);
        let w_v_inv = _mm512_sub_epi32(c256, w_v_vec);

        let mut x = 0;
        while x + 16 <= width {
            let top_byte_off = tl_off + (x + 1) * 2;
            let top_u16 = loadu_256!(&topleft[top_byte_off..top_byte_off + 32], [u8; 32]);
            let top = _mm512_cvtepu16_epi32(top_u16);

            let pred = _mm512_add_epi32(
                _mm512_mullo_epi32(w_v_vec, top),
                _mm512_mullo_epi32(w_v_inv, bottom_vec),
            );
            let result = _mm512_srai_epi32::<8>(_mm512_add_epi32(pred, rounding));

            let clamped = _mm512_max_epi32(result, zero_512);
            let result_u16: __m256i = _mm512_cvtusepi32_epi16(clamped);
            let off = row_off + x * 2;
            storeu_256!(&mut dst[off..off + 32], [u8; 32], result_u16);

            x += 16;
        }

        while x < width {
            let top_byte_off = tl_off + (1 + x) * 2;
            let top_val =
                u16::from_ne_bytes(topleft[top_byte_off..top_byte_off + 2].try_into().unwrap())
                    as i32;
            let pred = (w_v * top_val + (256 - w_v) * bottom_val + 128) >> 8;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(pred as u16).to_ne_bytes());
            x += 1;
        }
    }
}

/// SMOOTH_H prediction 16bpc using AVX-512 — 16 pixels/iter.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_16bpc_avx512_inner(
    _token: Server64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let right_off = tl_off + width * 2;
    let right_val =
        u16::from_ne_bytes(topleft[right_off..right_off + 2].try_into().unwrap()) as i32;
    let right_vec = _mm512_set1_epi32(right_val);
    let rounding = _mm512_set1_epi32(128);
    let c256 = _mm512_set1_epi32(256);
    let zero_512 = _mm512_setzero_si512();

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_byte_off = tl_off - (y + 1) * 2;
        let left_val = u16::from_ne_bytes(
            topleft[left_byte_off..left_byte_off + 2]
                .try_into()
                .unwrap(),
        ) as i32;
        let left_vec = _mm512_set1_epi32(left_val);

        let mut x = 0;
        while x + 16 <= width {
            let wh_bytes = loadu_128!(&weights_hor[x..x + 16], [u8; 16]);
            let w_h = _mm512_cvtepu8_epi32(wh_bytes);
            let w_h_inv = _mm512_sub_epi32(c256, w_h);

            let pred = _mm512_add_epi32(
                _mm512_mullo_epi32(w_h, left_vec),
                _mm512_mullo_epi32(w_h_inv, right_vec),
            );
            let result = _mm512_srai_epi32::<8>(_mm512_add_epi32(pred, rounding));

            let clamped = _mm512_max_epi32(result, zero_512);
            let result_u16: __m256i = _mm512_cvtusepi32_epi16(clamped);
            let off = row_off + x * 2;
            storeu_256!(&mut dst[off..off + 32], [u8; 32], result_u16);

            x += 16;
        }

        while x < width {
            let w_h = weights_hor[x] as i32;
            let pred = (w_h * left_val + (256 - w_h) * right_val + 128) >> 8;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(pred as u16).to_ne_bytes());
            x += 1;
        }
    }
}

/// PAETH prediction for 16bpc
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_paeth_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let topleft_val = u16::from_ne_bytes(topleft[tl_off..tl_off + 2].try_into().unwrap()) as i32;

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_byte_off = tl_off - (y + 1) * 2;
        let left_val = u16::from_ne_bytes(
            topleft[left_byte_off..left_byte_off + 2]
                .try_into()
                .unwrap(),
        ) as i32;

        // Process each pixel - PAETH is complex so use scalar
        for x in 0..width {
            let top_byte_off = tl_off + (x + 1) * 2;
            let top_val =
                u16::from_ne_bytes(topleft[top_byte_off..top_byte_off + 2].try_into().unwrap())
                    as i32;

            // PAETH: pick closest of left, top, topleft to (left + top - topleft)
            let base = left_val + top_val - topleft_val;
            let l_diff = (left_val - base).abs();
            let t_diff = (top_val - base).abs();
            let tl_diff = (topleft_val - base).abs();

            let pred = if l_diff <= t_diff && l_diff <= tl_diff {
                left_val
            } else if t_diff <= tl_diff {
                top_val
            } else {
                topleft_val
            };

            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(pred as u16).to_ne_bytes());
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_paeth_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_paeth_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// SMOOTH prediction for 16bpc
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let right_off = tl_off + width * 2;
    let right_val =
        u16::from_ne_bytes(topleft[right_off..right_off + 2].try_into().unwrap()) as i32;
    let bottom_off = tl_off - height * 2;
    let bottom_val =
        u16::from_ne_bytes(topleft[bottom_off..bottom_off + 2].try_into().unwrap()) as i32;

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_byte_off = tl_off - (y + 1) * 2;
        let left_val = u16::from_ne_bytes(
            topleft[left_byte_off..left_byte_off + 2]
                .try_into()
                .unwrap(),
        ) as i32;
        let w_v = weights_ver[y] as i32;

        for x in 0..width {
            let top_byte_off = tl_off + (1 + x) * 2;
            let top_val =
                u16::from_ne_bytes(topleft[top_byte_off..top_byte_off + 2].try_into().unwrap())
                    as i32;
            let w_h = weights_hor[x] as i32;

            // Vertical component: w_v * top + (256 - w_v) * bottom
            let vert = w_v * top_val + (256 - w_v) * bottom_val;
            // Horizontal component: w_h * left + (256 - w_h) * right
            let horz = w_h * left_val + (256 - w_h) * right_val;
            // Combine with rounding
            let pred = (vert + horz + 256) >> 9;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(pred as u16).to_ne_bytes());
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_smooth_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// SMOOTH_V prediction for 16bpc
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_v_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_ver = &dav1d_sm_weights[height..][..height];
    let bottom_off = tl_off - height * 2;
    let bottom_val =
        u16::from_ne_bytes(topleft[bottom_off..bottom_off + 2].try_into().unwrap()) as i32;

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let w_v = weights_ver[y] as i32;

        for x in 0..width {
            let top_byte_off = tl_off + (1 + x) * 2;
            let top_val =
                u16::from_ne_bytes(topleft[top_byte_off..top_byte_off + 2].try_into().unwrap())
                    as i32;
            let pred = (w_v * top_val + (256 - w_v) * bottom_val + 128) >> 8;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(pred as u16).to_ne_bytes());
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_v_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_smooth_v_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

/// SMOOTH_H prediction for 16bpc
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_smooth_h_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let weights_hor = &dav1d_sm_weights[width..][..width];
    let right_off = tl_off + width * 2;
    let right_val =
        u16::from_ne_bytes(topleft[right_off..right_off + 2].try_into().unwrap()) as i32;

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let left_byte_off = tl_off - (y + 1) * 2;
        let left_val = u16::from_ne_bytes(
            topleft[left_byte_off..left_byte_off + 2]
                .try_into()
                .unwrap(),
        ) as i32;

        for x in 0..width {
            let w_h = weights_hor[x] as i32;
            let pred = (w_h * left_val + (256 - w_h) * right_val + 128) >> 8;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(pred as u16).to_ne_bytes());
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_smooth_h_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    _angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_smooth_h_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
    );
}

// ============================================================================
// Z1 Prediction 16bpc (angular prediction for angles < 90)
// ============================================================================

/// Z1 prediction for 16bpc: directional prediction using top edge only (angles < 90°)
/// Builds preprocessed top edge array internally, handles all cases.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z1_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;

    // Extract angle flags
    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;

    let mut dx = dav1d_dr_intra_derivative[(angle >> 1) as usize] as i32;

    // tl_off is in bytes, convert to pixel offset
    let tl_pix = tl_off / 2;

    // Helper: read u16 pixel from byte slice at pixel offset
    let rd = |off: usize| -> u16 {
        let b = off * 2;
        u16::from_ne_bytes(topleft[b..b + 2].try_into().unwrap())
    };

    let upsample_above = enable_intra_edge_filter
        && (90 - angle) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);

    // Build preprocessed top edge array as u16 pixels
    let mut top_px = [0u16; 64 + 64];
    let (max_base_x, base_inc);

    if upsample_above {
        let kernel: [i8; 4] = [-1, 9, 9, -1];
        let hsz = width_i + height_i;
        let in_off = tl_pix + 1;
        let from = -1i32;
        let to = width_i + std::cmp::min(width_i, height_i);
        for i in 0..hsz - 1 {
            top_px[(i * 2) as usize] =
                rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
            let mut s = 0i32;
            for j in 0..4i32 {
                s += rd(in_off.wrapping_add_signed((i + j - 1).clamp(from, to - 1) as isize))
                    as i32
                    * kernel[j as usize] as i32;
            }
            top_px[(i * 2 + 1) as usize] = ((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
        }
        let i = hsz - 1;
        top_px[(i * 2) as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
        dx <<= 1;
        max_base_x = (2 * (width_i + height_i) - 2) as usize;
        base_inc = 2usize;
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, 90 - angle, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
            let in_off = tl_pix + 1;
            let from = -1i32;
            let to = width_i + std::cmp::min(width_i, height_i);
            let lim_from = 0i32;
            let lim_to = width_i + height_i;
            let mut i = 0i32;
            while i < std::cmp::min(width_i + height_i, lim_from) {
                top_px[i as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
                i += 1;
            }
            while i < std::cmp::min(lim_to, width_i + height_i) {
                let mut s = 0i32;
                for j in 0..5i32 {
                    s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
                        as i32
                        * KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
                }
                top_px[i as usize] = ((s + 8) >> 4) as u16;
                i += 1;
            }
            while i < width_i + height_i {
                top_px[i as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
                i += 1;
            }
            max_base_x = (width_i + height_i - 1) as usize;
        } else {
            // No preprocessing — copy top pixels directly
            for i in 0..width + std::cmp::min(width, height) {
                top_px[i] = rd(tl_pix + 1 + i);
            }
            max_base_x = width + std::cmp::min(width, height) - 1;
        }
        base_inc = 1;
    };

    // Convert top_px to bytes for SIMD access
    let top_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(&top_px[..]);
    let top_bytes = top_bytes.flex();

    let rounding = _mm256_set1_epi32(32);

    for y in 0..height_i {
        let xpos = (y + 1) * dx;
        let frac = (xpos & 0x3e) as i32;
        let inv_frac = 64 - frac;

        let frac_vec = _mm256_set1_epi32(frac);
        let inv_frac_vec = _mm256_set1_epi32(inv_frac);

        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let base0 = (xpos >> 6) as usize;

        let mut x = 0usize;

        // SIMD: 8 pixels at a time (non-upsampled consecutive access)
        if base_inc == 1 {
            while x + 8 <= width && base0 + x + 8 < max_base_x {
                let base = base0 + x;

                let load0 = base * 2;
                let load1 = (base + 1) * 2;
                let t0 = loadu_128!((&top_bytes[load0..load0 + 16]), [u8; 16]);
                let t1 = loadu_128!((&top_bytes[load1..load1 + 16]), [u8; 16]);

                let t0_w = _mm256_cvtepu16_epi32(t0);
                let t1_w = _mm256_cvtepu16_epi32(t1);

                let prod0 = _mm256_mullo_epi32(t0_w, inv_frac_vec);
                let prod1 = _mm256_mullo_epi32(t1_w, frac_vec);
                let sum = _mm256_add_epi32(_mm256_add_epi32(prod0, prod1), rounding);
                let result = _mm256_srai_epi32::<6>(sum);

                let packed = _mm256_packus_epi32(result, result);
                let lo = _mm256_castsi256_si128(packed);
                let hi = _mm256_extracti128_si256::<1>(packed);
                let combined = _mm_unpacklo_epi64(lo, hi);
                let store_off = row_off + x * 2;
                storeu_128!((&mut dst[store_off..store_off + 16]), [u8; 16], combined);

                x += 8;
            }
        }

        // Scalar remainder (also handles upsampled stride-2 access)
        while x < width {
            let base = base0 + base_inc * x;
            if base < max_base_x {
                let t0 = top_px[base] as i32;
                let t1 = top_px[base + 1] as i32;
                let v = t0 * inv_frac + t1 * frac;
                let off = row_off + x * 2;
                dst[off..off + 2].copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
            } else {
                let fill_val = top_px[max_base_x];
                for xx in x..width {
                    let off = row_off + xx * 2;
                    dst[off..off + 2].copy_from_slice(&fill_val.to_ne_bytes());
                }
                break;
            }
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z1_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_z1_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
        angle as i32,
        _bitdepth_max as i32,
    );
}

// ============================================================================
// Z2 Prediction 16bpc (angular prediction for angles 90-180) { return false; }
// ============================================================================

/// Z2 intra prediction SIMD inner for 16bpc.
/// Builds preprocessed edge array internally, handles all cases.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z2_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
    max_width: i32,
    max_height: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;

    // Extract angle flags
    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;

    let mut dy = dav1d_dr_intra_derivative[((angle - 90) >> 1) as usize] as i32;
    let mut dx = dav1d_dr_intra_derivative[((180 - angle) >> 1) as usize] as i32;

    let upsample_left = enable_intra_edge_filter
        && (180 - angle) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);
    let upsample_above = enable_intra_edge_filter
        && (angle - 90) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);

    // Build preprocessed edge array as u16 pixels
    let mut edge_px = [0u16; 64 + 64 + 1];
    let edge_tl = 64usize;

    // Helper: read u16 pixel from byte slice at pixel offset
    let rd = |off: usize| -> u16 {
        let b = off * 2;
        u16::from_ne_bytes(topleft[b..b + 2].try_into().unwrap())
    };
    // tl_off is in bytes, convert to pixel offset
    let tl_pix = tl_off / 2;

    // Top edge preprocessing
    if upsample_above {
        let kernel: [i8; 4] = [-1, 9, 9, -1];
        let hsz = width_i + 1;
        let in_off = tl_pix;
        for i in 0..hsz - 1 {
            edge_px[edge_tl + (i * 2) as usize] = rd(in_off + i.clamp(0, hsz - 1) as usize);
            let mut s = 0i32;
            for j in 0..4i32 {
                s += rd(in_off + (i + j - 1).clamp(0, hsz - 1) as usize) as i32
                    * kernel[j as usize] as i32;
            }
            edge_px[edge_tl + (i * 2 + 1) as usize] = ((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
        }
        let i = hsz - 1;
        edge_px[edge_tl + (i * 2) as usize] = rd(in_off + i.clamp(0, hsz - 1) as usize);
        dx <<= 1;
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, angle - 90, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            // Filtered top edge
            static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
            let in_off = tl_pix + 1;
            let from = -1i32;
            let to = width_i;
            let lim_from = 0i32;
            let lim_to = max_width;
            let mut i = 0i32;
            while i < std::cmp::min(width_i, lim_from) {
                edge_px[edge_tl + 1 + i as usize] =
                    rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
                i += 1;
            }
            while i < std::cmp::min(lim_to, width_i) {
                let mut s = 0i32;
                for j in 0..5i32 {
                    s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
                        as i32
                        * KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
                }
                edge_px[edge_tl + 1 + i as usize] = ((s + 8) >> 4) as u16;
                i += 1;
            }
            while i < width_i {
                edge_px[edge_tl + 1 + i as usize] =
                    rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
                i += 1;
            }
        } else {
            for i in 0..width {
                edge_px[edge_tl + 1 + i] = rd(tl_pix + 1 + i);
            }
        }
    }

    // Left edge preprocessing
    if upsample_left {
        let kernel: [i8; 4] = [-1, 9, 9, -1];
        let hsz = height_i + 1;
        let in_off = tl_pix - height;
        for i in 0..hsz - 1 {
            edge_px[edge_tl - height * 2 + (i * 2) as usize] =
                rd(in_off + i.clamp(0, hsz - 1) as usize);
            let mut s = 0i32;
            for j in 0..4i32 {
                s += rd(in_off + (i + j - 1).clamp(0, hsz - 1) as usize) as i32
                    * kernel[j as usize] as i32;
            }
            edge_px[edge_tl - height * 2 + (i * 2 + 1) as usize] =
                ((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
        }
        let i = hsz - 1;
        edge_px[edge_tl - height * 2 + (i * 2) as usize] =
            rd(in_off + i.clamp(0, hsz - 1) as usize);
        dy <<= 1;
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, 180 - angle, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
            let in_off = tl_pix - height;
            let from = 0i32;
            let to = height_i + 1;
            let lim_from = height_i - max_height;
            let lim_to = height_i;
            let mut i = 0i32;
            while i < std::cmp::min(height_i, lim_from) {
                edge_px[edge_tl - height + i as usize] =
                    rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
                i += 1;
            }
            while i < std::cmp::min(lim_to, height_i) {
                let mut s = 0i32;
                for j in 0..5i32 {
                    s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
                        as i32
                        * KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
                }
                edge_px[edge_tl - height + i as usize] = ((s + 8) >> 4) as u16;
                i += 1;
            }
            while i < height_i {
                edge_px[edge_tl - height + i as usize] =
                    rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
                i += 1;
            }
        } else {
            for i in 0..height {
                edge_px[edge_tl - height + i] = rd(tl_pix - height + i);
            }
        }
    }

    // Corner pixel
    edge_px[edge_tl] = rd(tl_pix);

    // Convert to bytes for SIMD access
    let edge_bytes: &[u8] = zerocopy::IntoBytes::as_bytes(edge_px.as_slice());
    let edge = edge_bytes.flex();

    let base_inc_x = 1 + upsample_above as usize;
    let left = edge_tl - (1 + upsample_left as usize);

    let rounding = _mm256_set1_epi32(32);

    for y in 0..height_i {
        let xpos = ((1 + upsample_above as i32) << 6) - dx * (y + 1);
        let base_x0 = xpos >> 6;
        let frac_x = (xpos & 0x3e) as i32;
        let inv_frac_x = 64 - frac_x;

        let row_off = (dst_base as isize + y as isize * stride) as usize;

        let left_count = if base_x0 >= 0 {
            0usize
        } else {
            let needed = (-base_x0) as usize;
            needed.div_ceil(base_inc_x).min(width)
        };

        // First: process pixels using left edge
        let mut x = 0usize;
        while x < left_count {
            let ypos = (y << (6 + upsample_left as i32)) - dy * (x as i32 + 1);
            let base_y = ypos >> 6;
            let frac_y = ypos & 0x3e;
            let inv_frac_y = 64 - frac_y;

            let l0_pix = left.wrapping_add_signed(-base_y as isize);
            let l1_pix = left.wrapping_add_signed(-(base_y + 1) as isize);
            let l0_off = l0_pix * 2;
            let l1_off = l1_pix * 2;
            let l0 = u16::from_ne_bytes(edge[l0_off..l0_off + 2].try_into().unwrap()) as i32;
            let l1 = u16::from_ne_bytes(edge[l1_off..l1_off + 2].try_into().unwrap()) as i32;
            let v = l0 * inv_frac_y + l1 * frac_y;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
            x += 1;
        }

        // Then: process pixels using top edge
        if base_inc_x == 1 {
            while x + 8 <= width {
                let base_x = (base_x0 + x as i32) as usize;
                let load0 = (edge_tl + base_x) * 2;
                let load1 = (edge_tl + base_x + 1) * 2;
                if load1 + 16 > edge.len() {
                    break;
                }
                let t0 = loadu_128!((&edge[load0..load0 + 16]), [u8; 16]);
                let t1 = loadu_128!((&edge[load1..load1 + 16]), [u8; 16]);

                let t0_w = _mm256_cvtepu16_epi32(t0);
                let t1_w = _mm256_cvtepu16_epi32(t1);

                let frac_vec = _mm256_set1_epi32(frac_x);
                let inv_frac_vec = _mm256_set1_epi32(inv_frac_x);

                let prod0 = _mm256_mullo_epi32(t0_w, inv_frac_vec);
                let prod1 = _mm256_mullo_epi32(t1_w, frac_vec);
                let sum = _mm256_add_epi32(_mm256_add_epi32(prod0, prod1), rounding);
                let result = _mm256_srai_epi32::<6>(sum);

                let packed = _mm256_packus_epi32(result, result);
                let lo = _mm256_castsi256_si128(packed);
                let hi = _mm256_extracti128_si256::<1>(packed);
                let combined = _mm_unpacklo_epi64(lo, hi);
                let store_off = row_off + x * 2;
                storeu_128!((&mut dst[store_off..store_off + 16]), [u8; 16], combined);

                x += 8;
            }
        }

        // Scalar remainder
        while x < width {
            let base_x = (base_x0 + (base_inc_x * x) as i32) as usize;
            let t0_off = (edge_tl + base_x) * 2;
            let t1_off = (edge_tl + base_x + 1) * 2;
            if t1_off + 2 > edge.len() {
                break;
            }
            let t0 = u16::from_ne_bytes(edge[t0_off..t0_off + 2].try_into().unwrap()) as i32;
            let t1 = u16::from_ne_bytes(edge[t1_off..t1_off + 2].try_into().unwrap()) as i32;
            let v = t0 * inv_frac_x + t1 * frac_x;
            let off = row_off + x * 2;
            dst[off..off + 2].copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
            x += 1;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z2_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    max_width: c_int,
    max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_z2_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
        angle as i32,
        max_width as i32,
        max_height as i32,
        _bitdepth_max as i32,
    );
}

// ============================================================================
// Z3 Prediction 16bpc (angular prediction for angles > 180) { return false; }
// ============================================================================

/// Z3 prediction for 16bpc: directional prediction using left edge only (angles > 180°)
/// Builds preprocessed left edge array internally, handles all cases.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_z3_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    angle: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let width_i = width as i32;
    let height_i = height as i32;

    // Extract angle flags
    let is_sm = (angle >> 9) & 1 != 0;
    let enable_intra_edge_filter = (angle >> 10) != 0;
    let angle = angle & 511;

    let mut dy = dav1d_dr_intra_derivative[((270 - angle) >> 1) as usize] as usize;

    // tl_off is in bytes, convert to pixel offset
    let tl_pix = tl_off / 2;

    // Helper: read u16 pixel from byte slice at pixel offset
    let rd = |off: usize| -> u16 {
        let b = off * 2;
        u16::from_ne_bytes(topleft[b..b + 2].try_into().unwrap())
    };

    let upsample_left = enable_intra_edge_filter
        && (angle - 180) < 40
        && (width_i + height_i) <= (16 >> is_sm as i32);

    // Build preprocessed left edge array as u16 pixels
    // left_px[left_off - base] = pixel at distance `base` from corner
    let mut left_px = [0u16; 64 + 64];
    let (left_off, max_base_y, base_inc);
    let use_left_px; // whether to use left_px or topleft directly

    if upsample_left {
        let kernel: [i8; 4] = [-1, 9, 9, -1];
        let hsz = width_i + height_i;
        let in_off = tl_pix - (width + height);
        let from = std::cmp::max(width_i - height_i, 0);
        let to = width_i + height_i + 1;
        for i in 0..hsz - 1 {
            left_px[(i * 2) as usize] =
                rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
            let mut s = 0i32;
            for j in 0..4i32 {
                s += rd(in_off.wrapping_add_signed((i + j - 1).clamp(from, to - 1) as isize))
                    as i32
                    * kernel[j as usize] as i32;
            }
            left_px[(i * 2 + 1) as usize] = ((s + 8) >> 4).clamp(0, bitdepth_max) as u16;
        }
        let i = hsz - 1;
        left_px[(i * 2) as usize] = rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
        left_off = (2 * (width_i + height_i) - 2) as usize;
        max_base_y = left_off;
        dy <<= 1;
        base_inc = 2usize;
        use_left_px = true;
    } else {
        let filter_strength = if enable_intra_edge_filter {
            get_filter_strength_simple(width_i + height_i, angle - 180, is_sm)
        } else {
            0
        };
        if filter_strength != 0 {
            static KERNEL: [[u8; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]];
            let in_off = tl_pix - (width + height);
            let from = std::cmp::max(width_i - height_i, 0);
            let to = width_i + height_i + 1;
            let lim_from = 0i32;
            let lim_to = width_i + height_i;
            let mut i = 0i32;
            while i < std::cmp::min(width_i + height_i, lim_from) {
                left_px[i as usize] =
                    rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
                i += 1;
            }
            while i < std::cmp::min(lim_to, width_i + height_i) {
                let mut s = 0i32;
                for j in 0..5i32 {
                    s += rd(in_off.wrapping_add_signed((i - 2 + j).clamp(from, to - 1) as isize))
                        as i32
                        * KERNEL[(filter_strength - 1) as usize][j as usize] as i32;
                }
                left_px[i as usize] = ((s + 8) >> 4) as u16;
                i += 1;
            }
            while i < width_i + height_i {
                left_px[i as usize] =
                    rd(in_off.wrapping_add_signed(i.clamp(from, to - 1) as isize));
                i += 1;
            }
            left_off = (width_i + height_i - 1) as usize;
            max_base_y = left_off;
            use_left_px = true;
        } else {
            // No preprocessing — access topleft directly
            left_off = 0; // unused for direct topleft access
            max_base_y = height + std::cmp::min(width, height) - 1;
            use_left_px = false;
        }
        base_inc = 1;
    };

    // Column-major access pattern
    for x in 0..width {
        let ypos = dy * (x + 1);
        let frac = (ypos & 0x3e) as i32;
        let inv_frac = 64 - frac;

        for y in 0..height_i {
            let base = (ypos >> 6) + base_inc * y as usize;

            if base < max_base_y {
                let (l0, l1) = if use_left_px {
                    (
                        left_px[left_off - base] as i32,
                        left_px[left_off - base - 1] as i32,
                    )
                } else {
                    // Direct topleft: left[base] = tl[-(base+1)] in pixel units
                    (rd(tl_pix - base - 1) as i32, rd(tl_pix - base - 2) as i32)
                };
                let v = l0 * inv_frac + l1 * frac;
                let pixel_off = (dst_base as isize + y as isize * stride) as usize + x * 2;
                dst[pixel_off..pixel_off + 2]
                    .copy_from_slice(&(((v + 32) >> 6) as u16).to_ne_bytes());
            } else {
                let fill_val = if use_left_px {
                    left_px[left_off - max_base_y]
                } else {
                    rd(tl_pix - max_base_y - 1)
                };
                for yy in y..height_i {
                    let pixel_off = (dst_base as isize + yy as isize * stride) as usize + x * 2;
                    dst[pixel_off..pixel_off + 2].copy_from_slice(&fill_val.to_ne_bytes());
                }
                break;
            }
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_z3_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    angle: c_int,
    _max_width: c_int,
    _max_height: c_int,
    _bitdepth_max: c_int,
    _topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_z3_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
        angle as i32,
        _bitdepth_max as i32,
    );
}

// ============================================================================
// FILTER Prediction 16bpc
// ============================================================================

/// FILTER prediction for 16bpc: uses 7-tap filter for intra prediction
///
/// Processes in 4x2 blocks. Each output pixel uses 7 input samples.
/// Input pixels: p0 = topleft, p1-p4 = top row (4 pixels), p5-p6 = left column (2 pixels) { return false; }
/// For 16bpc: out = (sum + 8) >> 4, clamped to [0, bitdepth_max]
#[cfg(target_arch = "x86_64")]
#[arcane]
fn ipred_filter_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    topleft: &[u8],
    tl_off: usize,
    width: usize,
    height: usize,
    filt_idx: i32,
    bitdepth_max: i32,
    topleft_off: usize,
) {
    let mut dst = dst.flex_mut();
    let topleft = topleft.flex();
    let width = (width as usize / 4) * 4; // Round down to multiple of 4
    let filt_idx = (filt_idx as usize) & 511;

    let filter = &dav1d_filter_intra_taps[filt_idx];

    // Process in 4x2 blocks
    for y in (0..height).step_by(2) {
        let cur_tl_off = topleft_off - y;
        // tl_pixel = topleft at byte offset for pixel position cur_tl_off
        let tl_pixel_off = tl_off.wrapping_add(cur_tl_off * 2);
        let mut tl_pixel =
            u16::from_ne_bytes(topleft[tl_pixel_off..tl_pixel_off + 2].try_into().unwrap()) as i32;

        let row0_off = (dst_base as isize + y as isize * stride) as usize;
        let row1_off = (dst_base as isize + (y + 1) as isize * stride) as usize;

        for x in (0..width).step_by(4) {
            // Get top 4 pixels (p1-p4)
            // y=0: from topleft buffer; y>=2: from previously-written output row y-1
            let (p1, p2, p3, p4) = if y == 0 {
                let top_base = tl_off.wrapping_add((topleft_off + 1 + x) * 2);
                (
                    u16::from_ne_bytes(topleft[top_base..top_base + 2].try_into().unwrap()) as i32,
                    u16::from_ne_bytes(topleft[top_base + 2..top_base + 4].try_into().unwrap())
                        as i32,
                    u16::from_ne_bytes(topleft[top_base + 4..top_base + 6].try_into().unwrap())
                        as i32,
                    u16::from_ne_bytes(topleft[top_base + 6..top_base + 8].try_into().unwrap())
                        as i32,
                )
            } else {
                let top_row = (dst_base as isize + (y as isize - 1) * stride) as usize;
                let tb = top_row + x * 2;
                (
                    u16::from_ne_bytes(dst[tb..tb + 2].try_into().unwrap()) as i32,
                    u16::from_ne_bytes(dst[tb + 2..tb + 4].try_into().unwrap()) as i32,
                    u16::from_ne_bytes(dst[tb + 4..tb + 6].try_into().unwrap()) as i32,
                    u16::from_ne_bytes(dst[tb + 6..tb + 8].try_into().unwrap()) as i32,
                )
            };

            // Get left 2 pixels (p5, p6)
            let (p5, p6) = if x == 0 {
                // From original topleft buffer
                let left_base = tl_off.wrapping_add(cur_tl_off.wrapping_sub(1) * 2);
                let left_base2 = tl_off.wrapping_add(cur_tl_off.wrapping_sub(2) * 2);
                (
                    u16::from_ne_bytes(topleft[left_base..left_base + 2].try_into().unwrap())
                        as i32,
                    u16::from_ne_bytes(topleft[left_base2..left_base2 + 2].try_into().unwrap())
                        as i32,
                )
            } else {
                // From previously computed output
                let p5_off = row0_off + (x - 1) * 2;
                let p6_off = row1_off + (x - 1) * 2;
                (
                    u16::from_ne_bytes(dst[p5_off..p5_off + 2].try_into().unwrap()) as i32,
                    u16::from_ne_bytes(dst[p6_off..p6_off + 2].try_into().unwrap()) as i32,
                )
            };

            let p0 = tl_pixel;
            let p = [p0, p1, p2, p3, p4, p5, p6];

            // Process 4x2 = 8 output pixels using filter taps
            let flt = filter.as_slice();
            let mut flt_offset = 0;

            // Row 0 (4 pixels)
            for xx in 0..4 {
                let acc = filter_fn(&flt[flt_offset..], p);
                let val = ((acc + 8) >> 4).clamp(0, bitdepth_max as i32) as u16;
                let off = row0_off + (x + xx) * 2;
                dst[off..off + 2].copy_from_slice(&val.to_ne_bytes());
                flt_offset += FLT_INCR;
            }

            // Row 1 (4 pixels)
            for xx in 0..4 {
                let acc = filter_fn(&flt[flt_offset..], p);
                let val = ((acc + 8) >> 4).clamp(0, bitdepth_max as i32) as u16;
                let off = row1_off + (x + xx) * 2;
                dst[off..off + 2].copy_from_slice(&val.to_ne_bytes());
                flt_offset += FLT_INCR;
            }

            // Update topleft for next 4x2 block (16bpc)
            tl_pixel = p4;
        }
    }
}

#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn ipred_filter_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    topleft: *const DynPixel,
    width: c_int,
    height: c_int,
    filt_idx: c_int,
    _max_width: c_int,
    _max_height: c_int,
    bitdepth_max: c_int,
    topleft_off: usize,
    _dst: *const FFISafe<PicOffset>,
) {
    let token = unsafe { Desktop64::forge_token_dangerously() };
    let buf_len = compute_ipred_buf_len(stride as isize, width as usize * 2, height as usize);
    let dst_sl = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, buf_len) };
    let (tl_sl, tl_off) = compute_topleft_slice(
        topleft as *const u8,
        width as usize * 2,
        height as usize * 2,
    );
    ipred_filter_16bpc_inner(
        token,
        dst_sl,
        0,
        stride as isize,
        tl_sl,
        tl_off,
        width as usize,
        height as usize,
        filt_idx as i32,
        bitdepth_max as i32,
        topleft_off,
    );
}

// ============================================================================
// CFL Prediction (chroma-from-luma) — SIMD
// ============================================================================

/// CFL prediction for 8bpc — AVX2.
///
/// For each pixel: `dst = clip(dc + apply_sign((|alpha*ac| + 32) >> 6, alpha*ac), 0, 255)`.
/// Uses i32 SIMD (8 pixels per chunk) since `alpha*ac` can exceed i16 range for HBD.
/// Here we share the same code shape for both bit depths (caller selects the inner fn).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_pred_8bpc_inner(
    _token: Desktop64,
    dst: &mut [u8],
    dst_base: usize,
    stride: isize,
    ac: &[i16],
    width: usize,
    height: usize,
    dc: i32,
    alpha: i32,
) {
    let mut dst = dst.flex_mut();
    let alpha_v = _mm256_set1_epi32(alpha);
    let dc_v = _mm256_set1_epi32(dc);
    let c32 = _mm256_set1_epi32(32);

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let ac_off = y * width;

        if width >= 8 {
            let mut x = 0;
            while x + 8 <= width {
                // Load 8 i16, widen to 8 i32
                let ac128 = loadu_128!(&ac[ac_off + x..ac_off + x + 8], [i16; 8]);
                let ac32 = _mm256_cvtepi16_epi32(ac128);

                let diff = _mm256_mullo_epi32(ac32, alpha_v);
                let abs_diff = _mm256_abs_epi32(diff);
                let plus32 = _mm256_add_epi32(abs_diff, c32);
                let shifted = _mm256_srli_epi32::<6>(plus32);
                let signed = _mm256_sign_epi32(shifted, diff);
                let result = _mm256_add_epi32(dc_v, signed); // 8 i32

                // Pack 8 i32 -> 8 u8 with saturation [0, 255]
                // packus_epi32 lane-local: [r[0..3],r[0..3],r[4..7],r[4..7]] in 16 u16
                let p16 = _mm256_packus_epi32(result, result);
                // Move qword 2 (r[4..7]) into qword 1: imm = 0b_00_00_10_00
                let p16_ordered = _mm256_permute4x64_epi64::<0b_00_00_10_00>(p16);
                let p16_lo = _mm256_castsi256_si128(p16_ordered); // 8 u16 in order
                let p8 = _mm_packus_epi16(p16_lo, p16_lo); // low 8 bytes = result
                // Store low 8 bytes
                let dst_chunk: &mut [u8; 8] =
                    (&mut dst[row_off + x..row_off + x + 8]).try_into().unwrap();
                safe_unaligned_simd::x86_64::_mm_storeu_si64(dst_chunk, p8);
                x += 8;
            }
        } else if width == 4 {
            // Load 4 i16 into a padded buffer (zero high lanes)
            let mut buf = [0i16; 8];
            buf[..4].copy_from_slice(&ac[ac_off..ac_off + 4]);
            let ac128 = loadu_128!(&buf);
            let ac32 = _mm256_cvtepi16_epi32(ac128);

            let diff = _mm256_mullo_epi32(ac32, alpha_v);
            let abs_diff = _mm256_abs_epi32(diff);
            let plus32 = _mm256_add_epi32(abs_diff, c32);
            let shifted = _mm256_srli_epi32::<6>(plus32);
            let signed = _mm256_sign_epi32(shifted, diff);
            let result = _mm256_add_epi32(dc_v, signed);

            // Pack and grab low 4 bytes
            let p16 = _mm256_packus_epi32(result, result);
            let p16_lo = _mm256_castsi256_si128(p16); // First 4 u16 = r[0..3]
            let p8 = _mm_packus_epi16(p16_lo, p16_lo); // low 4 bytes = r[0..3]
            let dst_chunk: &mut [u8; 4] = (&mut dst[row_off..row_off + 4]).try_into().unwrap();
            safe_unaligned_simd::x86_64::_mm_storeu_si32(dst_chunk, p8);
        }
    }
}

/// CFL prediction for 16bpc — AVX2.
///
/// Same algorithm as 8bpc but output is u16; clip to `bitdepth_max`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_pred_16bpc_inner(
    _token: Desktop64,
    dst: &mut [u8], // raw bytes; pairs interpreted as u16 little-endian
    dst_base: usize,
    stride: isize,
    ac: &[i16],
    width: usize,
    height: usize,
    dc: i32,
    alpha: i32,
    bitdepth_max: i32,
) {
    let mut dst = dst.flex_mut();
    let alpha_v = _mm256_set1_epi32(alpha);
    let dc_v = _mm256_set1_epi32(dc);
    let c32 = _mm256_set1_epi32(32);
    let max_v = _mm256_set1_epi32(bitdepth_max);
    let zero_v = _mm256_setzero_si256();

    for y in 0..height {
        let row_off = (dst_base as isize + y as isize * stride) as usize;
        let ac_off = y * width;

        if width >= 8 {
            let mut x = 0;
            while x + 8 <= width {
                let ac128 = loadu_128!(&ac[ac_off + x..ac_off + x + 8], [i16; 8]);
                let ac32 = _mm256_cvtepi16_epi32(ac128);

                let diff = _mm256_mullo_epi32(ac32, alpha_v);
                let abs_diff = _mm256_abs_epi32(diff);
                let plus32 = _mm256_add_epi32(abs_diff, c32);
                let shifted = _mm256_srli_epi32::<6>(plus32);
                let signed = _mm256_sign_epi32(shifted, diff);
                let mut result = _mm256_add_epi32(dc_v, signed);
                // Clip to [0, bitdepth_max]
                result = _mm256_max_epi32(result, zero_v);
                result = _mm256_min_epi32(result, max_v);

                // Pack 8 i32 -> 8 u16 (saturating to u16 range, already clipped to bitdepth_max).
                let p16 = _mm256_packus_epi32(result, result);
                let p16_ordered = _mm256_permute4x64_epi64::<0b_00_00_10_00>(p16);
                let p16_lo = _mm256_castsi256_si128(p16_ordered); // 8 u16 in order
                // Store 16 bytes (8 u16) at dst[row_off + x*2 ..]
                let byte_x = x * 2;
                let dst_chunk: &mut [u8; 16] = (&mut dst[row_off + byte_x..row_off + byte_x + 16])
                    .try_into()
                    .unwrap();
                storeu_128!(dst_chunk, p16_lo);
                x += 8;
            }
        } else if width == 4 {
            let mut buf = [0i16; 8];
            buf[..4].copy_from_slice(&ac[ac_off..ac_off + 4]);
            let ac128 = loadu_128!(&buf);
            let ac32 = _mm256_cvtepi16_epi32(ac128);

            let diff = _mm256_mullo_epi32(ac32, alpha_v);
            let abs_diff = _mm256_abs_epi32(diff);
            let plus32 = _mm256_add_epi32(abs_diff, c32);
            let shifted = _mm256_srli_epi32::<6>(plus32);
            let signed = _mm256_sign_epi32(shifted, diff);
            let mut result = _mm256_add_epi32(dc_v, signed);
            result = _mm256_max_epi32(result, zero_v);
            result = _mm256_min_epi32(result, max_v);

            let p16 = _mm256_packus_epi32(result, result);
            let p16_lo = _mm256_castsi256_si128(p16); // First 4 u16 = r[0..3]
            // Store 8 bytes (4 u16)
            let dst_chunk: &mut [u8; 8] = (&mut dst[row_off..row_off + 8]).try_into().unwrap();
            safe_unaligned_simd::x86_64::_mm_storeu_si64(dst_chunk, p16_lo);
        }
    }
}

// ============================================================================
// Safe dispatch wrapper for x86_64 AVX2
// ============================================================================

use crate::include::common::bitdepth::BitDepth;
use crate::src::internal::SCRATCH_EDGE_LEN;

/// Safe dispatch for intra prediction. Returns true if SIMD was used.
#[cfg(target_arch = "x86_64")]
pub fn intra_pred_dispatch<BD: BitDepth>(
    mode: usize,
    dst: PicOffset,
    topleft: &[BD::Pixel; SCRATCH_EDGE_LEN],
    topleft_off: usize,
    width: c_int,
    height: c_int,
    angle: c_int,
    max_width: c_int,
    max_height: c_int,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::BPC;
    use zerocopy::IntoBytes;

    let Some(token) = crate::src::cpu::summon_avx2() else {
        return false;
    };

    // Try AVX-512 for modes that benefit from wider registers
    #[cfg(target_arch = "x86_64")]
    let avx512_token = crate::src::cpu::summon_avx512();
    #[cfg(not(target_arch = "x86_64"))]
    let avx512_token: Option<Server64> = None;

    // Try AVX-512ICL (v4x: VBMI vpermb/vpermi2b) for directional predictors.
    #[cfg(target_arch = "x86_64")]
    let avx512x_token = crate::src::cpu::summon_avx512x();

    let w = width as usize;
    let h = height as usize;
    let bd_c = bd.into_c();

    // Get byte-level views (safe via zerocopy IntoBytes)
    let tl_bytes: &[u8] = topleft.as_bytes();

    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        w,
        h,
        |dst_bytes, dst_base_bytes, byte_stride| {
            match (BD::BPC, mode) {
                (BPC::BPC8, 0) => {
                    if let Some(t512) = avx512_token {
                        ipred_dc_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    } else {
                        ipred_dc_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC8, 1) => {
                    if let Some(t512) = avx512_token {
                        ipred_v_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    } else {
                        ipred_v_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC8, 2) => {
                    if let Some(t512) = avx512_token {
                        ipred_h_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    } else {
                        ipred_h_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC8, 3) => {
                    if let Some(t512) = avx512_token {
                        ipred_dc_left_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    } else {
                        ipred_dc_left_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC8, 4) => {
                    if let Some(t512) = avx512_token {
                        ipred_dc_top_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    } else {
                        ipred_dc_top_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC8, 5) => {
                    if let Some(t512) = avx512_token {
                        ipred_dc_128_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            w,
                            h,
                        )
                    } else {
                        ipred_dc_128_8bpc_inner(token, dst_bytes, dst_base_bytes, byte_stride, w, h)
                    }
                }
                (BPC::BPC8, 6) => {
                    if let Some(t512x) = avx512x_token {
                        ipred_z1_8bpc_v4x_inner(
                            t512x,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                            angle as i32,
                        );
                    } else {
                        ipred_z1_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                            angle as i32,
                        );
                    }
                }
                (BPC::BPC8, 7) => {
                    if let Some(t512x) = avx512x_token {
                        ipred_z2_8bpc_v4x_inner(
                            t512x,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                            angle as i32,
                            max_width,
                            max_height,
                        );
                    } else {
                        ipred_z2_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                            angle as i32,
                            max_width,
                            max_height,
                        );
                    }
                }
                (BPC::BPC8, 8) => {
                    if let Some(t512x) = avx512x_token {
                        ipred_z3_8bpc_v4x_inner(
                            t512x,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                            angle as i32,
                        );
                    } else {
                        ipred_z3_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                            angle as i32,
                        );
                    }
                }
                (BPC::BPC8, 9) => {
                    if let Some(t512) = avx512_token {
                        ipred_smooth_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    } else {
                        ipred_smooth_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC8, 10) => {
                    if let Some(t512) = avx512_token {
                        ipred_smooth_v_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    } else {
                        ipred_smooth_v_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC8, 11) => {
                    if let Some(t512) = avx512_token {
                        ipred_smooth_h_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    } else {
                        ipred_smooth_h_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC8, 12) => {
                    if let Some(t512) = avx512_token {
                        ipred_paeth_8bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    } else {
                        ipred_paeth_8bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            topleft_off,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC8, 13) => {
                    ipred_filter_8bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        0, // tl_off: full array starts at 0
                        w,
                        h,
                        angle as i32,
                        topleft_off,
                    )
                }
                (BPC::BPC16, 0) => {
                    let tl_off_bytes = topleft_off * 2;
                    if let Some(t512) = avx512_token {
                        ipred_dc_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    } else {
                        ipred_dc_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC16, 1) => {
                    let tl_off_bytes = topleft_off * 2;
                    if let Some(t512) = avx512_token {
                        ipred_v_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    } else {
                        ipred_v_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC16, 2) => {
                    let tl_off_bytes = topleft_off * 2;
                    if let Some(t512) = avx512_token {
                        ipred_h_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    } else {
                        ipred_h_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC16, 3) => {
                    let tl_off_bytes = topleft_off * 2;
                    if let Some(t512) = avx512_token {
                        ipred_dc_left_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    } else {
                        ipred_dc_left_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC16, 4) => {
                    let tl_off_bytes = topleft_off * 2;
                    if let Some(t512) = avx512_token {
                        ipred_dc_top_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    } else {
                        ipred_dc_top_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC16, 5) => {
                    if let Some(t512) = avx512_token {
                        ipred_dc_128_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            w,
                            h,
                            bd_c as i32,
                        )
                    } else {
                        ipred_dc_128_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            w,
                            h,
                            bd_c as i32,
                        )
                    }
                }
                (BPC::BPC16, 6) => {
                    let tl_off_bytes = topleft_off * 2;
                    ipred_z1_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        tl_off_bytes,
                        w,
                        h,
                        angle as i32,
                        bd_c,
                    );
                }
                (BPC::BPC16, 7) => {
                    let tl_off_bytes = topleft_off * 2;
                    ipred_z2_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        tl_off_bytes,
                        w,
                        h,
                        angle as i32,
                        max_width,
                        max_height,
                        bd_c,
                    );
                }
                (BPC::BPC16, 8) => {
                    let tl_off_bytes = topleft_off * 2;
                    ipred_z3_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        tl_off_bytes,
                        w,
                        h,
                        angle as i32,
                        bd_c,
                    );
                }
                (BPC::BPC16, 9) => {
                    let tl_off_bytes = topleft_off * 2;
                    if let Some(t512) = avx512_token {
                        ipred_smooth_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    } else {
                        ipred_smooth_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC16, 10) => {
                    let tl_off_bytes = topleft_off * 2;
                    if let Some(t512) = avx512_token {
                        ipred_smooth_v_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    } else {
                        ipred_smooth_v_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC16, 11) => {
                    let tl_off_bytes = topleft_off * 2;
                    if let Some(t512) = avx512_token {
                        ipred_smooth_h_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    } else {
                        ipred_smooth_h_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC16, 12) => {
                    let tl_off_bytes = topleft_off * 2;
                    if let Some(t512) = avx512_token {
                        ipred_paeth_16bpc_avx512_inner(
                            t512,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    } else {
                        ipred_paeth_16bpc_inner(
                            token,
                            dst_bytes,
                            dst_base_bytes,
                            byte_stride,
                            tl_bytes,
                            tl_off_bytes,
                            w,
                            h,
                        )
                    }
                }
                (BPC::BPC16, 13) => {
                    ipred_filter_16bpc_inner(
                        token,
                        dst_bytes,
                        dst_base_bytes,
                        byte_stride,
                        tl_bytes,
                        0, // tl_off: full array starts at 0
                        w,
                        h,
                        angle as i32,
                        bd_c as i32,
                        topleft_off,
                    )
                }
                _ => return false,
            }
            true
        },
    ) // with_pixel_guard_mut
}

/// Safe dispatch for CFL prediction. Returns true if SIMD was used.
///
/// Operates on `width * height` AC coefficients from `ac`, computing
/// `dst = clip(dc + signed_round((alpha * ac) / 64), 0, bitdepth_max)`.
#[cfg(target_arch = "x86_64")]
pub fn cfl_pred_dispatch<BD: BitDepth>(
    dst: PicOffset,
    width: c_int,
    height: c_int,
    dc: c_int,
    ac: &[i16],
    alpha: c_int,
    bd: BD,
) -> bool {
    use crate::include::common::bitdepth::AsPrimitive;
    use crate::include::common::bitdepth::BPC;

    let Some(token) = crate::src::cpu::summon_avx2() else {
        return false;
    };

    let w = width as usize;
    let h = height as usize;
    // Only handle widths we have SIMD paths for (4, 8, 16, 32, 64).
    if !(w == 4 || w == 8 || w == 16 || w == 32 || w == 64) {
        return false;
    }

    let ac_slice = &ac[..w * h];

    crate::include::dav1d::picture::with_pixel_guard_mut::<BD, _>(
        &dst,
        w,
        h,
        |dst_bytes, dst_base_bytes, byte_stride| match BD::BPC {
            BPC::BPC8 => cfl_pred_8bpc_inner(
                token,
                dst_bytes,
                dst_base_bytes,
                byte_stride,
                ac_slice,
                w,
                h,
                dc,
                alpha,
            ),
            BPC::BPC16 => cfl_pred_16bpc_inner(
                token,
                dst_bytes,
                dst_base_bytes,
                byte_stride,
                ac_slice,
                w,
                h,
                dc,
                alpha,
                bd.bitdepth_max().as_::<i32>(),
            ),
        },
    );
    true
}

// ============================================================================
// CFL AC (chroma-from-luma AC pre-pass) — SIMD
// ============================================================================
//
// For each chroma block, sum the corresponding luma subblocks (2x2 for 4:2:0,
// 1x2 for 4:2:2, 1x1 for 4:4:4), pad right/bottom edges, compute the mean,
// and subtract the mean from each entry. Result feeds `cfl_pred_*`.
//
// Layout: `ac[width * height]` row-major. After computation, sum(ac) == 0.
//
// Max ac magnitude before mean-subtract: 2040 (= 4*255 << 1 for 4:2:0,
// = 2*255 << 2 for 4:2:2, = 255 << 3 for 4:4:4). Comfortably fits i16.

/// CFL AC 4:2:0 8bpc inner kernel — AVX2.
///
/// `src_bytes`: luma byte slice (compact or original-layout).
/// `src_base`: byte offset of first source pixel within `src_bytes`.
/// `src_stride`: byte stride between source rows (signed; can be negative).
/// `ac`: output i16 buffer of length `width * height`.
/// `width`, `height`: chroma block dimensions (final ac layout).
/// `active_w`, `active_h`: in-bounds chroma extent (right/bottom edges may pad).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_ac_420_8bpc_inner(
    _token: Desktop64,
    ac: &mut [i16],
    width: usize,
    height: usize,
    active_w: usize,
    active_h: usize,
    src_bytes: &[u8],
    src_base: usize,
    src_stride: isize,
) {
    let ones = _mm256_set1_epi8(1);

    // Step 1: Pair-sum 2 rows of luma into 1 row of ac.
    // For each chroma row y in 0..active_h:
    //   For x in 0..active_w (stride 16 in SIMD chunks):
    //     load 32 u8 from row1, 32 u8 from row2
    //     maddubs(row1, ones) -> 16 i16 pair-sums (a[0]+a[1], a[2]+a[3], ...)
    //     maddubs(row2, ones) -> 16 i16 pair-sums
    //     sum -> 16 i16 (range 0..1020)
    //     shift left by 1 -> 16 i16 (range 0..2040)
    //     store to ac[y*width + x..]
    for y in 0..active_h {
        let aci = y * width;
        let row1_off = (src_base as isize + (2 * y) as isize * src_stride) as usize;
        let row2_off = (src_base as isize + (2 * y + 1) as isize * src_stride) as usize;

        let mut x = 0;
        // 16-chroma-pixel SIMD chunk (32 luma bytes per row).
        while x + 16 <= active_w {
            let lx = 2 * x;
            let r1 = loadu_256!(
                <&[u8; 32]>::try_from(&src_bytes[row1_off + lx..row1_off + lx + 32]).unwrap()
            );
            let r2 = loadu_256!(
                <&[u8; 32]>::try_from(&src_bytes[row2_off + lx..row2_off + lx + 32]).unwrap()
            );
            let s1 = _mm256_maddubs_epi16(r1, ones); // 16 i16, lane-local
            let s2 = _mm256_maddubs_epi16(r2, ones);
            let sum = _mm256_add_epi16(s1, s2);
            let shifted = _mm256_slli_epi16::<1>(sum);
            // Lane order: maddubs is lane-local so this stores 16 i16 in
            // chroma order (lane 0 covers chroma x..x+8, lane 1 covers x+8..x+16).
            storeu_256!(
                <&mut [i16; 16]>::try_from(&mut ac[aci + x..aci + x + 16]).unwrap(),
                shifted
            );
            x += 16;
        }
        // 8-chroma-pixel SIMD chunk (16 luma bytes per row).
        while x + 8 <= active_w {
            let lx = 2 * x;
            let r1 = loadu_128!(
                <&[u8; 16]>::try_from(&src_bytes[row1_off + lx..row1_off + lx + 16]).unwrap()
            );
            let r2 = loadu_128!(
                <&[u8; 16]>::try_from(&src_bytes[row2_off + lx..row2_off + lx + 16]).unwrap()
            );
            let ones128 = _mm_set1_epi8(1);
            let s1 = _mm_maddubs_epi16(r1, ones128);
            let s2 = _mm_maddubs_epi16(r2, ones128);
            let sum = _mm_add_epi16(s1, s2);
            let shifted = _mm_slli_epi16::<1>(sum);
            storeu_128!(
                <&mut [i16; 8]>::try_from(&mut ac[aci + x..aci + x + 8]).unwrap(),
                shifted
            );
            x += 8;
        }
        // Scalar tail for narrow widths (active_w == 4 with no leftover).
        while x < active_w {
            let lx = 2 * x;
            let a = src_bytes[row1_off + lx] as i32;
            let b = src_bytes[row1_off + lx + 1] as i32;
            let c = src_bytes[row2_off + lx] as i32;
            let d = src_bytes[row2_off + lx + 1] as i32;
            ac[aci + x] = ((a + b + c + d) << 1) as i16;
            x += 1;
        }

        // Right-edge padding: repeat last in-bounds chroma value.
        if active_w < width {
            let pad = ac[aci + active_w - 1];
            for x in active_w..width {
                ac[aci + x] = pad;
            }
        }
    }

    // Step 2: Bottom-edge padding (copy last in-bounds row down).
    if active_h < height {
        let src_row_start = (active_h - 1) * width;
        // Use a manual copy loop to keep ac borrowed mutably without split_at.
        for y in active_h..height {
            let dst_off = y * width;
            // Borrow tracker: ac is a single &mut slice; copy_within is safe.
            ac.copy_within(src_row_start..src_row_start + width, dst_off);
        }
    }

    // Step 3: Sum reduction -> mean.
    let n = width * height;
    let log2sz = (width.trailing_zeros() + height.trailing_zeros()) as i32;
    let mut sum_i32 = 1i32 << log2sz >> 1; // round bias
    {
        // SIMD accumulator over the whole ac buffer.
        let mut acc = _mm256_setzero_si256();
        let mut i = 0;
        while i + 16 <= n {
            let v = loadu_256!(<&[i16; 16]>::try_from(&ac[i..i + 16]).unwrap());
            // Widen i16 -> i32 in two halves, accumulate.
            let lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v));
            let hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v));
            acc = _mm256_add_epi32(acc, lo);
            acc = _mm256_add_epi32(acc, hi);
            i += 16;
        }
        // Horizontal reduce acc (8 i32).
        let acc_lo = _mm256_castsi256_si128(acc);
        let acc_hi = _mm256_extracti128_si256::<1>(acc);
        let s128 = _mm_add_epi32(acc_lo, acc_hi);
        let s64 = _mm_add_epi32(s128, _mm_shuffle_epi32::<0b_01_00_11_10>(s128));
        let s32 = _mm_add_epi32(s64, _mm_shuffle_epi32::<0b_00_00_00_01>(s64));
        sum_i32 = sum_i32.wrapping_add(_mm_cvtsi128_si32(s32));
        // Scalar tail (should be empty for width*height multiples of 16).
        while i < n {
            sum_i32 = sum_i32.wrapping_add(ac[i] as i32);
            i += 1;
        }
    }
    let mean = (sum_i32 >> log2sz) as i16;

    // Step 4: Subtract mean from every entry.
    {
        let mean_v = _mm256_set1_epi16(mean);
        let mut i = 0;
        while i + 16 <= n {
            let v = loadu_256!(<&[i16; 16]>::try_from(&ac[i..i + 16]).unwrap());
            let r = _mm256_sub_epi16(v, mean_v);
            storeu_256!(<&mut [i16; 16]>::try_from(&mut ac[i..i + 16]).unwrap(), r);
            i += 16;
        }
        // Scalar tail.
        while i < n {
            ac[i] = ac[i].wrapping_sub(mean);
            i += 1;
        }
    }
}

/// CFL AC 4:2:2 8bpc inner kernel — AVX2.
///
/// For 4:2:2: sum 1x2 luma pixels per chroma sample, shift left by 2.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_ac_422_8bpc_inner(
    _token: Desktop64,
    ac: &mut [i16],
    width: usize,
    height: usize,
    active_w: usize,
    active_h: usize,
    src_bytes: &[u8],
    src_base: usize,
    src_stride: isize,
) {
    let ones = _mm256_set1_epi8(1);

    for y in 0..active_h {
        let aci = y * width;
        let row_off = (src_base as isize + y as isize * src_stride) as usize;

        let mut x = 0;
        while x + 16 <= active_w {
            let lx = 2 * x;
            let r1 = loadu_256!(
                <&[u8; 32]>::try_from(&src_bytes[row_off + lx..row_off + lx + 32]).unwrap()
            );
            let s1 = _mm256_maddubs_epi16(r1, ones);
            // 1x2 horizontal sum, then << 2.
            let shifted = _mm256_slli_epi16::<2>(s1);
            storeu_256!(
                <&mut [i16; 16]>::try_from(&mut ac[aci + x..aci + x + 16]).unwrap(),
                shifted
            );
            x += 16;
        }
        while x + 8 <= active_w {
            let lx = 2 * x;
            let r1 = loadu_128!(
                <&[u8; 16]>::try_from(&src_bytes[row_off + lx..row_off + lx + 16]).unwrap()
            );
            let ones128 = _mm_set1_epi8(1);
            let s1 = _mm_maddubs_epi16(r1, ones128);
            let shifted = _mm_slli_epi16::<2>(s1);
            storeu_128!(
                <&mut [i16; 8]>::try_from(&mut ac[aci + x..aci + x + 8]).unwrap(),
                shifted
            );
            x += 8;
        }
        while x < active_w {
            let lx = 2 * x;
            let a = src_bytes[row_off + lx] as i32;
            let b = src_bytes[row_off + lx + 1] as i32;
            ac[aci + x] = ((a + b) << 2) as i16;
            x += 1;
        }

        if active_w < width {
            let pad = ac[aci + active_w - 1];
            for x in active_w..width {
                ac[aci + x] = pad;
            }
        }
    }

    // Bottom padding + mean computation/subtract — identical to 4:2:0 path.
    if active_h < height {
        let src_row_start = (active_h - 1) * width;
        for y in active_h..height {
            let dst_off = y * width;
            ac.copy_within(src_row_start..src_row_start + width, dst_off);
        }
    }

    let n = width * height;
    let log2sz = (width.trailing_zeros() + height.trailing_zeros()) as i32;
    let mut sum_i32 = 1i32 << log2sz >> 1;
    {
        let mut acc = _mm256_setzero_si256();
        let mut i = 0;
        while i + 16 <= n {
            let v = loadu_256!(<&[i16; 16]>::try_from(&ac[i..i + 16]).unwrap());
            let lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v));
            let hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v));
            acc = _mm256_add_epi32(acc, lo);
            acc = _mm256_add_epi32(acc, hi);
            i += 16;
        }
        let acc_lo = _mm256_castsi256_si128(acc);
        let acc_hi = _mm256_extracti128_si256::<1>(acc);
        let s128 = _mm_add_epi32(acc_lo, acc_hi);
        let s64 = _mm_add_epi32(s128, _mm_shuffle_epi32::<0b_01_00_11_10>(s128));
        let s32 = _mm_add_epi32(s64, _mm_shuffle_epi32::<0b_00_00_00_01>(s64));
        sum_i32 = sum_i32.wrapping_add(_mm_cvtsi128_si32(s32));
        while i < n {
            sum_i32 = sum_i32.wrapping_add(ac[i] as i32);
            i += 1;
        }
    }
    let mean = (sum_i32 >> log2sz) as i16;

    {
        let mean_v = _mm256_set1_epi16(mean);
        let mut i = 0;
        while i + 16 <= n {
            let v = loadu_256!(<&[i16; 16]>::try_from(&ac[i..i + 16]).unwrap());
            let r = _mm256_sub_epi16(v, mean_v);
            storeu_256!(<&mut [i16; 16]>::try_from(&mut ac[i..i + 16]).unwrap(), r);
            i += 16;
        }
        while i < n {
            ac[i] = ac[i].wrapping_sub(mean);
            i += 1;
        }
    }
}

/// CFL AC 4:4:4 8bpc inner kernel — AVX2.
///
/// For 4:4:4: identity per pixel, shift left by 3.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn cfl_ac_444_8bpc_inner(
    _token: Desktop64,
    ac: &mut [i16],
    width: usize,
    height: usize,
    active_w: usize,
    active_h: usize,
    src_bytes: &[u8],
    src_base: usize,
    src_stride: isize,
) {
    for y in 0..active_h {
        let aci = y * width;
        let row_off = (src_base as isize + y as isize * src_stride) as usize;

        let mut x = 0;
        while x + 16 <= active_w {
            let r1 = loadu_128!(
                <&[u8; 16]>::try_from(&src_bytes[row_off + x..row_off + x + 16]).unwrap()
            );
            // u8 -> i16 widen, then << 3.
            let widened = _mm256_cvtepu8_epi16(r1);
            let shifted = _mm256_slli_epi16::<3>(widened);
            storeu_256!(
                <&mut [i16; 16]>::try_from(&mut ac[aci + x..aci + x + 16]).unwrap(),
                shifted
            );
            x += 16;
        }
        while x + 8 <= active_w {
            // Load 8 u8, widen to 8 i16, shift.
            let arr: &[u8; 8] = (&src_bytes[row_off + x..row_off + x + 8])
                .try_into()
                .unwrap();
            // Use a stack-padded 16-byte load.
            let mut buf = [0u8; 16];
            buf[..8].copy_from_slice(arr);
            let r1 = loadu_128!(&buf);
            let widened = _mm_cvtepu8_epi16(r1);
            let shifted = _mm_slli_epi16::<3>(widened);
            storeu_128!(
                <&mut [i16; 8]>::try_from(&mut ac[aci + x..aci + x + 8]).unwrap(),
                shifted
            );
            x += 8;
        }
        while x < active_w {
            ac[aci + x] = (src_bytes[row_off + x] as i16) << 3;
            x += 1;
        }

        if active_w < width {
            let pad = ac[aci + active_w - 1];
            for x in active_w..width {
                ac[aci + x] = pad;
            }
        }
    }

    if active_h < height {
        let src_row_start = (active_h - 1) * width;
        for y in active_h..height {
            let dst_off = y * width;
            ac.copy_within(src_row_start..src_row_start + width, dst_off);
        }
    }

    let n = width * height;
    let log2sz = (width.trailing_zeros() + height.trailing_zeros()) as i32;
    let mut sum_i32 = 1i32 << log2sz >> 1;
    {
        let mut acc = _mm256_setzero_si256();
        let mut i = 0;
        while i + 16 <= n {
            let v = loadu_256!(<&[i16; 16]>::try_from(&ac[i..i + 16]).unwrap());
            let lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v));
            let hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v));
            acc = _mm256_add_epi32(acc, lo);
            acc = _mm256_add_epi32(acc, hi);
            i += 16;
        }
        let acc_lo = _mm256_castsi256_si128(acc);
        let acc_hi = _mm256_extracti128_si256::<1>(acc);
        let s128 = _mm_add_epi32(acc_lo, acc_hi);
        let s64 = _mm_add_epi32(s128, _mm_shuffle_epi32::<0b_01_00_11_10>(s128));
        let s32 = _mm_add_epi32(s64, _mm_shuffle_epi32::<0b_00_00_00_01>(s64));
        sum_i32 = sum_i32.wrapping_add(_mm_cvtsi128_si32(s32));
        while i < n {
            sum_i32 = sum_i32.wrapping_add(ac[i] as i32);
            i += 1;
        }
    }
    let mean = (sum_i32 >> log2sz) as i16;

    {
        let mean_v = _mm256_set1_epi16(mean);
        let mut i = 0;
        while i + 16 <= n {
            let v = loadu_256!(<&[i16; 16]>::try_from(&ac[i..i + 16]).unwrap());
            let r = _mm256_sub_epi16(v, mean_v);
            storeu_256!(<&mut [i16; 16]>::try_from(&mut ac[i..i + 16]).unwrap(), r);
            i += 16;
        }
        while i < n {
            ac[i] = ac[i].wrapping_sub(mean);
            i += 1;
        }
    }
}

/// Safe dispatch for CFL AC. Returns true if SIMD was used.
///
/// 8bpc only for now; 16bpc returns false (caller falls back to scalar).
///
/// Single-threaded fast path: reads the source luma via `narrow_guard` (one
/// DisjointMut entry, no heap alloc — same shape as the cfl_pred dispatcher).
/// Tile-threading path: copies source into a per-row compact buffer for
/// MT-safe disjoint reads, matching the scalar `cfl_ac_rust` pattern.
#[cfg(target_arch = "x86_64")]
pub fn cfl_ac_dispatch<BD: BitDepth>(
    ac: &mut [i16],
    y_src: PicOffset,
    w_pad: c_int,
    h_pad: c_int,
    width: usize,
    height: usize,
    is_ss_hor: bool,
    is_ss_ver: bool,
) -> bool {
    use crate::include::common::bitdepth::BPC;
    use crate::include::dav1d::picture::tile_threading_active;
    use crate::src::strided::Strided as _;
    use zerocopy::IntoBytes;

    // 16bpc not yet supported.
    if BD::BPC != BPC::BPC8 {
        return false;
    }

    let Some(token) = crate::src::cpu::summon_avx2() else {
        return false;
    };

    let w_pad = (w_pad as usize) * 4;
    let h_pad = (h_pad as usize) * 4;
    debug_assert!(w_pad < width);
    debug_assert!(h_pad < height);
    let active_w = width - w_pad;
    let active_h = height - h_pad;
    let ss_hor = is_ss_hor as usize;
    let ss_ver = is_ss_ver as usize;
    let src_w = active_w << ss_hor;
    let src_h = active_h << ss_ver;

    let ac_block = &mut ac[..width * height];

    if tile_threading_active() {
        // MT-safe path: per-row guards into a compact buffer.
        let (src_compact, src_stride) = y_src.compact_read_per_row::<BD>(src_w, src_h);
        let src_stride_i = src_stride as isize;
        if is_ss_hor && is_ss_ver {
            cfl_ac_420_8bpc_inner(
                token,
                ac_block,
                width,
                height,
                active_w,
                active_h,
                &src_compact,
                0,
                src_stride_i,
            );
        } else if is_ss_hor && !is_ss_ver {
            cfl_ac_422_8bpc_inner(
                token,
                ac_block,
                width,
                height,
                active_w,
                active_h,
                &src_compact,
                0,
                src_stride_i,
            );
        } else {
            cfl_ac_444_8bpc_inner(
                token,
                ac_block,
                width,
                height,
                active_w,
                active_h,
                &src_compact,
                0,
                src_stride_i,
            );
        }
    } else {
        // Single-threaded fast path: read directly from the picture buffer
        // via narrow_guard (no heap alloc, no memcpy).
        let (src_guard, src_base) = y_src.narrow_guard::<BD>(src_w, src_h);
        let src_bytes: &[u8] = src_guard.as_bytes();
        let src_stride_i = y_src.data.stride();
        if is_ss_hor && is_ss_ver {
            cfl_ac_420_8bpc_inner(
                token,
                ac_block,
                width,
                height,
                active_w,
                active_h,
                src_bytes,
                src_base,
                src_stride_i,
            );
        } else if is_ss_hor && !is_ss_ver {
            cfl_ac_422_8bpc_inner(
                token,
                ac_block,
                width,
                height,
                active_w,
                active_h,
                src_bytes,
                src_base,
                src_stride_i,
            );
        } else {
            cfl_ac_444_8bpc_inner(
                token,
                ac_block,
                width,
                height,
                active_w,
                active_h,
                src_bytes,
                src_base,
                src_stride_i,
            );
        }
    }
    true
}

// ============================================================================
// AVX-512ICL (v4x) directional predictor bit-exactness tests
// ============================================================================
//
// Compare the v4x (vpermb/vpermi2b) directional kernels against the AVX2
// reference kernel byte-for-byte. On a box that summons both Desktop64 and
// X64V4xToken (e.g. Zen 4) this runs the real kernels with real tokens -- no
// forge_token_dangerously, no unsafe. On a box without v4x the comparison is
// skipped (the v4x path is never taken there; AVX2 covers correctness, which
// decode_md5_verify checks end-to-end). The skip decision is visible here at
// the top of the test, not buried inside a kernel.
#[cfg(all(test, target_arch = "x86_64"))]
mod v4x_dir_tests {
    use super::*;

    // Topleft scratch buffer with deterministic pseudo-random samples. `tl_off`
    // sits well inside so both negative (left) and positive (top) reaches are
    // valid for any width/height up to 64.
    fn make_topleft() -> (Vec<u8>, usize) {
        let total = 512usize;
        let tl_off = 200usize;
        let mut buf = vec![0u8; total];
        let mut st: u32 = 0x1234_5678;
        for b in buf.iter_mut() {
            st ^= st << 13;
            st ^= st >> 17;
            st ^= st << 5;
            *b = (st >> 3) as u8;
        }
        (buf, tl_off)
    }

    fn run_z1(w: usize, h: usize, angle: i32) -> (Vec<u8>, Vec<u8>) {
        let (tl, tl_off) = make_topleft();
        let stride = 64isize;
        let mut dst_a = vec![7u8; 64 * 64];
        let mut dst_b = vec![7u8; 64 * 64];
        let t3 = crate::src::cpu::summon_avx2().expect("avx2");
        let t4x = crate::src::cpu::summon_avx512x().expect("v4x");
        ipred_z1_8bpc_inner(t3, &mut dst_a, 0, stride, &tl, tl_off, w, h, angle);
        ipred_z1_8bpc_v4x_inner(t4x, &mut dst_b, 0, stride, &tl, tl_off, w, h, angle);
        (dst_a, dst_b)
    }

    fn run_z3(w: usize, h: usize, angle: i32) -> (Vec<u8>, Vec<u8>) {
        let (tl, tl_off) = make_topleft();
        let stride = 64isize;
        let mut dst_a = vec![7u8; 64 * 64];
        let mut dst_b = vec![7u8; 64 * 64];
        let t3 = crate::src::cpu::summon_avx2().expect("avx2");
        let t4x = crate::src::cpu::summon_avx512x().expect("v4x");
        ipred_z3_8bpc_inner(t3, &mut dst_a, 0, stride, &tl, tl_off, w, h, angle);
        ipred_z3_8bpc_v4x_inner(t4x, &mut dst_b, 0, stride, &tl, tl_off, w, h, angle);
        (dst_a, dst_b)
    }

    #[allow(clippy::too_many_arguments)]
    fn run_z2(w: usize, h: usize, angle: i32, mw: i32, mh: i32) -> (Vec<u8>, Vec<u8>) {
        let (tl, tl_off) = make_topleft();
        let stride = 64isize;
        let mut dst_a = vec![7u8; 64 * 64];
        let mut dst_b = vec![7u8; 64 * 64];
        let t3 = crate::src::cpu::summon_avx2().expect("avx2");
        let t4x = crate::src::cpu::summon_avx512x().expect("v4x");
        ipred_z2_8bpc_inner(t3, &mut dst_a, 0, stride, &tl, tl_off, w, h, angle, mw, mh);
        ipred_z2_8bpc_v4x_inner(t4x, &mut dst_b, 0, stride, &tl, tl_off, w, h, angle, mw, mh);
        (dst_a, dst_b)
    }

    fn assert_block_eq(a: &[u8], b: &[u8], w: usize, h: usize, stride: usize, label: &str) {
        for y in 0..h {
            for x in 0..w {
                let off = y * stride + x;
                assert_eq!(
                    a[off], b[off],
                    "{label}: mismatch at ({x},{y}) avx2={} v4x={}",
                    a[off], b[off]
                );
            }
        }
    }

    #[test]
    fn z1_v4x_matches_avx2() {
        if crate::src::cpu::summon_avx512x().is_none() {
            eprintln!("z1_v4x_matches_avx2: X64V4xToken unavailable, skipping (AVX2 path used)");
            return;
        }
        // Representative z1 base angles (< 90, non-zero derivative). Bit 1<<10
        // enables the intra-edge filter; 1<<9 selects smooth.
        let base_angles = [3, 6, 14, 22, 30, 36, 44, 52, 60, 66, 74, 82, 86];
        let flag_sets = [0i32, 1 << 10, (1 << 10) | (1 << 9)];
        let dims = [
            (4, 4),
            (4, 8),
            (8, 4),
            (8, 8),
            (8, 16),
            (16, 8),
            (16, 16),
            (16, 32),
            (32, 16),
            (32, 32),
            (32, 64),
            (64, 32),
            (64, 64),
            (4, 16),
            (16, 4),
        ];
        for &(w, h) in &dims {
            for &ba in &base_angles {
                for &fl in &flag_sets {
                    let angle = ba | fl;
                    let (a, b) = run_z1(w, h, angle);
                    assert_block_eq(&a, &b, w, h, 64, &format!("z1 w={w} h={h} angle={angle}"));
                }
            }
        }
    }

    #[test]
    fn z3_v4x_matches_avx2() {
        if crate::src::cpu::summon_avx512x().is_none() {
            eprintln!("z3_v4x_matches_avx2: X64V4xToken unavailable, skipping (AVX2 path used)");
            return;
        }
        // z3 base angles are 180..270 in encoding; the derivative index is
        // (270 - angle) >> 1, so sweep angles whose index hits non-zero entries.
        let base_angles = [184, 190, 198, 206, 214, 222, 230, 238, 246, 254, 262, 266];
        let flag_sets = [0i32, 1 << 10, (1 << 10) | (1 << 9)];
        let dims = [
            (4, 4),
            (4, 8),
            (8, 4),
            (8, 8),
            (8, 16),
            (16, 8),
            (16, 16),
            (16, 32),
            (32, 16),
            (32, 32),
            (32, 64),
            (64, 32),
            (64, 64),
            (4, 16),
            (16, 4),
        ];
        for &(w, h) in &dims {
            for &ba in &base_angles {
                for &fl in &flag_sets {
                    let angle = ba | fl;
                    let (a, b) = run_z3(w, h, angle);
                    assert_block_eq(&a, &b, w, h, 64, &format!("z3 w={w} h={h} angle={angle}"));
                }
            }
        }
    }

    #[test]
    fn z2_v4x_matches_avx2() {
        if crate::src::cpu::summon_avx512x().is_none() {
            eprintln!("z2_v4x_matches_avx2: X64V4xToken unavailable, skipping (AVX2 path used)");
            return;
        }
        // z2 angles span 90..180. Index ((angle-90)>>1) and ((180-angle)>>1)
        // must hit valid derivative entries.
        let base_angles = [94, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 176];
        let flag_sets = [0i32, 1 << 10, (1 << 10) | (1 << 9)];
        let dims = [
            (4, 4),
            (4, 8),
            (8, 4),
            (8, 8),
            (8, 16),
            (16, 8),
            (16, 16),
            (16, 32),
            (32, 16),
            (32, 32),
            (32, 64),
            (64, 32),
            (64, 64),
            (4, 16),
            (16, 4),
        ];
        let mut compared = 0usize;
        for &(w, h) in &dims {
            for &ba in &base_angles {
                for &fl in &flag_sets {
                    let angle = ba | fl;
                    // Full block (max_width == width, max_height == height),
                    // matching the common decoder path. Some synthetic
                    // angle/dim combos exceed the edge-buffer reach that the
                    // real decoder guarantees and panic the *reference* kernel;
                    // those are not valid decoder inputs, so skip any config
                    // where the reference itself faults and only assert
                    // bit-equality on configs the reference accepts.
                    let (mw, mh) = (w as i32, h as i32);
                    let ref_ok = std::panic::catch_unwind(|| {
                        let (tl, tl_off) = make_topleft();
                        let mut d = vec![7u8; 64 * 64];
                        let t3 = crate::src::cpu::summon_avx2().expect("avx2");
                        ipred_z2_8bpc_inner(t3, &mut d, 0, 64, &tl, tl_off, w, h, angle, mw, mh);
                    })
                    .is_ok();
                    if !ref_ok {
                        continue;
                    }
                    let (a, b) = run_z2(w, h, angle, mw, mh);
                    assert_block_eq(
                        &a,
                        &b,
                        w,
                        h,
                        64,
                        &format!("z2 w={w} h={h} angle={angle} mw={mw} mh={mh}"),
                    );
                    compared += 1;
                }
            }
        }
        eprintln!("z2_v4x compared {compared} configs");
        assert!(
            compared >= 100,
            "z2 test compared too few configs: {compared}"
        );
    }
}