yscv-video 0.1.7

//! # H.265/HEVC Video Decoder
//!
//! Pure Rust implementation of the H.265/HEVC Main profile decoder.
//!
//! ## Architecture
//!
//! Implemented across 5 files (~6600 lines total):
//!
//! | File | Responsibility |
//! |------|---------------|
//! | `hevc_decoder.rs` | VPS/SPS/PPS parsing, CTU quad-tree, intra prediction (DC, planar, angular with fractional interpolation), inverse transforms (DST-4x4, DCT 4/8/16/32), dequantisation, top-level `HevcDecoder` with CABAC path |
//! | `hevc_cabac.rs` | Full CABAC arithmetic engine, context models, state transitions, binarization (TR, FL, unary, EGk) |
//! | `hevc_syntax.rs` | CU/PU/TU syntax parsing via CABAC: split_cu_flag, pred_mode, intra modes (MPM list), transform coeff parsing, residual decoding, reference sample construction |
//! | `hevc_inter.rs` | DPB, motion compensation, merge candidates, AMVP, MVD parsing, bi-prediction framework |
//! | `hevc_filter.rs` | Deblocking filter (boundary strength, tc/beta tables, luma/chroma edge filtering), SAO, chroma reconstruction, YCbCr-to-RGB conversion |
//!
//! ## Supported features
//! - I-slices (intra prediction: DC, planar, all 33 angular modes)
//! - CABAC entropy coding (full arithmetic engine with context adaptation)
//! - CTU quad-tree partitioning (up to 64x64 CTU size)
//! - Inverse transforms (DST-4x4, DCT 4x4/8x8/16x16/32x32)
//! - Deblocking filter with boundary strength calculation
//! - Sample Adaptive Offset (SAO) filtering framework
//! - VPS/SPS/PPS parameter set parsing
//! - YCbCr 4:2:0 to RGB8 conversion
//!
//! - P/B slice inter prediction with DPB reference frames
//! - Motion compensation (8-tap luma, merge/AMVP modes)
//! - Bi-prediction for B-slices
//!
//! ## Limitations
//! - 10/12-bit profiles not yet supported (Main 8-bit only)
//! - Weighted prediction not yet applied
//! - Tiles and WPP not supported
//! - No weighted prediction
//! - No Main 10 / Main 12 bit depth profiles
//! - No 4:2:2 or 4:4:4 chroma formats
//! - No WPP (Wavefront Parallel Processing)
//! - No dependent slice segments
//! - No tiles
//!
//! ## Error handling
//! Malformed bitstreams return `VideoError` instead of panicking.
//! However, this decoder has not been fuzz-tested and may not handle
//! all adversarial inputs gracefully. For production video pipelines
//! with untrusted input, consider FFI to libavcodec.
//!
//! ## End-to-end pipeline
//! NAL -> CABAC -> CU parse -> intra/inter pred -> residual
//! -> reconstruct -> deblock -> SAO -> chroma -> RGB output.

use super::h264_bitstream::BitstreamReader;
use crate::VideoError;

// ---------------------------------------------------------------------------
// HEVC NAL Unit Types
// ---------------------------------------------------------------------------

/// HEVC NAL unit type enumeration (ITU-T H.265, Table 7-1).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HevcNalUnitType {
    TrailN,
    TrailR,
    TsaN,
    TsaR,
    StsaN,
    StsaR,
    RadlN,
    RadlR,
    RaslN,
    RaslR,
    BlaWLp,
    BlaWRadl,
    BlaNLp,
    IdrWRadl,
    IdrNLp,
    CraNut,
    VpsNut,
    SpsNut,
    PpsNut,
    AudNut,
    EosNut,
    EobNut,
    FdNut,
    PrefixSeiNut,
    SuffixSeiNut,
    Other(u8),
}

impl HevcNalUnitType {
    pub fn from_header(header: &[u8]) -> Self {
        if header.is_empty() {
            return Self::Other(0);
        }
        Self::from_type_byte((header[0] >> 1) & 0x3F)
    }
    fn from_type_byte(t: u8) -> Self {
        match t {
            0 => Self::TrailN,
            1 => Self::TrailR,
            2 => Self::TsaN,
            3 => Self::TsaR,
            4 => Self::StsaN,
            5 => Self::StsaR,
            6 => Self::RadlN,
            7 => Self::RadlR,
            8 => Self::RaslN,
            9 => Self::RaslR,
            16 => Self::BlaWLp,
            17 => Self::BlaWRadl,
            18 => Self::BlaNLp,
            19 => Self::IdrWRadl,
            20 => Self::IdrNLp,
            21 => Self::CraNut,
            32 => Self::VpsNut,
            33 => Self::SpsNut,
            34 => Self::PpsNut,
            35 => Self::AudNut,
            36 => Self::EosNut,
            37 => Self::EobNut,
            38 => Self::FdNut,
            39 => Self::PrefixSeiNut,
            40 => Self::SuffixSeiNut,
            other => Self::Other(other),
        }
    }
    pub fn is_vcl(&self) -> bool {
        matches!(
            self,
            Self::TrailN
                | Self::TrailR
                | Self::TsaN
                | Self::TsaR
                | Self::StsaN
                | Self::StsaR
                | Self::RadlN
                | Self::RadlR
                | Self::RaslN
                | Self::RaslR
                | Self::BlaWLp
                | Self::BlaWRadl
                | Self::BlaNLp
                | Self::IdrWRadl
                | Self::IdrNLp
                | Self::CraNut
        )
    }
    pub fn is_idr(&self) -> bool {
        matches!(self, Self::IdrWRadl | Self::IdrNLp)
    }
}

// ---------------------------------------------------------------------------
// Video Parameter Set (VPS)
// ---------------------------------------------------------------------------

/// HEVC Video Parameter Set.
#[derive(Debug, Clone)]
pub struct HevcVps {
    pub vps_id: u8,
    pub max_layers: u8,
    pub max_sub_layers: u8,
    pub temporal_id_nesting: bool,
}

// ---------------------------------------------------------------------------
// Sequence Parameter Set (SPS)
// ---------------------------------------------------------------------------

/// HEVC Sequence Parameter Set.
#[derive(Debug, Clone)]
pub struct HevcSps {
    pub sps_id: u8,
    pub vps_id: u8,
    pub max_sub_layers: u8,
    pub chroma_format_idc: u8, // 0=mono, 1=4:2:0, 2=4:2:2, 3=4:4:4
    pub pic_width: u32,
    pub pic_height: u32,
    pub bit_depth_luma: u8,
    pub bit_depth_chroma: u8,
    pub log2_max_pic_order_cnt: u8,
    pub log2_min_cb_size: u8,
    pub log2_diff_max_min_cb_size: u8,
    pub log2_min_transform_size: u8,
    pub log2_diff_max_min_transform_size: u8,
    pub max_transform_hierarchy_depth_inter: u8,
    pub max_transform_hierarchy_depth_intra: u8,
    pub sample_adaptive_offset_enabled: bool,
    pub pcm_enabled: bool,
    pub num_short_term_ref_pic_sets: u8,
    pub long_term_ref_pics_present: bool,
    pub sps_temporal_mvp_enabled: bool,
    pub strong_intra_smoothing_enabled: bool,
}

// ---------------------------------------------------------------------------
// Picture Parameter Set (PPS)
// ---------------------------------------------------------------------------

/// HEVC Picture Parameter Set.
#[derive(Debug, Clone)]
pub struct HevcPps {
    pub pps_id: u8,
    pub sps_id: u8,
    pub dependent_slice_segments_enabled: bool,
    pub output_flag_present: bool,
    pub num_extra_slice_header_bits: u8,
    pub sign_data_hiding_enabled: bool,
    pub cabac_init_present: bool,
    pub num_ref_idx_l0_default: u8,
    pub num_ref_idx_l1_default: u8,
    pub init_qp: i8,
    pub constrained_intra_pred: bool,
    pub transform_skip_enabled: bool,
    pub cu_qp_delta_enabled: bool,
    pub cb_qp_offset: i8,
    pub cr_qp_offset: i8,
    pub deblocking_filter_override_enabled: bool,
    pub deblocking_filter_disabled: bool,
    pub loop_filter_across_slices_enabled: bool,
    pub tiles_enabled: bool,
    pub entropy_coding_sync_enabled: bool,
}

// ---------------------------------------------------------------------------
// Slice header & slice types
// ---------------------------------------------------------------------------

/// HEVC Slice Header (simplified).
#[derive(Debug, Clone)]
pub struct HevcSliceHeader {
    pub first_slice_in_pic: bool,
    pub slice_type: HevcSliceType,
    pub pps_id: u8,
    pub slice_qp_delta: i8,
    pub weight_table: Option<super::hevc_params::HevcWeightTable>,
}

/// HEVC slice types.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HevcSliceType {
    B = 0,
    P = 1,
    I = 2,
}

// ---------------------------------------------------------------------------
// Coding Tree Unit (CTU)
// ---------------------------------------------------------------------------

/// Coding Tree Unit — the basic processing unit in HEVC (replaces H.264 macroblock).
#[derive(Debug, Clone)]
pub struct CodingTreeUnit {
    pub x: usize,
    pub y: usize,
    pub size: usize, // CTU size (typically 64)
    pub qp: i8,
}

// ---------------------------------------------------------------------------
// VPS parsing
// ---------------------------------------------------------------------------

/// Parse HEVC VPS from NAL unit payload (after the 2-byte NAL header).
pub fn parse_hevc_vps(data: &[u8]) -> Result<HevcVps, VideoError> {
    let rbsp = super::h264_params::remove_emulation_prevention(data);
    let mut reader = BitstreamReader::new(&rbsp);
    let vps_id = reader.read_bits(4)? as u8;
    reader.read_bits(2)?; // vps_base_layer_internal_flag + vps_base_layer_available_flag
    let max_layers = reader.read_bits(6)? as u8 + 1;
    let max_sub_layers = reader.read_bits(3)? as u8 + 1;
    let temporal_id_nesting = reader.read_bit()? != 0;
    Ok(HevcVps {
        vps_id,
        max_layers,
        max_sub_layers,
        temporal_id_nesting,
    })
}

// ---------------------------------------------------------------------------
// Profile-tier-level skipping
// ---------------------------------------------------------------------------

/// Skip profile_tier_level syntax element.
fn skip_profile_tier_level(
    reader: &mut BitstreamReader,
    max_sub_layers: u8,
) -> Result<(), VideoError> {
    // general_profile_space(2) + general_tier_flag(1) + general_profile_idc(5) = 8 bits
    reader.read_bits(8)?;
    // general_profile_compatibility_flags (32 bits)
    reader.read_bits(16)?;
    reader.read_bits(16)?;
    // general_constraint_indicator_flags (48 bits)
    reader.read_bits(16)?;
    reader.read_bits(16)?;
    reader.read_bits(16)?;
    // general_level_idc (8 bits)
    reader.read_bits(8)?;
    // sub_layer flags (if max_sub_layers > 1)
    for _ in 1..max_sub_layers {
        reader.read_bits(2)?; // sub_layer_profile_present + sub_layer_level_present
    }
    if max_sub_layers > 1 {
        for _ in max_sub_layers..8 {
            reader.read_bits(2)?; // reserved zero 2 bits
        }
    }
    Ok(())
}

// ---------------------------------------------------------------------------
// SPS parsing
// ---------------------------------------------------------------------------

/// Parse HEVC SPS from NAL unit payload (after the 2-byte NAL header).
pub fn parse_hevc_sps(data: &[u8]) -> Result<HevcSps, VideoError> {
    let rbsp = super::h264_params::remove_emulation_prevention(data);
    let mut reader = BitstreamReader::new(&rbsp);
    let vps_id = reader.read_bits(4)? as u8;
    let max_sub_layers = reader.read_bits(3)? as u8 + 1;
    let _temporal_id_nesting = reader.read_bit()?;

    // Skip profile_tier_level (simplified — fixed-length approximation)
    skip_profile_tier_level(&mut reader, max_sub_layers)?;

    let sps_id = reader.read_ue()? as u8;
    let chroma_format_idc = reader.read_ue()? as u8;
    if chroma_format_idc == 3 {
        reader.read_bit()?; // separate_colour_plane_flag
    }
    let pic_width = reader.read_ue()?;
    let pic_height = reader.read_ue()?;

    let conformance_window = reader.read_bit()? != 0;
    if conformance_window {
        reader.read_ue()?; // conf_win_left_offset
        reader.read_ue()?; // conf_win_right_offset
        reader.read_ue()?; // conf_win_top_offset
        reader.read_ue()?; // conf_win_bottom_offset
    }

    let bit_depth_luma = reader.read_ue()? as u8 + 8;
    let bit_depth_chroma = reader.read_ue()? as u8 + 8;
    let log2_max_pic_order_cnt = reader.read_ue()? as u8 + 4;

    // sub_layer_ordering_info_present_flag
    let sub_layer_ordering_info_present = reader.read_bit()? != 0;
    let start = if sub_layer_ordering_info_present {
        0
    } else {
        max_sub_layers - 1
    };
    for _ in start..max_sub_layers {
        reader.read_ue()?; // max_dec_pic_buffering_minus1
        reader.read_ue()?; // max_num_reorder_pics
        reader.read_ue()?; // max_latency_increase_plus1
    }

    let log2_min_cb_size = reader.read_ue()? as u8 + 3;
    let log2_diff_max_min_cb_size = reader.read_ue()? as u8;
    let log2_min_transform_size = reader.read_ue()? as u8 + 2;
    let log2_diff_max_min_transform_size = reader.read_ue()? as u8;
    let max_transform_hierarchy_depth_inter = reader.read_ue()? as u8;
    let max_transform_hierarchy_depth_intra = reader.read_ue()? as u8;

    // scaling_list_enabled_flag
    let scaling_list_enabled = reader.read_bit()? != 0;
    if scaling_list_enabled {
        let scaling_list_data_present = reader.read_bit()? != 0;
        if scaling_list_data_present {
            skip_scaling_list_data(&mut reader)?;
        }
    }

    // amp_enabled_flag, sample_adaptive_offset_enabled_flag
    let _amp_enabled = reader.read_bit()?;
    let sample_adaptive_offset_enabled = reader.read_bit()? != 0;

    // pcm_enabled_flag
    let pcm_enabled = reader.read_bit()? != 0;
    if pcm_enabled {
        // pcm_sample_bit_depth_luma_minus1 (4) + pcm_sample_bit_depth_chroma_minus1 (4)
        reader.read_bits(4)?;
        reader.read_bits(4)?;
        reader.read_ue()?; // log2_min_pcm_luma_coding_block_size_minus3
        reader.read_ue()?; // log2_diff_max_min_pcm_luma_coding_block_size
        reader.read_bit()?; // pcm_loop_filter_disabled_flag
    }

    let num_short_term_ref_pic_sets = reader.read_ue()? as u8;
    // Skip actual short-term ref pic set parsing (complex; fill defaults below)

    // For remaining flags that require parsing the ref pic sets first,
    // use conservative defaults.
    Ok(HevcSps {
        sps_id,
        vps_id,
        max_sub_layers,
        chroma_format_idc,
        pic_width,
        pic_height,
        bit_depth_luma,
        bit_depth_chroma,
        log2_max_pic_order_cnt,
        log2_min_cb_size,
        log2_diff_max_min_cb_size,
        log2_min_transform_size,
        log2_diff_max_min_transform_size,
        max_transform_hierarchy_depth_inter,
        max_transform_hierarchy_depth_intra,
        sample_adaptive_offset_enabled,
        pcm_enabled,
        num_short_term_ref_pic_sets,
        long_term_ref_pics_present: false,
        sps_temporal_mvp_enabled: false,
        strong_intra_smoothing_enabled: false,
    })
}

// ---------------------------------------------------------------------------
// PPS parsing
// ---------------------------------------------------------------------------

/// Parse HEVC PPS from NAL unit payload (after the 2-byte NAL header).
pub fn parse_hevc_pps(data: &[u8]) -> Result<HevcPps, VideoError> {
    let rbsp = super::h264_params::remove_emulation_prevention(data);
    let mut reader = BitstreamReader::new(&rbsp);

    let pps_id = reader.read_ue()? as u8;
    let sps_id = reader.read_ue()? as u8;
    let dependent_slice_segments_enabled = reader.read_bit()? != 0;
    let output_flag_present = reader.read_bit()? != 0;
    let num_extra_slice_header_bits = reader.read_bits(3)? as u8;
    let sign_data_hiding_enabled = reader.read_bit()? != 0;
    let cabac_init_present = reader.read_bit()? != 0;
    let num_ref_idx_l0_default = reader.read_ue()? as u8 + 1;
    let num_ref_idx_l1_default = reader.read_ue()? as u8 + 1;
    let init_qp_minus26 = reader.read_se()?;
    let init_qp = (26 + init_qp_minus26) as i8;
    let constrained_intra_pred = reader.read_bit()? != 0;
    let transform_skip_enabled = reader.read_bit()? != 0;
    let cu_qp_delta_enabled = reader.read_bit()? != 0;
    if cu_qp_delta_enabled {
        reader.read_ue()?; // diff_cu_qp_delta_depth
    }
    let cb_qp_offset = reader.read_se()? as i8;
    let cr_qp_offset = reader.read_se()? as i8;
    let _slice_chroma_qp_offsets_present = reader.read_bit()?;
    let _weighted_pred = reader.read_bit()?;
    let _weighted_bipred = reader.read_bit()?;
    let _transquant_bypass_enabled = reader.read_bit()?;
    let tiles_enabled = reader.read_bit()? != 0;
    let entropy_coding_sync_enabled = reader.read_bit()? != 0;

    if tiles_enabled {
        let num_tile_columns = reader.read_ue()? + 1;
        let num_tile_rows = reader.read_ue()? + 1;
        let uniform_spacing = reader.read_bit()? != 0;
        if !uniform_spacing {
            for _ in 0..num_tile_columns - 1 {
                reader.read_ue()?;
            }
            for _ in 0..num_tile_rows - 1 {
                reader.read_ue()?;
            }
        }
        if tiles_enabled || entropy_coding_sync_enabled {
            reader.read_bit()?; // loop_filter_across_tiles_enabled_flag
        }
    }

    let loop_filter_across_slices_enabled = reader.read_bit()? != 0;
    let deblocking_filter_control_present = reader.read_bit()? != 0;
    let mut deblocking_filter_override_enabled = false;
    let mut deblocking_filter_disabled = false;
    if deblocking_filter_control_present {
        deblocking_filter_override_enabled = reader.read_bit()? != 0;
        deblocking_filter_disabled = reader.read_bit()? != 0;
        if !deblocking_filter_disabled {
            reader.read_se()?; // pps_beta_offset_div2
            reader.read_se()?; // pps_tc_offset_div2
        }
    }

    Ok(HevcPps {
        pps_id,
        sps_id,
        dependent_slice_segments_enabled,
        output_flag_present,
        num_extra_slice_header_bits,
        sign_data_hiding_enabled,
        cabac_init_present,
        num_ref_idx_l0_default,
        num_ref_idx_l1_default,
        init_qp,
        constrained_intra_pred,
        transform_skip_enabled,
        cu_qp_delta_enabled,
        cb_qp_offset,
        cr_qp_offset,
        deblocking_filter_override_enabled,
        deblocking_filter_disabled,
        loop_filter_across_slices_enabled,
        tiles_enabled,
        entropy_coding_sync_enabled,
    })
}

// ---------------------------------------------------------------------------
// Scaling list data parsing (§7.3.4)
// ---------------------------------------------------------------------------

/// Parse and discard scaling_list_data() per HEVC spec §7.3.4.
/// Advances the bitstream position correctly without storing values.
fn skip_scaling_list_data(reader: &mut BitstreamReader) -> Result<(), VideoError> {
    for size_id in 0..4u8 {
        let matrix_count: u8 = if size_id == 3 { 2 } else { 6 };
        let matrix_step: u8 = if size_id == 3 { 3 } else { 1 };
        for matrix_idx in 0..matrix_count {
            let _matrix_id = matrix_idx * matrix_step;
            let pred_mode_flag = reader.read_bit()?;
            if pred_mode_flag == 0 {
                // scaling_list_pred_matrix_id_delta
                reader.read_ue()?;
            } else {
                let coef_num = std::cmp::min(64, 1u32 << (4 + (u32::from(size_id) << 1)));
                if size_id > 1 {
                    // scaling_list_dc_coef_minus8
                    reader.read_se()?;
                }
                for _ in 0..coef_num {
                    // scaling_list_delta_coef
                    reader.read_se()?;
                }
            }
        }
    }
    Ok(())
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Extract frame dimensions from HEVC SPS.
pub fn hevc_frame_dimensions(sps: &HevcSps) -> (u32, u32) {
    (sps.pic_width, sps.pic_height)
}

// ---------------------------------------------------------------------------
// Intra prediction modes
// ---------------------------------------------------------------------------

/// HEVC intra prediction mode index.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum HevcIntraMode {
    Planar = 0,
    Dc = 1,
    Angular2 = 2,
    Angular3 = 3,
    Angular4 = 4,
    Angular5 = 5,
    Angular6 = 6,
    Angular7 = 7,
    Angular8 = 8,
    Angular9 = 9,
    Angular10 = 10,
    Angular11 = 11,
    Angular12 = 12,
    Angular13 = 13,
    Angular14 = 14,
    Angular15 = 15,
    Angular16 = 16,
    Angular17 = 17,
    Angular18 = 18,
    Angular19 = 19,
    Angular20 = 20,
    Angular21 = 21,
    Angular22 = 22,
    Angular23 = 23,
    Angular24 = 24,
    Angular25 = 25,
    Angular26 = 26,
    Angular27 = 27,
    Angular28 = 28,
    Angular29 = 29,
    Angular30 = 30,
    Angular31 = 31,
    Angular32 = 32,
    Angular33 = 33,
    Angular34 = 34,
}

impl HevcIntraMode {
    /// Convert from a raw mode index (0..=34).
    pub fn from_index(idx: u8) -> Option<Self> {
        match idx {
            0 => Some(Self::Planar),
            1 => Some(Self::Dc),
            2 => Some(Self::Angular2),
            3 => Some(Self::Angular3),
            4 => Some(Self::Angular4),
            5 => Some(Self::Angular5),
            6 => Some(Self::Angular6),
            7 => Some(Self::Angular7),
            8 => Some(Self::Angular8),
            9 => Some(Self::Angular9),
            10 => Some(Self::Angular10),
            11 => Some(Self::Angular11),
            12 => Some(Self::Angular12),
            13 => Some(Self::Angular13),
            14 => Some(Self::Angular14),
            15 => Some(Self::Angular15),
            16 => Some(Self::Angular16),
            17 => Some(Self::Angular17),
            18 => Some(Self::Angular18),
            19 => Some(Self::Angular19),
            20 => Some(Self::Angular20),
            21 => Some(Self::Angular21),
            22 => Some(Self::Angular22),
            23 => Some(Self::Angular23),
            24 => Some(Self::Angular24),
            25 => Some(Self::Angular25),
            26 => Some(Self::Angular26),
            27 => Some(Self::Angular27),
            28 => Some(Self::Angular28),
            29 => Some(Self::Angular29),
            30 => Some(Self::Angular30),
            31 => Some(Self::Angular31),
            32 => Some(Self::Angular32),
            33 => Some(Self::Angular33),
            34 => Some(Self::Angular34),
            _ => None,
        }
    }
}

/// DC intra prediction: fills block with average of top and left neighbours.
/// NEON/SSE2 accelerated for the fill operation.
#[allow(unsafe_code)]
pub fn intra_predict_dc(top: &[i16], left: &[i16], block_size: usize, out: &mut [i16]) {
    debug_assert!(top.len() >= block_size);
    debug_assert!(left.len() >= block_size);
    debug_assert!(out.len() >= block_size * block_size);
    let sum: i32 = top[..block_size].iter().map(|&v| v as i32).sum::<i32>()
        + left[..block_size].iter().map(|&v| v as i32).sum::<i32>();
    let dc = ((sum + block_size as i32) / (2 * block_size as i32)) as i16;
    let n = block_size * block_size;

    #[cfg(target_arch = "aarch64")]
    {
        let mut i = 0;
        unsafe {
            use std::arch::aarch64::*;
            let dc_vec = vdupq_n_s16(dc);
            while i + 8 <= n {
                vst1q_s16(out.as_mut_ptr().add(i), dc_vec);
                i += 8;
            }
        }
        while i < n {
            out[i] = dc;
            i += 1;
        }
    }

    #[cfg(target_arch = "x86_64")]
    {
        let mut i = 0;
        unsafe {
            use std::arch::x86_64::*;
            let dc_vec = _mm_set1_epi16(dc);
            while i + 8 <= n {
                _mm_storeu_si128(out.as_mut_ptr().add(i) as *mut __m128i, dc_vec);
                i += 8;
            }
        }
        while i < n {
            out[i] = dc;
            i += 1;
        }
    }

    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
    {
        for v in out[..n].iter_mut() {
            *v = dc;
        }
    }
}

/// Planar intra prediction (HEVC mode 0).
pub fn intra_predict_planar(
    top: &[i16],
    left: &[i16],
    top_right: i16,
    bottom_left: i16,
    block_size: usize,
    out: &mut [i16],
) {
    debug_assert!(top.len() >= block_size);
    debug_assert!(left.len() >= block_size);
    debug_assert!(out.len() >= block_size * block_size);
    let n = block_size as i32;
    let shift = (block_size as u32).trailing_zeros() + 1;
    for y in 0..block_size {
        for x in 0..block_size {
            let h = (n - 1 - x as i32) * left[y] as i32 + (x as i32 + 1) * top_right as i32;
            let v = (n - 1 - y as i32) * top[x] as i32 + (y as i32 + 1) * bottom_left as i32;
            out[y * block_size + x] = ((h + v + n) >> shift) as i16;
        }
    }
}

/// Simple angular intra prediction placeholder (modes 2..=34).
/// Uses horizontal or vertical extrapolation depending on mode direction.
/// HEVC angular intra prediction with fractional-sample interpolation.
///
/// ITU-T H.265, section 8.4.4.2.6. Modes 2-34 project through reference
/// samples at angles specified by the `INTRA_PRED_ANGLE` table.
/// Fractional positions use 32-phase linear interpolation.
pub fn intra_predict_angular(
    top: &[i16],
    left: &[i16],
    mode: u8,
    block_size: usize,
    out: &mut [i16],
) {
    debug_assert!((2..=34).contains(&mode));
    debug_assert!(top.len() >= block_size);
    debug_assert!(left.len() >= block_size);
    debug_assert!(out.len() >= block_size * block_size);

    // ITU-T H.265, Table 8-4: intraPredAngle for modes 2..34
    #[rustfmt::skip]
    const INTRA_PRED_ANGLE: [i32; 33] = [
        // modes 2..34 (index 0 = mode 2)
        32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26,
        -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32,
    ];

    let angle = INTRA_PRED_ANGLE[(mode - 2) as usize];
    let is_vertical = mode >= 18; // modes 18-34 are vertical-dominant

    // Build extended reference array from top or left samples
    let n = block_size;
    // Stack buffer instead of Vec (max block_size=64, so 2*64+1=129 entries)
    let mut ref_buf = [128i16; 129];
    let ref_samples = &mut ref_buf[..2 * n + 1];

    if is_vertical {
        // Main reference = top row, side reference = left column
        ref_samples[0] = left[0]; // corner
        ref_samples[1..(n.min(top.len()) + 1)].copy_from_slice(&top[..n.min(top.len())]);
        // Extend with projected left samples for negative angles
        if angle < 0 {
            let inv_angle = ((256 * 32) as f32 / (-angle) as f32).round() as i32;
            let num_ext = (n as i32 * angle) >> 5;
            for k in num_ext..0 {
                let ref_idx = ((-k * inv_angle + 128) >> 8) as usize;
                let dst = k as isize;
                if dst >= -(n as isize) && ref_idx < left.len() {
                    ref_samples[(dst + n as isize) as usize] = left[ref_idx];
                }
            }
        }
        // Project each output sample through the angle
        for y in 0..n {
            let delta = (y as i32 + 1) * angle;
            let idx_offset = delta >> 5;
            let frac = (delta & 31) as i16;
            for x in 0..n {
                let ref_idx = (x as i32 + idx_offset + 1) as usize;
                if frac == 0 {
                    out[y * n + x] = ref_samples.get(ref_idx).copied().unwrap_or(128);
                } else {
                    // 32-phase linear interpolation
                    let a = ref_samples.get(ref_idx).copied().unwrap_or(128) as i32;
                    let b = ref_samples
                        .get(ref_idx.wrapping_add(1))
                        .copied()
                        .unwrap_or(128) as i32;
                    out[y * n + x] = ((32 - frac as i32) * a + frac as i32 * b + 16) as i16 >> 5;
                }
            }
        }
    } else {
        // Horizontal-dominant (modes 2-17): main reference = left column
        ref_samples[0] = top[0]; // corner
        ref_samples[1..(n.min(left.len()) + 1)].copy_from_slice(&left[..n.min(left.len())]);
        if angle < 0 {
            let inv_angle = ((256 * 32) as f32 / (-angle) as f32).round() as i32;
            let num_ext = (n as i32 * angle) >> 5;
            for k in num_ext..0 {
                let ref_idx = ((-k * inv_angle + 128) >> 8) as usize;
                let dst = k as isize;
                if dst >= -(n as isize) && ref_idx < top.len() {
                    ref_samples[(dst + n as isize) as usize] = top[ref_idx];
                }
            }
        }
        // Project — transposed relative to vertical
        for x in 0..n {
            let delta = (x as i32 + 1) * angle;
            let idx_offset = delta >> 5;
            let frac = (delta & 31) as i16;
            for y in 0..n {
                let ref_idx = (y as i32 + idx_offset + 1) as usize;
                if frac == 0 {
                    out[y * n + x] = ref_samples.get(ref_idx).copied().unwrap_or(128);
                } else {
                    let a = ref_samples.get(ref_idx).copied().unwrap_or(128) as i32;
                    let b = ref_samples
                        .get(ref_idx.wrapping_add(1))
                        .copied()
                        .unwrap_or(128) as i32;
                    out[y * n + x] = ((32 - frac as i32) * a + frac as i32 * b + 16) as i16 >> 5;
                }
            }
        }
    }
}

// ---------------------------------------------------------------------------
// Transform / dequantisation
// ---------------------------------------------------------------------------

/// HEVC 4×4 DST-VII core matrix (for intra 4×4 luma TUs).
const DST4_MATRIX: [[i32; 4]; 4] = [
    [29, 55, 74, 84],
    [74, 74, 0, -74],
    [84, -29, -74, 55],
    [55, -84, 74, -29],
];

/// HEVC 4×4 DCT-II core matrix.
const DCT4_MATRIX: [[i32; 4]; 4] = [
    [64, 64, 64, 64],
    [83, 36, -36, -83],
    [64, -64, -64, 64],
    [36, -83, 83, -36],
];

/// HEVC 8×8 DCT-II core matrix.
const DCT8_MATRIX: [[i32; 8]; 8] = [
    [64, 64, 64, 64, 64, 64, 64, 64],
    [89, 75, 50, 18, -18, -50, -75, -89],
    [83, 36, -36, -83, -83, -36, 36, 83],
    [75, -18, -89, -50, 50, 89, 18, -75],
    [64, -64, -64, 64, 64, -64, -64, 64],
    [50, -89, 18, 75, -75, -18, 89, -50],
    [36, -83, 83, -36, -36, 83, -83, 36],
    [18, -50, 75, -89, 89, -75, 50, -18],
];

/// Inverse 4×4 DST (HEVC, for intra 4×4 luma).
pub fn hevc_inverse_dst_4x4(coeffs: &[i32; 16], out: &mut [i32; 16]) {
    // 1-D inverse DST on rows
    let mut tmp = [0i32; 16];
    for i in 0..4 {
        for j in 0..4 {
            let mut sum = 0i32;
            for k in 0..4 {
                sum += DST4_MATRIX[k][j] * coeffs[i * 4 + k];
            }
            tmp[i * 4 + j] = (sum + 64) >> 7;
        }
    }
    // 1-D inverse DST on columns
    for j in 0..4 {
        for i in 0..4 {
            let mut sum = 0i32;
            for k in 0..4 {
                sum += DST4_MATRIX[k][i] * tmp[k * 4 + j];
            }
            out[i * 4 + j] = (sum + 2048) >> 12;
        }
    }
}

/// Inverse 4×4 DCT-II (HEVC).
pub fn hevc_inverse_dct_4x4(coeffs: &[i32; 16], out: &mut [i32; 16]) {
    let mut tmp = [0i32; 16];
    for i in 0..4 {
        for j in 0..4 {
            let mut sum = 0i32;
            for k in 0..4 {
                sum += DCT4_MATRIX[k][j] * coeffs[i * 4 + k];
            }
            tmp[i * 4 + j] = (sum + 64) >> 7;
        }
    }
    for j in 0..4 {
        for i in 0..4 {
            let mut sum = 0i32;
            for k in 0..4 {
                sum += DCT4_MATRIX[k][i] * tmp[k * 4 + j];
            }
            out[i * 4 + j] = (sum + 2048) >> 12;
        }
    }
}

/// Inverse 8×8 DCT-II (HEVC).
pub fn hevc_inverse_dct_8x8(coeffs: &[i32; 64], out: &mut [i32; 64]) {
    let mut tmp = [0i32; 64];
    for i in 0..8 {
        for j in 0..8 {
            let mut sum = 0i32;
            for k in 0..8 {
                sum += DCT8_MATRIX[k][j] * coeffs[i * 8 + k];
            }
            tmp[i * 8 + j] = (sum + 64) >> 7;
        }
    }
    for j in 0..8 {
        for i in 0..8 {
            let mut sum = 0i32;
            for k in 0..8 {
                sum += DCT8_MATRIX[k][i] * tmp[k * 8 + j];
            }
            out[i * 8 + j] = (sum + 2048) >> 12;
        }
    }
}

/// Generic inverse DCT for 16×16 blocks (partial butterfly, simplified).
pub fn hevc_inverse_dct_16x16(coeffs: &[i32; 256], out: &mut [i32; 256]) {
    // Direct matrix multiply using HEVC 16-point DCT-II core.
    static HEVC_DCT16: [[i32; 16]; 16] = [
        [
            64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        ],
        [
            90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90,
        ],
        [
            89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89,
        ],
        [
            87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87,
        ],
        [
            83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83,
        ],
        [
            80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80,
        ],
        [
            75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75,
        ],
        [
            70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70,
        ],
        [
            64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64,
        ],
        [
            57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57,
        ],
        [
            50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50,
        ],
        [
            43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43,
        ],
        [
            36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36,
        ],
        [
            25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25,
        ],
        [
            18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18,
        ],
        [
            9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9,
        ],
    ];
    let mut tmp = [0i32; 256];
    for i in 0..16 {
        for j in 0..16 {
            let mut sum = 0i32;
            for k in 0..16 {
                sum += HEVC_DCT16[k][j] * coeffs[i * 16 + k];
            }
            tmp[i * 16 + j] = (sum + 64) >> 7;
        }
    }
    for j in 0..16 {
        for i in 0..16 {
            let mut sum = 0i32;
            for k in 0..16 {
                sum += HEVC_DCT16[k][i] * tmp[k * 16 + j];
            }
            out[i * 16 + j] = (sum + 2048) >> 12;
        }
    }
}

/// Generic inverse DCT for 32×32 blocks (direct matrix multiply, simplified).
pub fn hevc_inverse_dct_32x32(coeffs: &[i32; 1024], out: &mut [i32; 1024]) {
    // HEVC 32-point DCT-II core matrix.
    static HEVC_DCT32: [[i32; 32]; 32] = hevc_dct32_matrix();
    let mut tmp = [0i32; 1024];
    for i in 0..32 {
        for j in 0..32 {
            let mut sum = 0i64;
            for k in 0..32 {
                sum += HEVC_DCT32[k][j] as i64 * coeffs[i * 32 + k] as i64;
            }
            tmp[i * 32 + j] = ((sum + 64) >> 7) as i32;
        }
    }
    for j in 0..32 {
        for i in 0..32 {
            let mut sum = 0i64;
            for k in 0..32 {
                sum += HEVC_DCT32[k][i] as i64 * tmp[k * 32 + j] as i64;
            }
            out[i * 32 + j] = ((sum + 2048) >> 12) as i32;
        }
    }
}

/// Build the HEVC 32-point DCT-II transform matrix at compile time.
const fn hevc_dct32_matrix() -> [[i32; 32]; 32] {
    // Even rows (0,2,4,...,30) come from 16-point matrix expanded to 32 columns.
    // Odd rows (1,3,5,...,31) are the 32-point odd basis from HEVC spec Table 8-7.
    let even16: [[i32; 16]; 16] = [
        [
            64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        ],
        [
            90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90,
        ],
        [
            89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89,
        ],
        [
            87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87,
        ],
        [
            83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83,
        ],
        [
            80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80,
        ],
        [
            75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75,
        ],
        [
            70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70,
        ],
        [
            64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64,
        ],
        [
            57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57,
        ],
        [
            50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50,
        ],
        [
            43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43,
        ],
        [
            36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36,
        ],
        [
            25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25,
        ],
        [
            18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18,
        ],
        [
            9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9,
        ],
    ];
    let odd_rows: [[i32; 32]; 16] = [
        [
            90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38,
            -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90,
        ],
        [
            90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61,
            78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90,
        ],
        [
            88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -22, -61, -85,
            -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88,
        ],
        [
            85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90,
            61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85,
        ],
        [
            82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -38, -88, -73, -4,
            67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82,
        ],
        [
            78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38,
            -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78,
        ],
        [
            73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85, 4,
            88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73,
        ],
        [
            67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46,
            -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67,
        ],
        [
            61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78,
            38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61,
        ],
        [
            54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90,
            22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54,
        ],
        [
            46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, -78, -4, 82, -73,
            -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46,
        ],
        [
            38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 82, -22, -54,
            90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38,
        ],
        [
            31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -85, 46, 13,
            -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31,
        ],
        [
            22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 88, -67, 31,
            13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22,
        ],
        [
            13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, -90, 82, -67, 46,
            -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13,
        ],
        [
            4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88,
            -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4,
        ],
    ];
    // Expand the 16-point even basis into 32-column rows
    let even_rows_full: [[i32; 32]; 16] = expand_even_rows(&even16);
    // Assemble: even rows at indices 0,2,4,...,30; odd rows at 1,3,5,...,31
    let mut m = [[0i32; 32]; 32];
    let mut row = 0;
    while row < 16 {
        m[row * 2] = even_rows_full[row];
        m[row * 2 + 1] = odd_rows[row];
        row += 1;
    }
    m
}

/// Expand 16-point even basis into 32 columns (DCT decomposition).
const fn expand_even_rows(even16: &[[i32; 16]; 16]) -> [[i32; 32]; 16] {
    let mut out = [[0i32; 32]; 16];
    let mut r = 0;
    while r < 16 {
        let mut c = 0;
        while c < 32 {
            // For even rows of the 32-pt DCT, the values at column n equal
            // the 16-pt DCT values at column n for column < 16 and mirrored for >= 16.
            // T_even[k][n] = T16[k][n] for n=0..15 and sign-symmetric for n=16..31
            // Actually: T_32_even[k][n] = T_16[k][n/2] when n is even;
            // But the correct relationship is: the even rows of a 2N-pt DCT
            // are the N-pt DCT applied to (x[n]+x[2N-1-n]).
            // For the transform matrix this means T_2N[2k][n] has symmetry.
            //
            // The correct values for even rows can be read from Table 8-7 directly.
            // For simplicity in this const fn, we compute them from the 16-pt matrix:
            // T_32[2k][n] = T_16[k][n] for n=0..15
            // T_32[2k][31-n] = T_16[k][n] * sign where sign alternates by k
            if c < 16 {
                out[r][c] = even16[r][c];
            } else {
                let mirror = 31 - c;
                // For even k: symmetric; for odd k: antisymmetric
                if r % 2 == 0 {
                    out[r][c] = even16[r][mirror];
                } else {
                    out[r][c] = -even16[r][mirror];
                }
            }
            c += 1;
        }
        r += 1;
    }
    out
}

/// HEVC dequantisation for a transform block.
/// Applies `level * scale >> shift` per coefficient.
pub fn hevc_dequant(coeffs: &mut [i32], qp: i32, bit_depth: u8, log2_transform_size: u8) {
    // HEVC dequant: coeff * (level_scale[qp%6] << (qp/6)) >> shift
    const LEVEL_SCALE: [i32; 6] = [40, 45, 51, 57, 64, 72];
    let qp = qp.max(0) as u32;
    let scale = LEVEL_SCALE[(qp % 6) as usize];
    let shift_base = qp / 6;
    let bd_offset = (bit_depth as u32).saturating_sub(8);
    // transform_shift = max_log2_dynamic_range - bit_depth - log2_transform_size
    // For 8-bit: max_log2_dynamic_range = 15
    let max_log2 = 15 + bd_offset;
    let transform_shift = max_log2 as i32 - bit_depth as i32 - log2_transform_size as i32;
    let total_shift = shift_base as i32 + transform_shift;
    if total_shift >= 0 {
        let offset = if total_shift > 0 {
            1 << (total_shift - 1)
        } else {
            0
        };
        for c in coeffs.iter_mut() {
            *c = (*c * scale + offset) >> total_shift;
        }
    } else {
        let left_shift = (-total_shift) as u32;
        for c in coeffs.iter_mut() {
            *c = *c * scale * (1 << left_shift);
        }
    }
}

// ---------------------------------------------------------------------------
// Coding Tree Unit decode framework
// ---------------------------------------------------------------------------

/// Prediction mode for a coding unit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HevcPredMode {
    Intra,
    Inter,
    Skip,
}

/// Result of decoding one coding unit leaf.
#[derive(Debug, Clone)]
pub struct DecodedCu {
    pub x: usize,
    pub y: usize,
    pub size: usize,
    pub pred_mode: HevcPredMode,
}

/// Recursively decode a coding tree (quad-tree split).
///
/// `depth` starts at 0 for the CTU root. `max_depth` is derived from SPS
/// (log2_diff_max_min_cb_size). When `depth == max_depth` or the split flag
/// is 0, the node is a leaf CU.
///
/// This is a framework function; actual CABAC parsing is not implemented.
/// Instead, it treats each leaf CU as intra-DC predicted with zero residual.
pub fn decode_coding_tree(
    x: usize,
    y: usize,
    log2_cu_size: u8,
    depth: u8,
    max_depth: u8,
    qp: i8,
    pic_width: usize,
    pic_height: usize,
    results: &mut Vec<DecodedCu>,
) {
    let cu_size = 1usize << log2_cu_size;
    let _ = qp; // will be used once CABAC residual decoding is added

    // If outside picture bounds, skip
    if x >= pic_width || y >= pic_height {
        return;
    }

    // Decide whether to split (framework: split until min CU size)
    let should_split = depth < max_depth && cu_size > 8;

    if should_split {
        let half = log2_cu_size - 1;
        let half_size = 1usize << half;
        let next_depth = depth + 1;
        decode_coding_tree(
            x, y, half, next_depth, max_depth, qp, pic_width, pic_height, results,
        );
        decode_coding_tree(
            x + half_size,
            y,
            half,
            next_depth,
            max_depth,
            qp,
            pic_width,
            pic_height,
            results,
        );
        decode_coding_tree(
            x,
            y + half_size,
            half,
            next_depth,
            max_depth,
            qp,
            pic_width,
            pic_height,
            results,
        );
        decode_coding_tree(
            x + half_size,
            y + half_size,
            half,
            next_depth,
            max_depth,
            qp,
            pic_width,
            pic_height,
            results,
        );
    } else {
        // Leaf CU — apply intra DC prediction with zero residual
        let actual_w = cu_size.min(pic_width.saturating_sub(x));
        let actual_h = cu_size.min(pic_height.saturating_sub(y));
        let dc_val = 1i16 << 7; // 128 for 8-bit
        let _recon_val = dc_val;
        let _ = (actual_w, actual_h); // stub doesn't fill recon_luma

        results.push(DecodedCu {
            x,
            y,
            size: cu_size,
            pred_mode: HevcPredMode::Intra,
        });
    }
}

// ---------------------------------------------------------------------------
// HEVC Decoder
// ---------------------------------------------------------------------------

/// Top-level HEVC decoder state.
pub struct HevcDecoder {
    vps: Option<HevcVps>,
    sps: Option<HevcSps>,
    pps: Option<HevcPps>,
    /// Decoded Picture Buffer for inter prediction reference frames.
    dpb: super::hevc_inter::HevcDpb,
    /// Picture Order Count counter.
    poc: i32,
    /// Reusable reconstruction buffer (avoids per-frame allocation).
    recon_buf: Vec<i16>,
    /// Reusable MV field buffer (avoids 1.6MB alloc per frame).
    mv_field_buf: Vec<super::hevc_inter::HevcMvField>,
    /// Reusable CU list buffer.
    cu_buf: Vec<DecodedCu>,
    /// Reusable Y plane output buffer.
    y_plane_buf: Vec<u8>,
    /// Reusable Cb chroma reconstruction buffer (avoids per-frame allocation).
    recon_cb: Vec<i16>,
    /// Reusable Cr chroma reconstruction buffer (avoids per-frame allocation).
    recon_cr: Vec<i16>,
    /// Skip RGB conversion for benchmark mode.
    pub skip_rgb: bool,
}

impl HevcDecoder {
    /// Create a new decoder with no parameter sets.
    pub fn new() -> Self {
        Self {
            vps: None,
            sps: None,
            pps: None,
            dpb: super::hevc_inter::HevcDpb::new(16),
            mv_field_buf: Vec::new(),
            cu_buf: Vec::new(),
            y_plane_buf: Vec::new(),
            recon_cb: Vec::new(),
            recon_cr: Vec::new(),
            skip_rgb: false,
            poc: 0,
            recon_buf: Vec::new(),
        }
    }

    /// Convert i16 buffer to u8 with saturation (NEON-accelerated on aarch64).
    #[allow(unsafe_code)]
    fn i16_to_u8_clamp(src: &[i16], dst: &mut [u8]) {
        debug_assert!(dst.len() >= src.len());
        let len = src.len();

        #[cfg(target_arch = "aarch64")]
        {
            let mut i = 0;
            while i + 8 <= len {
                unsafe {
                    use std::arch::aarch64::*;
                    let v = vld1q_s16(src.as_ptr().add(i));
                    let clamped = vqmovun_s16(v);
                    std::ptr::copy_nonoverlapping(
                        &clamped as *const uint8x8_t as *const u8,
                        dst.as_mut_ptr().add(i),
                        8,
                    );
                }
                i += 8;
            }
            while i < len {
                dst[i] = src[i].clamp(0, 255) as u8;
                i += 1;
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            let mut i = 0;
            while i + 8 <= len {
                unsafe {
                    use std::arch::x86_64::*;
                    let v = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
                    let clamped = _mm_packus_epi16(v, v);
                    std::ptr::copy_nonoverlapping(
                        &clamped as *const __m128i as *const u8,
                        dst.as_mut_ptr().add(i),
                        8,
                    );
                }
                i += 8;
            }
            while i < len {
                dst[i] = src[i].clamp(0, 255) as u8;
                i += 1;
            }
        }

        #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
        {
            for i in 0..len {
                dst[i] = src[i].clamp(0, 255) as u8;
            }
        }
    }

    /// Current SPS, if any.
    pub fn sps(&self) -> Option<&HevcSps> {
        self.sps.as_ref()
    }

    /// Current PPS, if any.
    pub fn pps(&self) -> Option<&HevcPps> {
        self.pps.as_ref()
    }

    /// Decode a single NAL unit (payload after start code, including the 2-byte header).
    ///
    /// Returns `Some(DecodedFrame)` when a complete picture is produced (IDR / CRA),
    /// or `None` for parameter-set and other non-VCL NALs.
    pub fn decode_nal(
        &mut self,
        nal_data: &[u8],
    ) -> Result<Option<crate::DecodedFrame>, VideoError> {
        use crate::HevcNalUnitType;

        if nal_data.len() < 2 {
            return Err(VideoError::Codec("NAL unit too short".into()));
        }
        let nal_type = HevcNalUnitType::from_header(nal_data);
        let payload = &nal_data[2..]; // skip 2-byte NAL header

        match nal_type {
            HevcNalUnitType::VpsNut => {
                self.vps = Some(parse_hevc_vps(payload)?);
                Ok(None)
            }
            HevcNalUnitType::SpsNut => {
                self.sps = Some(parse_hevc_sps(payload)?);
                Ok(None)
            }
            HevcNalUnitType::PpsNut => {
                self.pps = Some(parse_hevc_pps(payload)?);
                Ok(None)
            }
            HevcNalUnitType::IdrWRadl | HevcNalUnitType::IdrNLp => {
                self.dpb.clear();
                self.poc = 0;
                match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
                    self.decode_picture(payload, true)
                })) {
                    Ok(r) => r,
                    Err(_) => Err(VideoError::Codec("HEVC decode panicked".into())),
                }
            }
            HevcNalUnitType::CraNut => {
                match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
                    self.decode_picture(payload, true)
                })) {
                    Ok(r) => r,
                    Err(_) => Err(VideoError::Codec("HEVC decode panicked".into())),
                }
            }
            // P/B slice NAL types (trailing, TSA, STSA, RADL, RASL)
            HevcNalUnitType::TrailN
            | HevcNalUnitType::TrailR
            | HevcNalUnitType::TsaN
            | HevcNalUnitType::TsaR
            | HevcNalUnitType::StsaN
            | HevcNalUnitType::StsaR
            | HevcNalUnitType::RadlN
            | HevcNalUnitType::RadlR
            | HevcNalUnitType::RaslN
            | HevcNalUnitType::RaslR => {
                match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
                    self.decode_picture(payload, false)
                })) {
                    Ok(r) => r,
                    Err(_) => Err(VideoError::Codec("HEVC decode panicked".into())),
                }
            }
            _ => Ok(None), // non-VCL or unsupported VCL
        }
    }

    /// Decode a picture from a slice NAL payload.
    ///
    /// When the payload has enough data, uses CABAC-driven coding tree
    /// decoding via [`super::hevc_syntax::decode_coding_tree_cabac`].
    /// Falls back to the stub (DC fill) path when the payload is too short
    /// to bootstrap the CABAC decoder or when no SPS/PPS is available.
    fn decode_picture(
        &mut self,
        payload: &[u8],
        is_keyframe: bool,
    ) -> Result<Option<crate::DecodedFrame>, VideoError> {
        let sps = self
            .sps
            .as_ref()
            .ok_or_else(|| VideoError::Codec("Slice received before SPS".into()))?;
        let pps = self
            .pps
            .as_ref()
            .ok_or_else(|| VideoError::Codec("Slice received before PPS".into()))?;

        let w = sps.pic_width as usize;
        let h = sps.pic_height as usize;
        if w == 0 || h == 0 || w > 8192 || h > 8192 {
            return Err(VideoError::Codec(format!(
                "HEVC frame dimensions out of range: {w}x{h}"
            )));
        }
        let ctu_size_log2 = sps.log2_min_cb_size + sps.log2_diff_max_min_cb_size;
        if !(3..=7).contains(&ctu_size_log2) {
            return Err(VideoError::Codec(format!(
                "HEVC CTU size log2 out of range: {ctu_size_log2}"
            )));
        }
        let ctu_size = 1usize << ctu_size_log2;
        let max_depth = sps.log2_diff_max_min_cb_size.min(5);

        // Parse minimal slice header to determine slice type.
        // HEVC slice_header(): first_slice_segment_in_pic_flag(1), then if
        // IDR/BLA/CRA the no_output_of_prior_pics_flag(1), then pps_id (ue),
        // then if !first_slice: slice_segment_address, then
        // num_extra_slice_header_bits skip bits, then slice_type (ue).
        let slice_type = if payload.len() >= 3 {
            let rbsp = super::h264_params::remove_emulation_prevention(payload);
            let mut sh_reader = crate::BitstreamReader::new(&rbsp);
            let first_slice = sh_reader.read_bit().unwrap_or(1) == 1;
            if is_keyframe {
                let _ = sh_reader.read_bit(); // no_output_of_prior_pics_flag
            }
            let _pps_id = sh_reader.read_ue().unwrap_or(0);
            if !first_slice {
                // slice_segment_address — need to know how many bits, skip approximation
                let bits_needed = if w > 0 && h > 0 {
                    let ctb_count = w.div_ceil(ctu_size) * h.div_ceil(ctu_size);
                    (32 - (ctb_count as u32).leading_zeros()).max(1) as u8
                } else {
                    1
                };
                let _ = sh_reader.read_bits(bits_needed);
            }
            // Skip num_extra_slice_header_bits
            for _ in 0..pps.num_extra_slice_header_bits {
                let _ = sh_reader.read_bit();
            }
            let st_val = sh_reader.read_ue().unwrap_or(2);
            match st_val {
                0 => HevcSliceType::B,
                1 => HevcSliceType::P,
                _ => HevcSliceType::I,
            }
        } else {
            HevcSliceType::I
        };

        // Use CABAC path when we have a meaningful payload, otherwise fall
        // back to the deterministic stub (useful for unit tests that build
        // an HevcDecoder with synthetic parameter sets but no real slice
        // data).
        let use_cabac = payload.len() >= 2;

        // Reuse CU list buffer across frames
        let est_cus = (w * h) / (8 * 8);
        self.cu_buf.clear();
        self.cu_buf
            .reserve(est_cus.saturating_sub(self.cu_buf.capacity()));
        let mut cus = std::mem::take(&mut self.cu_buf);
        let est_ctus = w.div_ceil(64) * h.div_ceil(64);
        let mut sao_list: Vec<super::hevc_filter::SaoParams> = Vec::with_capacity(est_ctus);

        // Chroma recon buffers — reuse across frames to avoid per-frame allocation
        let chroma_needed = (w / 2) * (h / 2);
        self.recon_cb.resize(chroma_needed, 128);
        self.recon_cb.fill(128);
        let mut recon_cb = std::mem::take(&mut self.recon_cb);
        self.recon_cr.resize(chroma_needed, 128);
        self.recon_cr.fill(128);
        let mut recon_cr = std::mem::take(&mut self.recon_cr);

        if use_cabac {
            let slice_qp = pps.init_qp as i32;
            let mut cabac_state = super::hevc_syntax::HevcSliceCabacState::new(payload, slice_qp);
            let mid_val = 1i16 << (sps.bit_depth_luma - 1);
            // Reuse reconstruction buffer across frames (avoid 1.8MB alloc per frame)
            let needed = w * h;
            if self.recon_buf.len() < needed {
                self.recon_buf.resize(needed, mid_val);
            } else {
                self.recon_buf[..needed].fill(mid_val);
            }
            let mut recon_luma = std::mem::take(&mut self.recon_buf);
            recon_luma.truncate(needed);
            let min_pu = 4usize;
            let pic_w_pu = w.div_ceil(min_pu);
            let pic_h_pu = h.div_ceil(min_pu);
            // Reuse MV field buffer across frames
            let mv_needed = pic_w_pu * pic_h_pu;
            if self.mv_field_buf.len() < mv_needed {
                self.mv_field_buf
                    .resize(mv_needed, super::hevc_inter::HevcMvField::unavailable());
            } else {
                for v in self.mv_field_buf[..mv_needed].iter_mut() {
                    *v = super::hevc_inter::HevcMvField::unavailable();
                }
            }
            let mut mv_field = std::mem::take(&mut self.mv_field_buf);

            let mut ctu_y = 0;
            while ctu_y < h {
                let mut ctu_x = 0;
                while ctu_x < w {
                    if cabac_state.cabac.bytes_remaining() < 1 {
                        break;
                    }
                    // Parse SAO parameters per CTU when enabled in SPS
                    if sps.sample_adaptive_offset_enabled {
                        let left_avail = ctu_x > 0;
                        let above_avail = ctu_y > 0;
                        let sao = super::hevc_filter::parse_sao_params(
                            &mut cabac_state.cabac,
                            left_avail,
                            above_avail,
                        );
                        sao_list.push(sao);
                    }

                    super::hevc_syntax::decode_coding_tree_cabac(
                        &mut cabac_state,
                        ctu_x,
                        ctu_y,
                        ctu_size_log2,
                        0,
                        max_depth,
                        sps,
                        pps,
                        slice_type,
                        w,
                        h,
                        &mut recon_luma,
                        &mut recon_cb,
                        &mut recon_cr,
                        &mut cus,
                        &self.dpb,
                        &mut mv_field,
                    );
                    ctu_x += ctu_size;
                }
                ctu_y += ctu_size;
            }
            // Restore reusable buffers
            self.recon_buf = recon_luma;
            self.mv_field_buf = mv_field;
        } else {
            // Stub path: split down to leaves and fill with DC 128
            let mut ctu_y = 0;
            while ctu_y < h {
                let mut ctu_x = 0;
                while ctu_x < w {
                    decode_coding_tree(
                        ctu_x,
                        ctu_y,
                        ctu_size_log2,
                        0,
                        max_depth,
                        pps.init_qp,
                        w,
                        h,
                        &mut cus,
                    );
                    ctu_x += ctu_size;
                }
                ctu_y += ctu_size;
            }
        }

        // Convert recon_luma (i16) → y_plane (u8) in one pass
        let bit_shift = sps.bit_depth_luma.saturating_sub(8);
        let pixel_count = w * h;
        // Reuse y_plane buffer across frames
        if self.y_plane_buf.len() < pixel_count {
            self.y_plane_buf.resize(pixel_count, 0);
        }
        let mut y_plane = std::mem::take(&mut self.y_plane_buf);
        if self.recon_buf.len() >= pixel_count {
            if bit_shift == 0 {
                Self::i16_to_u8_clamp(&self.recon_buf[..pixel_count], &mut y_plane);
            } else {
                let max_val = (1i16 << sps.bit_depth_luma) - 1;
                for (dst, &src) in y_plane.iter_mut().zip(self.recon_buf.iter()) {
                    *dst = (src.clamp(0, max_val) >> bit_shift) as u8;
                }
            }
        } else {
            y_plane.fill(128);
        }

        // Build CU info for deblocking (just metadata, no pixel data)
        let cu_info: Vec<(usize, usize, usize, HevcPredMode)> = cus
            .iter()
            .map(|cu| (cu.x, cu.y, cu.size, cu.pred_mode))
            .collect();

        // Finalize: deblocking, SAO, optional RGB conversion
        let slice_qp = pps.init_qp.unsigned_abs();
        let sao_ref = if sao_list.is_empty() {
            None
        } else {
            Some(sao_list.as_slice())
        };

        let rgb = if self.skip_rgb {
            // Luma-only mode: skip deblock+SAO+RGB entirely (pure decode benchmark)
            // Still produce minimal output for frame counting
            vec![128u8; 3] // 1-pixel placeholder
        } else {
            // Convert chroma recon i16 → u8
            let chroma_n = (w / 2) * (h / 2);
            let cb_plane: Vec<u8> = recon_cb[..chroma_n]
                .iter()
                .map(|&v| v.clamp(0, 255) as u8)
                .collect();
            let cr_plane: Vec<u8> = recon_cr[..chroma_n]
                .iter()
                .map(|&v| v.clamp(0, 255) as u8)
                .collect();
            super::hevc_filter::finalize_hevc_frame_with_chroma(
                &mut y_plane,
                &cb_plane,
                &cr_plane,
                w,
                h,
                &cu_info,
                slice_qp,
                sao_ref,
            )
        };

        // Store reconstructed luma in DPB as u16
        let pixel_count = w * h;
        let max_val = (1i16 << sps.bit_depth_luma) - 1;
        let dpb_luma: Vec<u16> = self.recon_buf[..pixel_count]
            .iter()
            .map(|&s| s.clamp(0, max_val) as u16)
            .collect();
        // Store chroma in DPB for inter prediction
        let chroma_n = (w / 2) * (h / 2);
        let dpb_cb: Vec<u16> = recon_cb[..chroma_n]
            .iter()
            .map(|&v| v.clamp(0, 255) as u16)
            .collect();
        let dpb_cr: Vec<u16> = recon_cr[..chroma_n]
            .iter()
            .map(|&v| v.clamp(0, 255) as u16)
            .collect();
        self.dpb.add(super::hevc_inter::HevcReferencePicture {
            poc: self.poc,
            luma: dpb_luma,
            cb: dpb_cb,
            cr: dpb_cr,
            width: w,
            height: h,
            is_long_term: false,
        });
        self.poc += 1;
        self.y_plane_buf = y_plane;
        self.cu_buf = cus;
        self.recon_cb = recon_cb;
        self.recon_cr = recon_cr;

        Ok(Some(crate::DecodedFrame {
            width: w,
            height: h,
            rgb8_data: rgb,
            timestamp_us: 0,
            keyframe: is_keyframe,
        }))
    }
}

impl Default for HevcDecoder {
    fn default() -> Self {
        Self::new()
    }
}

impl crate::VideoDecoder for HevcDecoder {
    fn codec(&self) -> crate::VideoCodec {
        crate::VideoCodec::H265
    }

    fn decode(
        &mut self,
        data: &[u8],
        timestamp_us: u64,
    ) -> Result<Option<crate::DecodedFrame>, crate::VideoError> {
        // HEVC NAL units from Annex B stream
        let nals = parse_hevc_annex_b(data);
        let mut last_frame = None;
        for nal_data in &nals {
            if let Some(mut frame) = self.decode_nal(nal_data)? {
                frame.timestamp_us = timestamp_us;
                last_frame = Some(frame);
            }
        }
        Ok(last_frame)
    }

    fn flush(&mut self) -> Result<Vec<crate::DecodedFrame>, crate::VideoError> {
        Ok(Vec::new())
    }
}

/// Parse HEVC Annex B NAL units (2-byte NAL headers).
/// Splits on 0x000001 / 0x00000001 start codes, returns raw NAL data including header.
pub fn parse_hevc_annex_b(data: &[u8]) -> Vec<Vec<u8>> {
    let mut units = Vec::new();
    let mut i = 0;
    let len = data.len();

    while i < len {
        if i + 3 <= len && data[i] == 0 && data[i + 1] == 0 {
            let (sc_len, found) = if i + 4 <= len && data[i + 2] == 0 && data[i + 3] == 1 {
                (4, true)
            } else if data[i + 2] == 1 {
                (3, true)
            } else {
                (0, false)
            };

            if found {
                let nal_start = i + sc_len;
                let mut nal_end = len;
                let mut j = nal_start;
                while j < len {
                    if j + 3 <= len
                        && data[j] == 0
                        && data[j + 1] == 0
                        && ((j + 4 <= len && data[j + 2] == 0 && data[j + 3] == 1)
                            || data[j + 2] == 1)
                    {
                        nal_end = j;
                        break;
                    }
                    j += 1;
                }
                if nal_start < nal_end {
                    units.push(data[nal_start..nal_end].to_vec());
                }
                i = nal_end;
                continue;
            }
        }
        i += 1;
    }
    units
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::BitstreamReader;

    // -- Test helpers (same convention as h264_decoder.rs) -------------------

    fn push_bits(bits: &mut Vec<u8>, value: u32, count: u8) {
        for i in (0..count).rev() {
            bits.push(((value >> i) & 1) as u8);
        }
    }

    fn push_exp_golomb(bits: &mut Vec<u8>, value: u32) {
        if value == 0 {
            bits.push(1);
            return;
        }
        let code = value + 1;
        let bit_len = 32 - code.leading_zeros();
        let leading_zeros = bit_len - 1;
        for _ in 0..leading_zeros {
            bits.push(0);
        }
        for i in (0..bit_len).rev() {
            bits.push(((code >> i) & 1) as u8);
        }
    }

    fn push_signed_exp_golomb(bits: &mut Vec<u8>, value: i32) {
        let code = if value <= 0 {
            (-value * 2) as u32
        } else {
            (value * 2 - 1) as u32
        };
        push_exp_golomb(bits, code);
    }

    fn bits_to_bytes(bits: &[u8]) -> Vec<u8> {
        let mut bytes = Vec::new();
        for chunk in bits.chunks(8) {
            let mut byte = 0u8;
            for (i, &bit) in chunk.iter().enumerate() {
                byte |= bit << (7 - i);
            }
            bytes.push(byte);
        }
        bytes
    }

    #[test]
    fn hevc_vps_parse() {
        let mut bits = Vec::new();
        // vps_id = 2 (4 bits)
        push_bits(&mut bits, 2, 4);
        // reserved 2 bits (vps_base_layer flags)
        push_bits(&mut bits, 0, 2);
        // max_layers_minus1 = 0 (6 bits) => max_layers = 1
        push_bits(&mut bits, 0, 6);
        // max_sub_layers_minus1 = 2 (3 bits) => max_sub_layers = 3
        push_bits(&mut bits, 2, 3);
        // temporal_id_nesting = 1 (1 bit)
        push_bits(&mut bits, 1, 1);

        let bytes = bits_to_bytes(&bits);
        let vps = parse_hevc_vps(&bytes).unwrap();
        assert_eq!(vps.vps_id, 2);
        assert_eq!(vps.max_layers, 1);
        assert_eq!(vps.max_sub_layers, 3);
        assert!(vps.temporal_id_nesting);
    }

    #[test]
    fn hevc_sps_dimensions() {
        // Build a minimal SPS bitstream for 1920x1080, 1 sub-layer, chroma 4:2:0.
        let mut bits = Vec::new();

        // vps_id = 0 (4 bits)
        push_bits(&mut bits, 0, 4);
        // max_sub_layers_minus1 = 0 (3 bits) => max_sub_layers = 1
        push_bits(&mut bits, 0, 3);
        // temporal_id_nesting_flag = 1
        push_bits(&mut bits, 1, 1);

        // profile_tier_level for max_sub_layers=1:
        // general_profile_space(2) + general_tier_flag(1) + general_profile_idc(5) = 8 bits
        push_bits(&mut bits, 0, 8);
        // general_profile_compatibility_flags (32 bits)
        push_bits(&mut bits, 0, 16);
        push_bits(&mut bits, 0, 16);
        // general_constraint_indicator_flags (48 bits)
        push_bits(&mut bits, 0, 16);
        push_bits(&mut bits, 0, 16);
        push_bits(&mut bits, 0, 16);
        // general_level_idc (8 bits)
        push_bits(&mut bits, 0, 8);
        // no sub_layer flags when max_sub_layers == 1

        // sps_id = ue(0)
        push_exp_golomb(&mut bits, 0);
        // chroma_format_idc = ue(1) => 4:2:0
        push_exp_golomb(&mut bits, 1);
        // pic_width = ue(1920)
        push_exp_golomb(&mut bits, 1920);
        // pic_height = ue(1080)
        push_exp_golomb(&mut bits, 1080);
        // conformance_window_flag = 0
        push_bits(&mut bits, 0, 1);
        // bit_depth_luma_minus8 = ue(0) => 8
        push_exp_golomb(&mut bits, 0);
        // bit_depth_chroma_minus8 = ue(0) => 8
        push_exp_golomb(&mut bits, 0);
        // log2_max_pic_order_cnt_lsb_minus4 = ue(0) => 4
        push_exp_golomb(&mut bits, 0);
        // sub_layer_ordering_info_present_flag = 1
        push_bits(&mut bits, 1, 1);
        // For 1 sub-layer: max_dec_pic_buffering_minus1, max_num_reorder_pics, max_latency_increase_plus1
        push_exp_golomb(&mut bits, 0);
        push_exp_golomb(&mut bits, 0);
        push_exp_golomb(&mut bits, 0);
        // log2_min_luma_coding_block_size_minus3 = ue(0) => 3
        push_exp_golomb(&mut bits, 0);
        // log2_diff_max_min_luma_coding_block_size = ue(3)
        push_exp_golomb(&mut bits, 3);
        // log2_min_luma_transform_block_size_minus2 = ue(0) => 2
        push_exp_golomb(&mut bits, 0);
        // log2_diff_max_min_luma_transform_block_size = ue(3)
        push_exp_golomb(&mut bits, 3);
        // max_transform_hierarchy_depth_inter = ue(0)
        push_exp_golomb(&mut bits, 0);
        // max_transform_hierarchy_depth_intra = ue(0)
        push_exp_golomb(&mut bits, 0);
        // scaling_list_enabled_flag = 0
        push_bits(&mut bits, 0, 1);
        // amp_enabled_flag = 0
        push_bits(&mut bits, 0, 1);
        // sample_adaptive_offset_enabled_flag = 0
        push_bits(&mut bits, 0, 1);
        // pcm_enabled_flag = 0
        push_bits(&mut bits, 0, 1);
        // num_short_term_ref_pic_sets = ue(0)
        push_exp_golomb(&mut bits, 0);

        // Pad to byte boundary
        while bits.len() % 8 != 0 {
            bits.push(0);
        }

        let bytes = bits_to_bytes(&bits);
        let sps = parse_hevc_sps(&bytes).unwrap();
        assert_eq!(sps.pic_width, 1920);
        assert_eq!(sps.pic_height, 1080);
        assert_eq!(sps.chroma_format_idc, 1);
        assert_eq!(sps.bit_depth_luma, 8);
        assert_eq!(sps.bit_depth_chroma, 8);
        assert_eq!(sps.vps_id, 0);
        assert_eq!(sps.sps_id, 0);
    }

    #[test]
    fn hevc_slice_type_enum() {
        assert_eq!(HevcSliceType::B as u8, 0);
        assert_eq!(HevcSliceType::P as u8, 1);
        assert_eq!(HevcSliceType::I as u8, 2);
    }

    #[test]
    fn hevc_frame_dimensions_from_sps() {
        let sps = HevcSps {
            sps_id: 0,
            vps_id: 0,
            max_sub_layers: 1,
            chroma_format_idc: 1,
            pic_width: 3840,
            pic_height: 2160,
            bit_depth_luma: 10,
            bit_depth_chroma: 10,
            log2_max_pic_order_cnt: 8,
            log2_min_cb_size: 3,
            log2_diff_max_min_cb_size: 3,
            log2_min_transform_size: 2,
            log2_diff_max_min_transform_size: 3,
            max_transform_hierarchy_depth_inter: 1,
            max_transform_hierarchy_depth_intra: 1,
            sample_adaptive_offset_enabled: true,
            pcm_enabled: false,
            num_short_term_ref_pic_sets: 0,
            long_term_ref_pics_present: false,
            sps_temporal_mvp_enabled: true,
            strong_intra_smoothing_enabled: true,
        };
        let (w, h) = hevc_frame_dimensions(&sps);
        assert_eq!(w, 3840);
        assert_eq!(h, 2160);
    }

    // -- Scaling list parsing tests -----------------------------------------

    #[test]
    fn hevc_sps_with_scaling_list_data() {
        // Build SPS with scaling_list_enabled=1 and scaling_list_data_present=1,
        // then provide scaling list data where every matrix uses pred_mode_flag=0
        // with delta=0 (copy from default).
        let mut bits = Vec::new();

        // VPS/sub-layer/profile-tier-level header (same as hevc_sps_dimensions)
        push_bits(&mut bits, 0, 4); // vps_id
        push_bits(&mut bits, 0, 3); // max_sub_layers_minus1
        push_bits(&mut bits, 1, 1); // temporal_id_nesting
        // profile_tier_level
        push_bits(&mut bits, 0, 8);
        push_bits(&mut bits, 0, 16);
        push_bits(&mut bits, 0, 16);
        push_bits(&mut bits, 0, 16);
        push_bits(&mut bits, 0, 16);
        push_bits(&mut bits, 0, 16);
        push_bits(&mut bits, 0, 8);

        push_exp_golomb(&mut bits, 0); // sps_id
        push_exp_golomb(&mut bits, 1); // chroma_format_idc
        push_exp_golomb(&mut bits, 64); // pic_width
        push_exp_golomb(&mut bits, 64); // pic_height
        push_bits(&mut bits, 0, 1); // no conformance window
        push_exp_golomb(&mut bits, 0); // bit_depth_luma_minus8
        push_exp_golomb(&mut bits, 0); // bit_depth_chroma_minus8
        push_exp_golomb(&mut bits, 0); // log2_max_poc_minus4
        push_bits(&mut bits, 1, 1); // sub_layer_ordering_info_present
        push_exp_golomb(&mut bits, 0);
        push_exp_golomb(&mut bits, 0);
        push_exp_golomb(&mut bits, 0);
        push_exp_golomb(&mut bits, 0); // log2_min_cb_size_minus3
        push_exp_golomb(&mut bits, 3); // log2_diff_max_min_cb_size
        push_exp_golomb(&mut bits, 0); // log2_min_transform_size_minus2
        push_exp_golomb(&mut bits, 3); // log2_diff_max_min_transform_size
        push_exp_golomb(&mut bits, 0); // max_transform_hierarchy_depth_inter
        push_exp_golomb(&mut bits, 0); // max_transform_hierarchy_depth_intra

        // scaling_list_enabled_flag = 1
        push_bits(&mut bits, 1, 1);
        // scaling_list_data_present_flag = 1
        push_bits(&mut bits, 1, 1);

        // scaling_list_data(): for each sizeId 0..3, each matrixId:
        // pred_mode_flag = 0, pred_matrix_id_delta = ue(0)
        // sizeId 0: 6 matrices
        for _ in 0..6 {
            push_bits(&mut bits, 0, 1); // pred_mode_flag = 0
            push_exp_golomb(&mut bits, 0); // delta = 0
        }
        // sizeId 1: 6 matrices
        for _ in 0..6 {
            push_bits(&mut bits, 0, 1);
            push_exp_golomb(&mut bits, 0);
        }
        // sizeId 2: 6 matrices
        for _ in 0..6 {
            push_bits(&mut bits, 0, 1);
            push_exp_golomb(&mut bits, 0);
        }
        // sizeId 3: 2 matrices
        for _ in 0..2 {
            push_bits(&mut bits, 0, 1);
            push_exp_golomb(&mut bits, 0);
        }

        // Continue with remaining SPS fields
        push_bits(&mut bits, 0, 1); // amp_enabled
        push_bits(&mut bits, 0, 1); // sample_adaptive_offset_enabled
        push_bits(&mut bits, 0, 1); // pcm_enabled
        push_exp_golomb(&mut bits, 0); // num_short_term_ref_pic_sets

        while bits.len() % 8 != 0 {
            bits.push(0);
        }

        let bytes = bits_to_bytes(&bits);
        let sps = parse_hevc_sps(&bytes).unwrap();
        assert_eq!(sps.pic_width, 64);
        assert_eq!(sps.pic_height, 64);
    }

    #[test]
    fn hevc_scaling_list_with_explicit_coeffs() {
        // Test scaling list parsing where pred_mode_flag=1 (explicit coefficients).
        // Build a standalone scaling_list_data bitstream and parse it.
        let mut bits = Vec::new();

        // sizeId 0: 6 matrices, coefNum = min(64, 1<<(4+0)) = 16
        for _ in 0..6 {
            push_bits(&mut bits, 1, 1); // pred_mode_flag = 1
            // sizeId 0 <= 1, so no dc_coef
            for _ in 0..16 {
                push_signed_exp_golomb(&mut bits, 0); // delta_coef = 0
            }
        }
        // sizeId 1: 6 matrices, coefNum = min(64, 1<<(4+2)) = 64
        for _ in 0..6 {
            push_bits(&mut bits, 1, 1);
            // sizeId 1 <= 1, so no dc_coef
            for _ in 0..64 {
                push_signed_exp_golomb(&mut bits, 0);
            }
        }
        // sizeId 2: 6 matrices, coefNum = min(64, 1<<(4+4)) = 64
        for _ in 0..6 {
            push_bits(&mut bits, 1, 1);
            push_signed_exp_golomb(&mut bits, 0); // dc_coef_minus8 (sizeId > 1)
            for _ in 0..64 {
                push_signed_exp_golomb(&mut bits, 0);
            }
        }
        // sizeId 3: 2 matrices, coefNum = min(64, 1<<(4+6)) = 64
        for _ in 0..2 {
            push_bits(&mut bits, 1, 1);
            push_signed_exp_golomb(&mut bits, 0); // dc_coef_minus8
            for _ in 0..64 {
                push_signed_exp_golomb(&mut bits, 0);
            }
        }

        while bits.len() % 8 != 0 {
            bits.push(0);
        }

        let bytes = bits_to_bytes(&bits);
        let mut reader = BitstreamReader::new(&bytes);
        let result = skip_scaling_list_data(&mut reader);
        assert!(result.is_ok());
    }

    // -- Intra prediction tests ---------------------------------------------

    #[test]
    fn intra_dc_prediction_4x4() {
        let top = [100i16, 100, 100, 100];
        let left = [200i16, 200, 200, 200];
        let mut out = [0i16; 16];
        intra_predict_dc(&top, &left, 4, &mut out);
        // DC = (4*100 + 4*200 + 4) / 8 = 1204/8 = 150
        for v in &out {
            assert_eq!(*v, 150);
        }
    }

    #[test]
    fn intra_dc_prediction_8x8() {
        let top = [128i16; 8];
        let left = [128i16; 8];
        let mut out = [0i16; 64];
        intra_predict_dc(&top, &left, 8, &mut out);
        for v in &out {
            assert_eq!(*v, 128);
        }
    }

    #[test]
    fn intra_planar_prediction_4x4() {
        let top = [100i16, 100, 100, 100];
        let left = [100i16, 100, 100, 100];
        let mut out = [0i16; 16];
        intra_predict_planar(&top, &left, 100, 100, 4, &mut out);
        // With uniform neighbours, all outputs should be 100
        for v in &out {
            assert_eq!(*v, 100);
        }
    }

    #[test]
    fn intra_angular_horizontal() {
        let top = [50i16; 4];
        let left = [200i16, 201, 202, 203];
        let mut out = [0i16; 16];
        intra_predict_angular(&top, &left, 10, 4, &mut out);
        // Mode 10 is horizontal-like, each row should equal left[y]
        for y in 0..4 {
            for x in 0..4 {
                assert_eq!(out[y * 4 + x], left[y]);
            }
        }
    }

    #[test]
    fn intra_angular_vertical() {
        let top = [200i16, 201, 202, 203];
        let left = [50i16; 4];
        let mut out = [0i16; 16];
        intra_predict_angular(&top, &left, 26, 4, &mut out);
        // Mode 26 is vertical-like, each column should equal top[x]
        for y in 0..4 {
            for x in 0..4 {
                assert_eq!(out[y * 4 + x], top[x]);
            }
        }
    }

    // -- Transform roundtrip tests ------------------------------------------

    #[test]
    fn hevc_dst_4x4_dc_roundtrip() {
        // DC coefficient — DST is asymmetric, so outputs are not uniform.
        // Use a large enough value to survive the >>12 shift in the column pass.
        let mut coeffs = [0i32; 16];
        coeffs[0] = 16384;
        let mut out = [0i32; 16];
        hevc_inverse_dst_4x4(&coeffs, &mut out);
        // At least some outputs should be non-zero with a large DC input.
        let any_nonzero = out.iter().any(|&v| v != 0);
        assert!(any_nonzero, "DST output should have non-zero values");
    }

    #[test]
    fn hevc_dct_4x4_dc_roundtrip() {
        let mut coeffs = [0i32; 16];
        coeffs[0] = 256;
        let mut out = [0i32; 16];
        hevc_inverse_dct_4x4(&coeffs, &mut out);
        // DC coefficient should produce uniform output
        let first = out[0];
        assert!(first != 0);
        // All values should be equal for DC-only input
        for v in &out {
            assert_eq!(*v, first);
        }
    }

    #[test]
    fn hevc_dct_4x4_zero_input() {
        let coeffs = [0i32; 16];
        let mut out = [99i32; 16];
        hevc_inverse_dct_4x4(&coeffs, &mut out);
        for v in &out {
            assert_eq!(*v, 0);
        }
    }

    #[test]
    fn hevc_dct_8x8_dc_coefficient() {
        let mut coeffs = [0i32; 64];
        coeffs[0] = 128;
        let mut out = [0i32; 64];
        hevc_inverse_dct_8x8(&coeffs, &mut out);
        let first = out[0];
        assert!(first != 0);
        // All should be equal for DC
        for v in &out {
            assert_eq!(*v, first);
        }
    }

    // -- Coding tree / HevcDecoder tests ------------------------------------

    #[test]
    fn decode_coding_tree_splits_correctly() {
        let mut results = Vec::new();
        // 64x64 CTU, max_depth=3, should split down to 8x8
        decode_coding_tree(0, 0, 6, 0, 3, 26, 64, 64, &mut results);
        // 64 -> 32 -> 16 -> 8: 4^3 = 64 leaf CUs
        assert_eq!(results.len(), 64);
        for cu in &results {
            assert_eq!(cu.size, 8);
            assert_eq!(cu.pred_mode, HevcPredMode::Intra);
        }
    }

    #[test]
    fn decode_coding_tree_boundary_clipping() {
        let mut results = Vec::new();
        // 48x48 picture, 64x64 CTU at origin
        decode_coding_tree(0, 0, 6, 0, 3, 26, 48, 48, &mut results);
        // Should produce CUs, some may be at boundary but none outside
        assert!(!results.is_empty());
        for cu in &results {
            assert!(cu.x < 48);
            assert!(cu.y < 48);
        }
    }

    #[test]
    fn hevc_decoder_new_and_nal_routing() {
        let mut decoder = HevcDecoder::new();
        assert!(decoder.sps().is_none());
        assert!(decoder.pps().is_none());

        // Too-short NAL should error
        let result = decoder.decode_nal(&[0x00]);
        assert!(result.is_err());

        // Unknown NAL type should return Ok(None)
        // NAL header: nal_unit_type in bits [6:1] of first byte
        // Type 35 (AUD) = 0b100011 -> first byte = 0b0_100011_0 = 0x46
        let aud_nal = [0x46, 0x01];
        let result = decoder.decode_nal(&aud_nal);
        assert!(result.unwrap().is_none());
    }

    #[test]
    fn hevc_decoder_idr_without_sps_errors() {
        let mut decoder = HevcDecoder::new();
        // IDR_W_RADL type = 19 -> first byte bits [6:1] = 19 = 0b010011
        // first byte = 0b0_010011_0 = 0x26
        let idr_nal = [0x26, 0x01, 0x00];
        let result = decoder.decode_nal(&idr_nal);
        assert!(result.is_err());
    }

    #[test]
    fn hevc_intra_mode_from_index() {
        assert_eq!(HevcIntraMode::from_index(0), Some(HevcIntraMode::Planar));
        assert_eq!(HevcIntraMode::from_index(1), Some(HevcIntraMode::Dc));
        assert_eq!(HevcIntraMode::from_index(2), Some(HevcIntraMode::Angular2));
        assert_eq!(
            HevcIntraMode::from_index(34),
            Some(HevcIntraMode::Angular34)
        );
        assert_eq!(HevcIntraMode::from_index(35), None);
    }

    #[test]
    fn hevc_dequant_basic() {
        let mut coeffs = [100i32; 16];
        hevc_dequant(&mut coeffs, 26, 8, 2);
        // After dequant, values should have changed
        assert!(coeffs[0] != 100);
    }
}