blvm-consensus 0.1.10

Bitcoin Commons BLVM: Direct mathematical implementation of Bitcoin consensus rules from the Orange Paper
//! BLVM Runtime Optimization Passes
//!
//! Additional optimization passes targeting an estimated 10-30% performance gain.
//!
//! This module provides runtime optimization passes:
//! - Constant folding (pre-computed constants)
//! - Bounds check optimization (proven bounds)
//! - Inlining hints (hot function markers)
//! - Memory layout optimization (cache-friendly structures)
//!
//! Reference: Orange Paper Section 13.1 - Performance Considerations

use crate::constants::*;

/// Pre-computed constants for constant folding optimization
///
/// These constants are computed at compile time to avoid runtime computation
/// in hot paths. Reference: BLVM Optimization Pass 2 - Constant Folding
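///
/// # Examples
/// ```rust
/// // Hedged sketch: assumes the `production` feature, which gates this module,
/// // and that MAX_MONEY_U64 is the total supply cap in satoshis.
/// use blvm_consensus::optimizations::precomputed_constants::MAX_MONEY_U64;
///
/// let output_value: u64 = 5_000_000_000; // 50 BTC in satoshis
/// assert!(output_value <= MAX_MONEY_U64);
/// ```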
#[cfg(feature = "production")]
pub mod precomputed_constants {
    use super::*;

    /// Pre-computed: 2^64 - 1 (used for wrapping arithmetic checks)
    pub const U64_MAX: u64 = u64::MAX;

    /// Pre-computed: MAX_MONEY as u64 (for comparisons)
    pub const MAX_MONEY_U64: u64 = MAX_MONEY as u64;

    /// Pre-computed: Inverse of SATOSHIS_PER_BTC (for BTC conversion)
    pub const BTC_PER_SATOSHI: f64 = 1.0 / (SATOSHIS_PER_BTC as f64);

    /// Pre-computed: 2^32 - 1 (for 32-bit wrapping checks)
    pub const U32_MAX: u32 = u32::MAX;

    /// Pre-computed: Number of satoshis in 1 BTC (for readability)
    pub const ONE_BTC_SATOSHIS: i64 = SATOSHIS_PER_BTC;
}

/// Memory layout optimization: cache-friendly hash array
///
/// Cache-aligned hash for optimized batch operations. Each hash is aligned to
/// a 32-byte boundary, which:
/// - Reduces cache line splits
/// - Improves prefetching behavior
/// - Better fits modern CPU cache architectures (64-byte cache lines)
///
/// Reference: BLVM Optimization Pass 3 - Memory Layout Optimization
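///
/// # Examples
/// ```rust
/// use blvm_consensus::optimizations::CacheAlignedHash;
///
/// // Hedged sketch: wrap a raw 32-byte digest; the type guarantees alignment.
/// let h = CacheAlignedHash::new([0u8; 32]);
/// assert_eq!(h.as_bytes(), &[0u8; 32]);
/// assert_eq!(std::mem::align_of::<CacheAlignedHash>(), 32);
/// ```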
#[repr(align(32))]
#[derive(Clone)]
pub struct CacheAlignedHash([u8; 32]);

impl CacheAlignedHash {
    #[inline]
    pub fn new(hash: [u8; 32]) -> Self {
        Self(hash)
    }

    #[inline]
    pub fn as_bytes(&self) -> &[u8; 32] {
        &self.0
    }
}

/// Memory prefetching optimization
///
/// Provides platform-specific prefetch hints to improve cache performance
/// for sequential memory accesses. Used before batch UTXO lookups and
/// other sequential data structure traversals.
///
/// Reference: BLVM Optimization Pass 1.3 - Memory Prefetching
#[cfg(feature = "production")]
pub mod prefetch {
    /// Prefetch data for read access
    ///
    /// Hints the CPU to prefetch data into cache before it's needed.
    /// This improves performance for sequential memory access patterns.
    ///
    /// # Safety
    /// The pointer must be valid, but it doesn't need to be dereferenceable
    /// at the time of the call. The prefetch is a hint and may be ignored.
    #[cfg(target_arch = "x86_64")]
    #[inline(always)]
    pub unsafe fn prefetch_read(ptr: *const i8) {
        use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};
        _mm_prefetch::<_MM_HINT_T0>(ptr);
    }

    #[cfg(target_arch = "aarch64")]
    #[inline(always)]
    pub unsafe fn prefetch_read(ptr: *const i8) {
        use std::arch::aarch64::_prefetch;
        _prefetch(ptr, 0, 0); // rw = 0 (read), locality = 0 (streaming / use-once data)
    }

    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    #[inline(always)]
    pub unsafe fn prefetch_read(_ptr: *const i8) {
        // No-op for unsupported architectures
    }

    /// Prefetch a slice of data for sequential access
    ///
    /// Prefetches the next cache line(s) of data to improve sequential access.
    /// Safe wrapper around prefetch_read that works with slices.
    #[inline(always)]
    pub fn prefetch_slice<T>(slice: &[T], index: usize) {
        if index < slice.len() {
            unsafe {
                let ptr = slice.as_ptr().add(index) as *const i8;
                prefetch_read(ptr);
            }
        }
    }

    /// Prefetch multiple elements ahead in a slice
    ///
    /// Prefetches elements at `index + offset` to prepare for future access.
    /// Useful for sequential loops where you know you'll access elements ahead.
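    ///
    /// # Examples
    /// ```rust
    /// // Hedged sketch: prefetch a few elements ahead while walking a slice
    /// // (the `production` feature gates this module).
    /// use blvm_consensus::optimizations::prefetch::prefetch_ahead;
    ///
    /// let data: Vec<u64> = (0..64u64).collect();
    /// let mut sum = 0u64;
    /// for i in 0..data.len() {
    ///     prefetch_ahead(&data, i, 4); // hint: element i + 4 is needed soon
    ///     sum += data[i];
    /// }
    /// assert_eq!(sum, 2016);
    /// ```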
    #[inline(always)]
    pub fn prefetch_ahead<T>(slice: &[T], index: usize, offset: usize) {
        let prefetch_index = index.saturating_add(offset);
        prefetch_slice(slice, prefetch_index);
    }
}

/// Memory layout optimization: compact stack frame
///
/// Compact stack frame for script execution, laid out for cache locality.
/// `#[repr(C, packed)]` removes all padding, shrinking each frame from
/// 12 bytes to 9.
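///
/// # Examples
/// ```rust
/// use blvm_consensus::optimizations::CompactStackFrame;
///
/// // Hedged sketch: 0x76 is OP_DUP; the packed layout is 9 bytes total.
/// let _frame = CompactStackFrame::new(0x76, 0, 0, 1);
/// assert_eq!(std::mem::size_of::<CompactStackFrame>(), 9);
/// ```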
#[repr(C, packed)]
pub struct CompactStackFrame {
    pub opcode: u8,
    pub flags: u32,
    pub script_offset: u16,
    pub stack_height: u16,
}

impl CompactStackFrame {
    #[inline]
    pub fn new(opcode: u8, flags: u32, script_offset: u16, stack_height: u16) -> Self {
        Self {
            opcode,
            flags,
            script_offset,
            stack_height,
        }
    }
}

/// Inlining hints for hot functions
///
/// Wrap a function in `hot_inline!` to mark it `#[inline(always)]`.
/// Such functions are called in tight loops and benefit from aggressive inlining.
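///
/// # Examples
/// ```rust
/// // Hedged sketch: the macro wraps an item and applies #[inline(always)] to it
/// // (the `production` feature gates this macro).
/// use blvm_consensus::hot_inline;
///
/// hot_inline! {
///     fn add_one(x: u64) -> u64 {
///         x + 1
///     }
/// }
/// assert_eq!(add_one(41), 42);
/// ```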
#[macro_export]
#[cfg(feature = "production")]
macro_rules! hot_inline {
    ($item:item) => {
        #[inline(always)]
        $item
    };
}

/// Constant folding: Pre-compute common hash results
///
/// Caches common hash pre-images for constant folding.
#[cfg(feature = "production")]
pub mod constant_folding {
    /// Pre-computed: SHA256 of empty string
    pub const EMPTY_STRING_HASH: [u8; 32] = [
        0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9,
        0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52,
        0xb8, 0x55,
    ];

    /// Pre-computed: Double SHA256 of empty string
    pub const EMPTY_STRING_DOUBLE_HASH: [u8; 32] = [
        0x5d, 0xf6, 0xe0, 0xe2, 0x76, 0x13, 0x59, 0xf3, 0x73, 0x9a, 0x1c, 0x6f, 0x87, 0x40, 0x64,
        0x0a, 0xf1, 0x2e, 0xc7, 0xc3, 0x72, 0x4a, 0x5c, 0x2c, 0xa5, 0xf3, 0x0f, 0x26, 0x60, 0x87,
        0x7e, 0x6b,
    ];

    /// Check if input matches empty string hash (constant folding)
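    ///
    /// # Examples
    /// ```rust
    /// // Hedged sketch: compares against the precomputed SHA256("") digest,
    /// // skipping a hash computation for the common empty-input case.
    /// use blvm_consensus::optimizations::constant_folding::{is_empty_hash, EMPTY_STRING_HASH};
    ///
    /// assert!(is_empty_hash(&EMPTY_STRING_HASH));
    /// assert!(!is_empty_hash(&[0u8; 32]));
    /// ```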
    #[inline(always)]
    pub fn is_empty_hash(hash: &[u8; 32]) -> bool {
        *hash == EMPTY_STRING_HASH
    }

    /// Check if input matches empty string double hash (constant folding)
    #[inline(always)]
    pub fn is_empty_double_hash(hash: &[u8; 32]) -> bool {
        *hash == EMPTY_STRING_DOUBLE_HASH
    }

    /// Constant-fold: Check if hash is zero (all zeros)
    #[inline(always)]
    pub fn is_zero_hash(hash: &[u8; 32]) -> bool {
        hash.iter().all(|&b| b == 0)
    }
}

/// Dead code elimination markers
///
/// Functions/constants marked with this can be eliminated if unused.
#[cfg(feature = "production")]
#[allow(dead_code)]
pub mod dead_code_elimination {
    /// Mark code for dead code elimination analysis
    /// This is a marker function - the compiler can eliminate unused paths
    #[inline(never)]
    #[cold]
    pub fn mark_unused() {
        // This function never executes in production builds
        // It's a marker for dead code elimination pass
    }

    /// Hint to compiler that branch is unlikely (dead code elimination)
    ///
    /// Note: In stable Rust, this is a no-op but serves as documentation
    /// for future optimization opportunities (unstable `likely`/`unlikely` intrinsics).
    #[inline(always)]
    pub fn unlikely(condition: bool) -> bool {
        // Stable Rust doesn't have likely/unlikely intrinsics
        // This is a placeholder for future optimization
        condition
    }
}

/// SIMD Vectorization: Batch hash operations
///
/// Provides batch hash processing for parallel hash operations.
/// Leverages the SIMD support in the sha2 crate (`asm` feature) plus Rayon for
/// multi-core parallelization.
///
/// Provides batch functions for:
/// - SHA256 and double SHA256 (Bitcoin standard)
/// - RIPEMD160 and HASH160 (OP_HASH160)
///
/// Uses chunked processing for better cache locality and parallelizes across CPU cores
/// when batch size is large enough (≥8 items).
///
/// Reference: BLVM Optimization Pass 5 - SIMD Vectorization
#[cfg(feature = "production")]
pub mod simd_vectorization {
    use crate::crypto::OptimizedSha256;
    use digest::Digest;
    use ripemd::Ripemd160;

    /// Minimum batch size for parallelization; the overhead is not worth it for
    /// smaller batches. batch_sha256 uses OptimizedSha256 (SHA-NI when available)
    /// for consistency with batch_double_sha256_aligned.
    const PARALLEL_THRESHOLD: usize = 8;

    /// Chunk size for cache-friendly processing. Hardware-derived via ibd_tuning.
    #[inline]
    fn chunk_size() -> usize {
        blvm_primitives::ibd_tuning::hash_batch_chunk_size()
    }

    /// Batch SHA256: Compute SHA256 for multiple independent inputs
    ///
    /// # Arguments
    /// * `inputs` - Slice of byte slices to hash
    ///
    /// # Returns
    /// Vector of 32-byte hashes, one per input (in same order)
    ///
    /// # Performance
    /// - Small batches (< 4 items): Sequential (overhead not worth parallelization)
    /// - Medium batches (4-7 items): Chunked sequential
    /// - Large batches (≥8 items): Multi-core parallelization with Rayon
    ///
    /// # Optimizations
    /// - Uses sha2 crate with "asm" feature for optimized assembly
    /// - For large batches, leverages Rayon for multi-core parallelization
    /// - AVX2 batch optimization available via `crypto::avx2_batch` module
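    ///
    /// # Examples
    /// ```rust
    /// // Hedged sketch: assumes the `production` feature, which gates this module.
    /// use blvm_consensus::optimizations::simd_vectorization::batch_sha256;
    ///
    /// let inputs: [&[u8]; 2] = [b"hello", b"world"];
    /// let hashes = batch_sha256(&inputs);
    /// assert_eq!(hashes.len(), 2);
    /// // Each digest is a standard 32-byte SHA256 output.
    /// assert_eq!(hashes[0].len(), 32);
    /// ```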
    pub fn batch_sha256(inputs: &[&[u8]]) -> Vec<[u8; 32]> {
        if inputs.is_empty() {
            return Vec::new();
        }

        // Small batches: sequential processing. Use OptimizedSha256 (SHA-NI when available).
        if inputs.len() < 4 {
            let hasher = OptimizedSha256::new();
            return inputs.iter().map(|input| hasher.hash(input)).collect();
        }

        // Medium batches: chunked sequential processing
        if inputs.len() < PARALLEL_THRESHOLD {
            let hasher = OptimizedSha256::new();
            let mut results = Vec::with_capacity(inputs.len());
            for chunk in inputs.chunks(chunk_size()) {
                for input in chunk {
                    results.push(hasher.hash(input));
                }
            }
            return results;
        }

        // Large batches: try AVX2 first, then fall back to multi-core parallelization
        #[cfg(target_arch = "x86_64")]
        {
            use crate::crypto::sha256_avx2;
            if sha256_avx2::is_avx2_available() {
                // Use AVX2 batch processing for chunks of 8
                use crate::crypto::avx2_batch;
                return avx2_batch::batch_sha256_avx2(inputs);
            }
        }

        // Fallback: Multi-core parallelization using Rayon. Each worker gets OptimizedSha256.
        use rayon::prelude::*;

        inputs
            .par_chunks(chunk_size())
            .map(|chunk| {
                let hasher = OptimizedSha256::new();
                chunk
                    .iter()
                    .map(|input| hasher.hash(input))
                    .collect::<Vec<_>>()
            })
            .flatten()
            .collect()
    }

    /// Batch double SHA256: Compute SHA256(SHA256(x)) for multiple inputs
    ///
    /// This is Bitcoin's standard hash function used for transaction IDs, block hashes, etc.
    ///
    /// # Arguments
    /// * `inputs` - Slice of byte slices to hash
    ///
    /// # Returns
    /// Vector of 32-byte hashes, one per input (in same order)
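    ///
    /// # Examples
    /// ```rust
    /// // Hedged sketch: double SHA256 of the empty input equals the precomputed
    /// // constant in `constant_folding`.
    /// use blvm_consensus::optimizations::simd_vectorization::batch_double_sha256;
    /// use blvm_consensus::optimizations::constant_folding::EMPTY_STRING_DOUBLE_HASH;
    ///
    /// let inputs: [&[u8]; 1] = [b""];
    /// let hashes = batch_double_sha256(&inputs);
    /// assert_eq!(hashes[0], EMPTY_STRING_DOUBLE_HASH);
    /// ```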
    pub fn batch_double_sha256(inputs: &[&[u8]]) -> Vec<[u8; 32]> {
        // Use aligned version for better cache performance
        batch_double_sha256_aligned(inputs)
            .into_iter()
            .map(|h| *h.as_bytes())
            .collect()
    }

    /// Batch double SHA256 with cache-aligned output
    ///
    /// Returns cache-aligned hash structures for better memory performance.
    /// Uses 32-byte alignment for optimal cache line utilization.
    ///
    /// # Arguments
    /// * `inputs` - Slice of byte slices to hash
    ///
    /// # Returns
    /// Vector of cache-aligned 32-byte hashes, one per input (in same order)
    pub fn batch_double_sha256_aligned(inputs: &[&[u8]]) -> Vec<super::CacheAlignedHash> {
        if inputs.is_empty() {
            return Vec::new();
        }

        // Small batches: sequential processing (overhead not worth it)
        // Use OptimizedSha256 (SHA-NI when available) instead of sha2
        let hasher = OptimizedSha256::new();
        if inputs.len() < 4 {
            return inputs
                .iter()
                .map(|input| super::CacheAlignedHash::new(hasher.hash256(input)))
                .collect();
        }

        // Medium batches: chunked sequential processing
        if inputs.len() < PARALLEL_THRESHOLD {
            let mut results = Vec::with_capacity(inputs.len());
            for chunk in inputs.chunks(chunk_size()) {
                for input in chunk {
                    results.push(super::CacheAlignedHash::new(hasher.hash256(input)));
                }
            }
            return results;
        }

        // Large batches: parallelized processing using Rayon
        // Each worker gets SHA-NI via OptimizedSha256 (runtime detection)
        use rayon::prelude::*;

        inputs
            .par_chunks(chunk_size())
            .map(|chunk| {
                let hasher = OptimizedSha256::new();
                chunk
                    .iter()
                    .map(|input| super::CacheAlignedHash::new(hasher.hash256(input)))
                    .collect::<Vec<_>>()
            })
            .flatten()
            .collect()
    }

    /// Batch RIPEMD160: Compute RIPEMD160 for multiple inputs
    ///
    /// # Arguments
    /// * `inputs` - Slice of byte slices to hash
    ///
    /// # Returns
    /// Vector of 20-byte hashes, one per input (in same order)
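    ///
    /// # Examples
    /// ```rust
    /// // Hedged sketch: one 20-byte digest per input, order preserved.
    /// use blvm_consensus::optimizations::simd_vectorization::batch_ripemd160;
    ///
    /// let inputs: [&[u8]; 2] = [b"abc", b"def"];
    /// let hashes = batch_ripemd160(&inputs);
    /// assert_eq!(hashes.len(), 2);
    /// assert_eq!(hashes[0].len(), 20);
    /// ```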
    pub fn batch_ripemd160(inputs: &[&[u8]]) -> Vec<[u8; 20]> {
        if inputs.is_empty() {
            return Vec::new();
        }

        // Small batches: sequential processing
        if inputs.len() < 4 {
            return inputs
                .iter()
                .map(|input| {
                    let hash = Ripemd160::digest(input);
                    let mut result = [0u8; 20];
                    result.copy_from_slice(&hash);
                    result
                })
                .collect();
        }

        // Medium batches: chunked sequential processing
        if inputs.len() < PARALLEL_THRESHOLD {
            let mut results = Vec::with_capacity(inputs.len());
            for chunk in inputs.chunks(chunk_size()) {
                for input in chunk {
                    let hash = Ripemd160::digest(input);
                    let mut result = [0u8; 20];
                    result.copy_from_slice(&hash);
                    results.push(result);
                }
            }
            return results;
        }

        // Large batches: parallelized processing
        // Rayon is enabled via the 'production' feature
        use rayon::prelude::*;

        inputs
            .par_chunks(chunk_size())
            .map(|chunk| {
                chunk
                    .iter()
                    .map(|input| {
                        let hash = Ripemd160::digest(input);
                        let mut result = [0u8; 20];
                        result.copy_from_slice(&hash);
                        result
                    })
                    .collect::<Vec<_>>()
            })
            .flatten()
            .collect()
    }

    /// Batch HASH160: Compute RIPEMD160(SHA256(x)) for multiple inputs
    ///
    /// This is Bitcoin's HASH160 operation (OP_HASH160 in script).
    ///
    /// # Arguments
    /// * `inputs` - Slice of byte slices to hash
    ///
    /// # Returns
    /// Vector of 20-byte hashes, one per input (in same order)
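    ///
    /// # Examples
    /// ```rust
    /// // Hedged sketch: HASH160 = RIPEMD160(SHA256(x)), always 20 bytes per input.
    /// use blvm_consensus::optimizations::simd_vectorization::batch_hash160;
    ///
    /// let inputs: [&[u8]; 3] = [b"key1", b"key2", b"key3"];
    /// let hashes = batch_hash160(&inputs);
    /// assert_eq!(hashes.len(), 3);
    /// assert_eq!(hashes[0].len(), 20);
    /// ```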
    pub fn batch_hash160(inputs: &[&[u8]]) -> Vec<[u8; 20]> {
        if inputs.is_empty() {
            return Vec::new();
        }

        // Small batches: sequential processing. Use OptimizedSha256 (SHA-NI) for SHA256 part.
        if inputs.len() < 4 {
            let hasher = OptimizedSha256::new();
            return inputs
                .iter()
                .map(|input| {
                    let sha256_hash: [u8; 32] = hasher.hash(input);
                    let ripemd160_hash = Ripemd160::digest(sha256_hash);
                    let mut result = [0u8; 20];
                    result.copy_from_slice(&ripemd160_hash);
                    result
                })
                .collect();
        }

        // Medium batches: chunked sequential processing
        if inputs.len() < PARALLEL_THRESHOLD {
            let hasher = OptimizedSha256::new();
            let mut results = Vec::with_capacity(inputs.len());
            for chunk in inputs.chunks(chunk_size()) {
                for input in chunk {
                    let sha256_hash: [u8; 32] = hasher.hash(input);
                    let ripemd160_hash = Ripemd160::digest(sha256_hash);
                    let mut result = [0u8; 20];
                    result.copy_from_slice(&ripemd160_hash);
                    results.push(result);
                }
            }
            return results;
        }

        // Large batches: parallelized processing. Each worker gets OptimizedSha256.
        use rayon::prelude::*;

        inputs
            .par_chunks(chunk_size())
            .map(|chunk| {
                let hasher = OptimizedSha256::new();
                chunk
                    .iter()
                    .map(|input| {
                        let sha256_hash: [u8; 32] = hasher.hash(input);
                        let ripemd160_hash = Ripemd160::digest(sha256_hash);
                        let mut result = [0u8; 20];
                        result.copy_from_slice(&ripemd160_hash);
                        result
                    })
                    .collect::<Vec<_>>()
            })
            .flatten()
            .collect()
    }
}

#[cfg(feature = "production")]
pub use constant_folding::*;
#[cfg(feature = "production")]
pub use precomputed_constants::*;

/// Proven runtime bounds for BLVM optimizations
///
/// These bounds have been formally proven and can be used for runtime
/// optimizations without additional safety checks. Unlike proof-time limits
/// (in `_helpers::proof_limits`), these represent actual Bitcoin limits that
/// have been proven to hold in all cases.
///
/// Reference: BLVM Optimization Pass
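///
/// # Examples
/// ```rust
/// // Hedged sketch: size-check a serialized transaction against the proven bound.
/// use blvm_consensus::optimizations::proven_bounds::MAX_TX_SIZE_PROVEN;
///
/// let raw_tx = vec![0u8; 250];
/// assert!(raw_tx.len() <= MAX_TX_SIZE_PROVEN);
/// ```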
#[cfg(feature = "production")]
pub mod proven_bounds {
    use crate::constants::{MAX_INPUTS, MAX_OUTPUTS};

    /// Maximum transaction size (proven by formal verification in transaction.rs)
    pub const MAX_TX_SIZE_PROVEN: usize = 100000; // Bytes

    /// Maximum block size (proven by formal verification in block.rs)
    pub const MAX_BLOCK_SIZE_PROVEN: usize = 4000000; // Bytes (4MB)

    /// Maximum inputs per transaction (proven by formal verification)
    /// References actual Bitcoin limit from constants.rs
    pub const MAX_INPUTS_PROVEN: usize = MAX_INPUTS;

    /// Maximum outputs per transaction (proven by formal verification)
    /// References actual Bitcoin limit from constants.rs
    pub const MAX_OUTPUTS_PROVEN: usize = MAX_OUTPUTS;

    /// Maximum transactions per block (proven by formal verification)
    /// Note: consensus rules place no explicit limit on transaction count, but the
    /// practical limit is around 10,000 transactions per block given block size limits.
    pub const MAX_TRANSACTIONS_PROVEN: usize = 10000;

    /// Maximum previous headers for difficulty adjustment (proven by formal verification)
    pub const MAX_PREV_HEADERS_PROVEN: usize = 5;
}

/// Optimized access using proven bounds
///
/// Uses bounds proven by formal verification to optimize runtime access.
/// This is safe because formal proofs guarantee these bounds hold.
///
/// Reference: Formal proofs in transaction.rs, block.rs, mining.rs, pow.rs, etc.
/// These proofs formally verify that certain bounds always hold, allowing us to
/// use optimized access patterns without runtime bounds checks.
#[cfg(feature = "production")]
pub mod optimized_access {
    use super::proven_bounds;

    /// Get element with proven bounds check
    ///
    /// Uses proven maximum sizes to optimize bounds checking.
    /// For transactions proven to have <= MAX_INPUTS_PROVEN inputs,
    /// we can use optimized access patterns.
    ///
    /// # Safety
    /// This function is safe because formal proofs guarantee bounds.
    /// However, it still returns `Option` to handle cases where:
    /// - Runtime bounds differ from proof bounds (should not happen in practice)
    /// - Defensive programming (fail-safe)
    ///
    /// # Panics
    /// Never panics - always returns `None` if out of bounds.
    ///
    /// # Examples
    /// ```rust
    /// use blvm_consensus::optimizations::optimized_access::get_proven;
    /// use blvm_consensus::types::Transaction;
    ///
    /// # let tx = Transaction { version: 1, inputs: vec![].into(), outputs: vec![].into(), lock_time: 0 };
    /// # let index = 0;
    /// if let Some(input) = get_proven(&tx.inputs, index) {
    ///     // Safe to use
    /// }
    /// ```
    #[inline(always)]
    pub fn get_proven<T>(slice: &[T], index: usize) -> Option<&T> {
        // Formal proofs have proven index < MAX_SIZE in various proofs
        // We can use unsafe access for proven-safe indices
        // This is safe because formal proofs guarantee bounds
        if index < slice.len() {
            unsafe { Some(slice.get_unchecked(index)) }
        } else {
            None
        }
    }

    /// Pre-allocate buffer using proven maximum size
    ///
    /// Uses proven maximum sizes to avoid reallocation.
    /// For example, transaction buffers can be pre-sized to MAX_TX_SIZE_PROVEN.
    #[inline(always)]
    pub fn prealloc_proven<T>(max_size: usize) -> Vec<T> {
        // Pre-allocate to proven maximum to avoid reallocation
        Vec::with_capacity(max_size)
    }

    /// Pre-allocate transaction buffer using proven maximum
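    ///
    /// # Examples
    /// ```rust
    /// // Hedged sketch: the buffer starts with capacity for the proven
    /// // 100,000-byte maximum, so growth up to that size never reallocates.
    /// use blvm_consensus::optimizations::prealloc_tx_buffer;
    ///
    /// let buf = prealloc_tx_buffer();
    /// assert!(buf.capacity() >= 100_000);
    /// ```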
    #[inline(always)]
    pub fn prealloc_tx_buffer() -> Vec<u8> {
        prealloc_proven::<u8>(proven_bounds::MAX_TX_SIZE_PROVEN)
    }

    /// Pre-allocate block buffer using proven maximum
    #[inline(always)]
    pub fn prealloc_block_buffer() -> Vec<u8> {
        prealloc_proven::<u8>(proven_bounds::MAX_BLOCK_SIZE_PROVEN)
    }

    /// Get element with proven bounds (alias for get_proven for compatibility)
    #[inline(always)]
    pub fn get_proven_by_<T>(slice: &[T], index: usize) -> Option<&T> {
        get_proven(slice, index)
    }
}

/// Alias module for _optimized_access (for backward compatibility)
#[cfg(feature = "production")]
pub mod _optimized_access {
    use super::optimized_access;

    /// Get element with proven bounds
    #[inline(always)]
    pub fn get_proven_by_<T>(slice: &[T], index: usize) -> Option<&T> {
        optimized_access::get_proven(slice, index)
    }
}

/// Re-export prealloc helpers for convenience
#[cfg(feature = "production")]
pub use optimized_access::{prealloc_block_buffer, prealloc_tx_buffer};

/// Reference implementations for equivalence proofs
///
/// These are safe versions of optimized functions, used to prove
/// that optimizations are correct via formal verification.
#[cfg(feature = "production")]
pub mod reference_implementations {
    /// Reference (safe) implementation of get_proven
    /// This is the version we prove equivalence against
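    ///
    /// # Examples
    /// ```rust
    /// // Hedged sketch: the reference behaves exactly like `slice.get(index)`.
    /// use blvm_consensus::optimizations::reference_implementations::get_proven_reference;
    ///
    /// let xs = [1, 2, 3];
    /// assert_eq!(get_proven_reference(&xs, 2), Some(&3));
    /// assert_eq!(get_proven_reference(&xs, 3), None);
    /// ```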
    #[inline(always)]
    pub fn get_proven_reference<T>(slice: &[T], index: usize) -> Option<&T> {
        slice.get(index) // Safe version
    }
}

/// Runtime assertions for optimization correctness
///
/// These functions provide runtime checks in debug builds to verify
/// that optimizations match their reference implementations.
#[cfg(all(
    feature = "production",
    any(debug_assertions, feature = "runtime-invariants")
))]
pub mod runtime_assertions {
    use super::optimized_access::get_proven;
    use super::reference_implementations::get_proven_reference;

    /// Checked version of get_proven with runtime assertions
    ///
    /// This function performs runtime checks in debug builds to ensure
    /// the optimized implementation matches the reference implementation.
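    ///
    /// # Examples
    /// ```rust
    /// // Hedged sketch: assumes a debug build (or the `runtime-invariants`
    /// // feature), which gates this module. The caller sees the normal Option.
    /// use blvm_consensus::optimizations::runtime_assertions::get_proven_checked;
    ///
    /// let xs = [10, 20, 30];
    /// assert_eq!(get_proven_checked(&xs, 1), Some(&20));
    /// assert_eq!(get_proven_checked(&xs, 9), None);
    /// ```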
    #[inline(always)]
    pub fn get_proven_checked<T>(slice: &[T], index: usize) -> Option<&T> {
        let result_optimized = get_proven(slice, index);
        let result_reference = get_proven_reference(slice, index);

        // Runtime check: both must agree
        debug_assert_eq!(
            result_optimized.is_some(),
            result_reference.is_some(),
            "Optimization correctness check failed: optimized and reference disagree on Some/None"
        );

        if let (Some(opt_val), Some(ref_val)) = (result_optimized, result_reference) {
            debug_assert_eq!(
                opt_val as *const T,
                ref_val as *const T,
                "Optimization correctness check failed: optimized and reference return different pointers"
            );
        }

        result_optimized
    }
}