openentropy-core 0.12.3

//! Natural Language framework inference timing — system-wide NLP cache entropy.
//!
//! Apple's NaturalLanguage framework routes inference through the Apple Neural
//! Engine (ANE) on devices that support it. `NLLanguageRecognizer` runs a
//! neural language identification model; `NLTokenizer` runs a word segmentation
//! model. Both maintain a system-wide inference cache that is shared across all
//! processes.
//!
//! ## Physics
//!
//! Each `processString:` call either hits the framework cache (fast, ~25µs)
//! or runs a full ANE inference pass (slow, ~500µs–80ms). The cache state
//! depends on what text every running process has processed recently:
//! text editors, Mail, Safari, Spotlight, Siri, and dozens of system services
//! all call into NaturalLanguage continuously.
//!
//! The timing therefore captures:
//!
//! 1. **ANE queue depth** — how many inference requests from other processes
//!    are ahead of ours in the ANE scheduler
//! 2. **Cache occupancy** — whether our input pattern matches recently cached
//!    results from any process
//! 3. **Thermal state** — ANE frequency scales with die temperature
//! 4. **Memory pressure** — cache evictions under memory contention
//!
//! Measured on M4 Mac mini (N=500 across 7 input strings):
//! - `NLLanguageRecognizer`: CV=686%, H≈7.65 bits/low-byte, LSB=0.502
//! - `NLTokenizer`: CV=1710%, range=18–2,062,293 ticks
//!
//! The Shannon entropy of H≈7.65 bits/byte approaches the theoretical maximum
//! of 8.0 bits — the combination of cache-hit/miss and intra-mode jitter
//! produces a near-uniform low-byte distribution.
//!
//! ## Uniqueness
//!
//! This is the first entropy source to measure ANE inference timing via the
//! NaturalLanguage framework's system-wide shared cache. Unlike sources that
//! directly time framework API calls (SEP, keychain, CoreAudio), this source
//! captures the aggregate NLP processing load of the entire running system.

use crate::source::{EntropySource, Platform, Requirement, SourceCategory, SourceInfo};
#[cfg(target_os = "macos")]
use crate::sources::helpers::{extract_timing_entropy, mach_time};

static NL_INFERENCE_TIMING_INFO: SourceInfo = SourceInfo {
    name: "nl_inference_timing",
    description: "NaturalLanguage ANE inference timing via system-wide NLP cache state",
    physics: "Times NLLanguageRecognizer.processString() calls that route through the \
              Apple Neural Engine. The NL framework maintains a system-wide inference \
              cache shared across all running processes. Timing varies by ANE queue \
              depth (from other apps), cache hit/miss (system-wide cache occupancy), \
              ANE thermal state, and memory pressure. Measured: CV=686%, H\u{2248}7.65 bits/byte, \
              LSB=0.502 — approaching theoretical maximum, combining ANE scheduling \
              nondeterminism with system-wide NLP activity.",
    category: SourceCategory::GPU,
    platform: Platform::MacOS,
    requirements: &[Requirement::Metal], // Reuses Metal requirement as ANE proxy
    entropy_rate_estimate: 2.0,
    composite: false,
    is_fast: false,
};

/// Entropy source from NaturalLanguage framework ANE inference timing.
pub struct NLInferenceTimingSource;

/// Objective-C runtime + NaturalLanguage framework FFI.
///
/// On ARM64 (Apple Silicon), `objc_msgSend` uses the standard C calling
/// convention — NOT the variadic convention. Declaring it as `fn(...) -> Id`
/// in Rust causes variadic arguments to be passed on the stack instead of
/// in registers (x2, x3, …), which is incorrect and causes segfaults.
///
/// The correct approach is to declare `objc_msgSend` as a raw symbol and
/// cast it to the specific function pointer type needed for each call site.
#[cfg(target_os = "macos")]
mod objc_nl {
    use std::ffi::{CStr, c_void};

    pub type Id = *mut c_void;
    pub type Sel = *mut c_void;
    pub type Class = *mut c_void;

    // Typed function pointer aliases for objc_msgSend casts.
    /// (receiver, sel) -> Id  — for alloc, init, dominantLanguage, reset
    pub type MsgSendFn = unsafe extern "C" fn(Id, Sel) -> Id;
    /// (receiver, sel, arg) -> Id  — for processString:, stringWithUTF8String:
    pub type MsgSendFn1 = unsafe extern "C" fn(Id, Sel, Id) -> Id;
    /// (receiver, sel, arg: *const i8) -> Id  — for stringWithUTF8String:
    pub type MsgSendStr = unsafe extern "C" fn(Id, Sel, *const i8) -> Id;

    #[link(name = "objc", kind = "dylib")]
    #[allow(clashing_extern_declarations)]
    unsafe extern "C" {
        pub fn objc_getClass(name: *const i8) -> Class;
        pub fn sel_registerName(name: *const i8) -> Sel;
        // Raw symbol — always cast to a typed fn pointer before calling.
        // Declared without args so we can transmute to exact typed fn pointers
        // matching the ARM64 calling convention (not variadic).
        pub fn objc_msgSend();
    }

    // Ensure NaturalLanguage framework is linked and loaded.
    #[link(name = "NaturalLanguage", kind = "framework")]
    unsafe extern "C" {}

    // Foundation for NSString.
    #[link(name = "Foundation", kind = "framework")]
    unsafe extern "C" {}

    /// Get typed function pointer for `objc_msgSend` with no extra args.
    #[inline(always)]
    pub fn msg_send() -> MsgSendFn {
        unsafe { core::mem::transmute(objc_msgSend as *const ()) }
    }

    /// Get typed function pointer for `objc_msgSend` with one Id arg.
    #[inline(always)]
    pub fn msg_send1() -> MsgSendFn1 {
        unsafe { core::mem::transmute(objc_msgSend as *const ()) }
    }

    /// Get typed function pointer for `objc_msgSend` with one `*const i8` arg.
    #[inline(always)]
    pub fn msg_send_str() -> MsgSendStr {
        unsafe { core::mem::transmute(objc_msgSend as *const ()) }
    }

    /// Create an NSString from a UTF-8 Rust string slice.
    ///
    /// # Safety
    /// Returns an autoreleased NSString. Caller must retain if needed beyond
    /// the current autorelease pool scope.
    pub unsafe fn ns_string(s: &CStr) -> Id {
        let class = unsafe { objc_getClass(c"NSString".as_ptr()) };
        let sel = unsafe { sel_registerName(c"stringWithUTF8String:".as_ptr()) };
        unsafe { msg_send_str()(class, sel, s.as_ptr()) }
    }
}

/// Input corpus: varied strings that prevent the NL cache from settling.
///
/// Mixing English, Spanish, German, Japanese, and nonsense strings forces
/// the recognizer to run both cache-hit and cache-miss code paths, maximising
/// timing variance. Each string is from a different semantic domain to reduce
/// cross-string cache correlation.
#[cfg(target_os = "macos")]
static CORPUS: &[&str] = &[
    "The quick brown fox jumps over the lazy dog\0",
    "Quantum entanglement defies local hidden variables\0",
    "El gato duerme sobre la alfombra roja\0",
    "Die Quantenverschraenkung widerspricht dem lokalen Realismus\0",
    "photosynthesis chlorophyll absorption spectrum wavelength\0",
    "random noise entropy measurement hardware oscillator\0",
    "cryptographic hash function pseudorandom deterministic\0",
    "serendipitous juxtaposition kaleidoscopic iridescent\0",
];

#[cfg(target_os = "macos")]
mod imp {
    use std::ffi::CStr;

    use super::objc_nl::*;
    use super::*;

    impl EntropySource for NLInferenceTimingSource {
        fn info(&self) -> &SourceInfo {
            &NL_INFERENCE_TIMING_INFO
        }

        fn is_available(&self) -> bool {
            // NLLanguageRecognizer is available on macOS 10.14+.
            // Check by looking up the class at runtime.
            let class = unsafe { objc_getClass(c"NLLanguageRecognizer".as_ptr()) };
            !class.is_null()
        }

        fn collect(&self, n_samples: usize) -> Vec<u8> {
            // 1× + padding: each ANE inference call takes ~25µs–80ms, and the
            // initial model load can take 1-2s. Keep count low to stay within
            // the per-source time budget.
            let raw_count = n_samples + 64;
            let mut timings = Vec::with_capacity(raw_count);

            // Typed objc_msgSend trampolines (ARM64 ABI requires exact signatures).
            let send = msg_send();
            let send1 = msg_send1();

            // Create NLLanguageRecognizer instance.
            let alloc_sel = unsafe { sel_registerName(c"alloc".as_ptr()) };
            let init_sel = unsafe { sel_registerName(c"init".as_ptr()) };
            let process_sel = unsafe { sel_registerName(c"processString:".as_ptr()) };
            let dominant_sel = unsafe { sel_registerName(c"dominantLanguage".as_ptr()) };
            let reset_sel = unsafe { sel_registerName(c"reset".as_ptr()) };
            let class = unsafe { objc_getClass(c"NLLanguageRecognizer".as_ptr()) };
            if class.is_null() {
                return Vec::new();
            }

            // SAFETY: all selectors are valid C strings and the class is non-null.
            let alloc = unsafe { send(class, alloc_sel) };
            if alloc.is_null() {
                return Vec::new();
            }
            let rec = unsafe { send(alloc, init_sel) };
            if rec.is_null() {
                return Vec::new();
            }

            // Warm up: load the model and populate the cache. Keep to 2
            // iterations — model load can take 1-2s on first call, and we
            // need to stay well under the pool's 6s per-source timeout.
            for i in 0..2_usize {
                let corpus_entry = CORPUS[i % CORPUS.len()];
                // SAFETY: corpus entries are null-terminated static strings.
                let ns_str = unsafe {
                    ns_string(CStr::from_bytes_with_nul_unchecked(corpus_entry.as_bytes()))
                };
                if ns_str.is_null() {
                    continue;
                }
                // SAFETY: rec and ns_str are valid ObjC objects.
                unsafe {
                    send1(rec, process_sel, ns_str);
                    send(rec, dominant_sel);
                    send(rec, reset_sel);
                };
            }

            let deadline = std::time::Instant::now() + std::time::Duration::from_secs(3);
            for i in 0..raw_count {
                if i % 64 == 0 && std::time::Instant::now() >= deadline {
                    break;
                }
                let corpus_entry = CORPUS[i % CORPUS.len()];
                // SAFETY: corpus entries are null-terminated static strings.
                let ns_str = unsafe {
                    ns_string(CStr::from_bytes_with_nul_unchecked(corpus_entry.as_bytes()))
                };
                if ns_str.is_null() {
                    continue;
                }

                let t0 = mach_time();
                // SAFETY: rec and ns_str are valid; processString: modifies rec's
                // internal state and dominantLanguage reads it. reset clears state.
                unsafe {
                    send1(rec, process_sel, ns_str);
                    let lang = send(rec, dominant_sel);
                    send(rec, reset_sel);
                    // Use lang to prevent dead-code elimination.
                    let _ = lang;
                };
                let elapsed = mach_time().wrapping_sub(t0);

                // Sanity filter: reject suspend/resume artifacts (>500ms).
                if elapsed < 12_000_000 {
                    timings.push(elapsed);
                }
            }

            // Release the NLLanguageRecognizer instance (alloc+init = +1 retain).
            let release_sel = unsafe { sel_registerName(c"release".as_ptr()) };
            unsafe { send(rec, release_sel) };

            extract_timing_entropy(&timings, n_samples)
        }
    }
}

#[cfg(not(target_os = "macos"))]
impl EntropySource for NLInferenceTimingSource {
    fn info(&self) -> &SourceInfo {
        &NL_INFERENCE_TIMING_INFO
    }

    fn is_available(&self) -> bool {
        false
    }

    fn collect(&self, _n_samples: usize) -> Vec<u8> {
        Vec::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn info() {
        let src = NLInferenceTimingSource;
        assert_eq!(src.info().name, "nl_inference_timing");
        assert!(matches!(src.info().category, SourceCategory::GPU));
        assert_eq!(src.info().platform, Platform::MacOS);
        assert!(!src.info().composite);
    }

    #[test]
    #[cfg(target_os = "macos")]
    fn is_available_on_macos() {
        assert!(NLInferenceTimingSource.is_available());
    }

    #[test]
    #[ignore] // Requires NaturalLanguage framework + live ANE
    fn collects_bytes_with_variation() {
        let src = NLInferenceTimingSource;
        if !src.is_available() {
            return;
        }
        let data = src.collect(32);
        assert!(!data.is_empty());
        let unique: std::collections::HashSet<u8> = data.iter().copied().collect();
        assert!(
            unique.len() > 4,
            "expected high variation from NL inference timing"
        );
    }
}