// openentropy_core/sources/gpu/gpu_divergence.rs

//! GPU shader thread divergence — intra-warp nondeterminism entropy.
//!
//! GPU threads (SIMD groups) should execute in lockstep but don't due to:
//! - Warp divergence from conditional branches
//! - Memory coalescing failures
//! - Thermal effects on GPU clock frequency
//! - L2 cache bank conflicts
//!
//! We dispatch a Metal compute shader where threads race to atomically
//! increment a counter. The execution order captures GPU scheduling
//! nondeterminism that is genuinely novel as an entropy source.
//!
//! Uses direct Metal framework FFI via Objective-C runtime — no external
//! process spawning. Each dispatch completes in microseconds.

use crate::source::{EntropySource, Platform, Requirement, SourceCategory, SourceInfo};
#[cfg(target_os = "macos")]
use crate::sources::helpers::{extract_timing_entropy, mach_time, xor_fold_u64};
#[cfg(target_os = "macos")]
use std::sync::OnceLock;

/// Static registry metadata for this source: name, human-readable physics
/// rationale, platform/requirement gating, and scheduling hints.
static GPU_DIVERGENCE_INFO: SourceInfo = SourceInfo {
    name: "gpu_divergence",
    description: "GPU shader thread execution order divergence entropy",
    physics: "Dispatches Metal compute shaders where parallel threads race to atomically \
              increment a shared counter. The execution order captures GPU scheduling \
              nondeterminism from: SIMD group divergence on conditional branches, memory \
              coalescing failures, L2 cache bank conflicts, thermal-dependent GPU clock \
              frequency variation, and warp scheduler arbitration. Each dispatch produces \
              a different execution ordering due to physical nondeterminism in the GPU.",
    category: SourceCategory::GPU,
    // Metal is macOS-only; `collect()` returns an empty Vec on other platforms.
    platform: Platform::MacOS,
    requirements: &[Requirement::Metal],
    // NOTE(review): presumably estimated entropy bits per collected byte —
    // confirm units against the SourceInfo definition.
    entropy_rate_estimate: 4.0,
    composite: false,
    is_fast: true,
};

/// Entropy source that harvests thread execution order divergence from Metal GPU.
///
/// Unit struct: all Metal state (device, queue, pipeline, buffers) is created
/// per `collect()` call via `metal::MetalState`, not stored here.
pub struct GPUDivergenceSource;

/// Metal framework FFI via Objective-C runtime (macOS only).
///
/// All Objective-C calls go through `objc_msgSend`, cast per call site to the
/// correct concrete signature with `msg_send_fn!`. Ownership follows Cocoa
/// conventions: `new*` methods return retained (+1) objects that we must
/// `release`; every release in this module goes through the null-safe
/// `objc_release` helper so cleanup paths stay uniform.
#[cfg(target_os = "macos")]
mod metal {
    use std::ffi::{CString, c_void};

    // Objective-C runtime types — all opaque pointers.
    type Id = *mut c_void;
    type Sel = *mut c_void;
    type Class = *mut c_void;

    #[link(name = "objc", kind = "dylib")]
    unsafe extern "C" {
        fn objc_getClass(name: *const i8) -> Class;
        fn sel_registerName(name: *const i8) -> Sel;
        fn objc_msgSend(receiver: Id, sel: Sel, ...) -> Id;
    }

    // Metal framework link — ensures the framework is loaded.
    #[link(name = "Metal", kind = "framework")]
    unsafe extern "C" {
        fn MTLCreateSystemDefaultDevice() -> Id;
    }

    /// Number of GPU threads per dispatch.
    const THREADS: u32 = 256;

    /// Metal shader source: threads race to atomically increment a counter.
    /// The `order` output captures the nondeterministic execution ordering.
    const SHADER_SOURCE: &str = r#"
#include <metal_stdlib>
using namespace metal;
kernel void divergence(
    device atomic_uint *counter [[buffer(0)]],
    device uint *output [[buffer(1)]],
    uint tid [[thread_position_in_grid]]
) {
    // Data-dependent work to create divergence.
    uint val = tid;
    for (uint i = 0; i < 16; i++) {
        if (val & 1) { val = val * 3 + 1; }
        else { val = val >> 1; }
    }
    // Atomic increment — order captures scheduling nondeterminism.
    uint order = atomic_fetch_add_explicit(counter, 1, memory_order_relaxed);
    output[tid] = order ^ val;
}
"#;

    /// Opaque handle to a reusable Metal pipeline + buffers.
    ///
    /// All fields are retained Objective-C objects; `Drop` releases them.
    pub struct MetalState {
        _device: Id,
        queue: Id,
        pipeline: Id,
        counter_buf: Id,
        output_buf: Id,
    }

    // SAFETY: Metal objects are reference-counted and thread-safe.
    // We only use them from a single thread within `collect()`.
    unsafe impl Send for MetalState {}

    /// Cast `objc_msgSend` to a concrete function pointer type.
    ///
    /// We must go through a raw pointer because `objc_msgSend` is a variadic
    /// extern fn which is a zero-sized type that cannot be transmuted directly.
    macro_rules! msg_send_fn {
        ($ty:ty) => {
            std::mem::transmute::<*const (), $ty>(objc_msgSend as *const ())
        };
    }

    impl MetalState {
        /// Try to initialize Metal device, compile shader, create buffers.
        ///
        /// Returns `None` — releasing everything acquired so far — if any
        /// step fails: no GPU, command-queue creation failure, shader
        /// compilation failure, or buffer allocation failure.
        pub fn new() -> Option<Self> {
            unsafe {
                // SAFETY: MTLCreateSystemDefaultDevice returns a retained Metal device
                // object or null if no GPU is available.
                let device = MTLCreateSystemDefaultDevice();
                if device.is_null() {
                    return None;
                }

                let queue = msg_send(device, "newCommandQueue");
                if queue.is_null() {
                    // Release device to avoid leak.
                    objc_release(device);
                    return None;
                }

                let pipeline = match compile_shader(device) {
                    Some(p) => p,
                    None => {
                        objc_release(queue);
                        objc_release(device);
                        return None;
                    }
                };

                let counter_buf = new_buffer(device, 4); // 1 x uint32
                let output_buf = new_buffer(device, THREADS as u64 * 4); // THREADS x uint32
                if counter_buf.is_null() || output_buf.is_null() {
                    // `objc_release` is a no-op on null, so both partial-failure
                    // cases (either buffer null) are handled uniformly.
                    objc_release(output_buf);
                    objc_release(counter_buf);
                    objc_release(pipeline);
                    objc_release(queue);
                    objc_release(device);
                    return None;
                }

                Some(MetalState {
                    _device: device,
                    queue,
                    pipeline,
                    counter_buf,
                    output_buf,
                })
            }
        }

        /// Dispatch one compute pass and return the output buffer contents.
        ///
        /// Blocks until the GPU finishes (`waitUntilCompleted`). Returns
        /// `None` if the command buffer, encoder, or a buffer `contents`
        /// pointer cannot be obtained.
        pub fn dispatch(&self) -> Option<Vec<u32>> {
            unsafe {
                // Zero the counter.
                // SAFETY: counter_buf is a shared MTLBuffer we created. `contents` returns
                // a valid pointer to the buffer's CPU-accessible memory.
                let counter_ptr = msg_send(self.counter_buf, "contents") as *mut u32;
                if counter_ptr.is_null() {
                    return None;
                }
                *counter_ptr = 0;

                let cmd_buf = msg_send(self.queue, "commandBuffer");
                if cmd_buf.is_null() {
                    return None;
                }

                let encoder = msg_send(cmd_buf, "computeCommandEncoder");
                if encoder.is_null() {
                    return None;
                }

                // encoder.setComputePipelineState_(pipeline)
                let sel_set_pipeline = sel("setComputePipelineState:");
                msg_send_fn!(unsafe extern "C" fn(Id, Sel, Id))(
                    encoder,
                    sel_set_pipeline,
                    self.pipeline,
                );

                // encoder.setBuffer_offset_atIndex_(counter_buf, 0, 0)
                set_buffer(encoder, self.counter_buf, 0, 0);
                // encoder.setBuffer_offset_atIndex_(output_buf, 0, 1)
                set_buffer(encoder, self.output_buf, 0, 1);

                dispatch_threads_1d(encoder, THREADS, THREADS.min(256));

                // End encoding, commit, wait.
                msg_send_fn!(unsafe extern "C" fn(Id, Sel))(encoder, sel("endEncoding"));
                msg_send_fn!(unsafe extern "C" fn(Id, Sel))(cmd_buf, sel("commit"));
                msg_send_fn!(unsafe extern "C" fn(Id, Sel))(cmd_buf, sel("waitUntilCompleted"));

                // Read output.
                // SAFETY: output_buf was allocated in `new` to hold exactly THREADS
                // u32 values, so copying THREADS elements is in-bounds.
                let output_ptr = msg_send(self.output_buf, "contents") as *const u32;
                if output_ptr.is_null() {
                    return None;
                }
                let mut result = vec![0u32; THREADS as usize];
                std::ptr::copy_nonoverlapping(output_ptr, result.as_mut_ptr(), THREADS as usize);
                Some(result)
            }
        }
    }

    // -----------------------------------------------------------------------
    // Objective-C runtime helpers
    // -----------------------------------------------------------------------

    /// Register (or look up) a selector by name.
    unsafe fn sel(name: &str) -> Sel {
        let c_name = CString::new(name).expect("selector contains null byte");
        unsafe { sel_registerName(c_name.as_ptr()) }
    }

    /// Send a zero-argument message that returns an object (`Id`).
    unsafe fn msg_send(obj: Id, sel_name: &str) -> Id {
        unsafe {
            let s = sel(sel_name);
            msg_send_fn!(unsafe extern "C" fn(Id, Sel) -> Id)(obj, s)
        }
    }

    /// Create an NSString from a Rust &str.
    unsafe fn nsstring(s: &str) -> Id {
        unsafe {
            let cls = objc_getClass(c"NSString".as_ptr());
            let sel_alloc = sel("alloc");
            let sel_init = sel("initWithBytes:length:encoding:");

            let raw = msg_send_fn!(unsafe extern "C" fn(Id, Sel) -> Id)(cls as Id, sel_alloc);
            // NSUTF8StringEncoding = 4
            msg_send_fn!(unsafe extern "C" fn(Id, Sel, *const u8, u64, u64) -> Id)(
                raw,
                sel_init,
                s.as_ptr(),
                s.len() as u64,
                4,
            )
        }
    }

    /// Release an Objective-C object (sends `release`). No-op on null.
    unsafe fn objc_release(obj: Id) {
        if !obj.is_null() {
            unsafe {
                msg_send_fn!(unsafe extern "C" fn(Id, Sel))(obj, sel("release"));
            }
        }
    }

    /// Compile the Metal shader source and return a compute pipeline state.
    /// Releases all intermediate ObjC objects (NSStrings, library, function).
    unsafe fn compile_shader(device: Id) -> Option<Id> {
        unsafe {
            let source = nsstring(SHADER_SOURCE);
            if source.is_null() {
                return None;
            }

            // device.newLibraryWithSource:options:error:
            let sel_lib = sel("newLibraryWithSource:options:error:");
            let mut error: Id = std::ptr::null_mut();
            let library = msg_send_fn!(unsafe extern "C" fn(Id, Sel, Id, Id, *mut Id) -> Id)(
                device,
                sel_lib,
                source,
                std::ptr::null_mut(), // default options
                &mut error,
            );
            if library.is_null() {
                objc_release(error);
                objc_release(source);
                return None;
            }
            // source NSString no longer needed
            objc_release(source);

            // library.newFunctionWithName:("divergence")
            let func_name = nsstring("divergence");
            let sel_func = sel("newFunctionWithName:");
            let function =
                msg_send_fn!(unsafe extern "C" fn(Id, Sel, Id) -> Id)(library, sel_func, func_name);
            objc_release(func_name);
            if function.is_null() {
                objc_release(library);
                return None;
            }

            // device.newComputePipelineStateWithFunction:error:
            let sel_pipe = sel("newComputePipelineStateWithFunction:error:");
            let mut error2: Id = std::ptr::null_mut();
            let pipeline = msg_send_fn!(unsafe extern "C" fn(Id, Sel, Id, *mut Id) -> Id)(
                device,
                sel_pipe,
                function,
                &mut error2,
            );

            // Release intermediates (pipeline retains what it needs)
            objc_release(function);
            objc_release(library);

            if pipeline.is_null() {
                objc_release(error2);
                return None;
            }

            Some(pipeline)
        }
    }

    /// Create a shared MTLBuffer of given size.
    unsafe fn new_buffer(device: Id, size: u64) -> Id {
        unsafe {
            let sel_buf = sel("newBufferWithLength:options:");
            // MTLResourceStorageModeShared = 0
            msg_send_fn!(unsafe extern "C" fn(Id, Sel, u64, u64) -> Id)(device, sel_buf, size, 0)
        }
    }

    /// Set a buffer on a compute command encoder.
    unsafe fn set_buffer(encoder: Id, buffer: Id, offset: u64, index: u64) {
        unsafe {
            let s = sel("setBuffer:offset:atIndex:");
            msg_send_fn!(unsafe extern "C" fn(Id, Sel, Id, u64, u64))(
                encoder, s, buffer, offset, index,
            );
        }
    }

    /// Dispatch 1D threads on a compute command encoder.
    unsafe fn dispatch_threads_1d(encoder: Id, total: u32, per_group: u32) {
        // MTLSize is a struct of 3 x NSUInteger (u64 on 64-bit).
        #[repr(C)]
        struct MTLSize {
            width: u64,
            height: u64,
            depth: u64,
        }

        let grid = MTLSize {
            width: total as u64,
            height: 1,
            depth: 1,
        };
        let group = MTLSize {
            width: per_group as u64,
            height: 1,
            depth: 1,
        };

        unsafe {
            let s = sel("dispatchThreads:threadsPerThreadgroup:");
            msg_send_fn!(unsafe extern "C" fn(Id, Sel, MTLSize, MTLSize))(encoder, s, grid, group);
        }
    }

    impl Drop for MetalState {
        fn drop(&mut self) {
            // Release all retained Objective-C objects to prevent leaks.
            // All fields are non-null (checked in `new`), but objc_release
            // is null-safe regardless.
            unsafe {
                objc_release(self.output_buf);
                objc_release(self.counter_buf);
                objc_release(self.pipeline);
                objc_release(self.queue);
                objc_release(self._device);
            }
        }
    }
}

impl EntropySource for GPUDivergenceSource {
    /// Returns the static metadata record for this source.
    fn info(&self) -> &SourceInfo {
        &GPU_DIVERGENCE_INFO
    }

    /// True only on macOS with a working Metal device.
    ///
    /// The probe (device creation + shader compile via `MetalState::new`)
    /// is expensive, so the result is computed once and cached for the
    /// lifetime of the process.
    fn is_available(&self) -> bool {
        #[cfg(target_os = "macos")]
        {
            static METAL_AVAILABLE: OnceLock<bool> = OnceLock::new();
            *METAL_AVAILABLE.get_or_init(|| metal::MetalState::new().is_some())
        }
        #[cfg(not(target_os = "macos"))]
        {
            false
        }
    }

    /// Collect up to `n_samples` entropy bytes.
    ///
    /// Each output byte XORs two measurements taken per GPU dispatch: a
    /// fold of the shader's thread-execution-order values, and the CPU-side
    /// dispatch latency. Returns an empty Vec on non-macOS platforms or if
    /// Metal initialization fails; may return fewer than `n_samples` bytes
    /// when dispatches fail.
    fn collect(&self, n_samples: usize) -> Vec<u8> {
        // Exactly one of the two cfg blocks below compiles per platform,
        // and that block is then the function's tail expression.
        #[cfg(not(target_os = "macos"))]
        {
            let _ = n_samples;
            Vec::new()
        }

        #[cfg(target_os = "macos")]
        {
            // Fresh Metal pipeline per call; failure yields no data.
            let state = match metal::MetalState::new() {
                Some(s) => s,
                None => return Vec::new(),
            };

            // Oversample: timing extraction consumes more raw deltas than it
            // emits bytes, and failed dispatches are skipped below.
            let raw_count = n_samples * 2 + 64;
            let mut timings: Vec<u64> = Vec::with_capacity(raw_count);
            let mut gpu_entropy: Vec<u8> = Vec::with_capacity(raw_count);

            for _ in 0..raw_count {
                let t0 = mach_time();

                // GPU dispatch crosses CPU→GPU→CPU clock domains.
                let results = match state.dispatch() {
                    Some(r) => r,
                    // Skip failed dispatches; both vectors stay aligned
                    // since neither is pushed for this iteration.
                    None => continue,
                };

                let t1 = mach_time();
                timings.push(t1.wrapping_sub(t0));

                // XOR-fold all thread execution orders into one byte.
                // This captures GPU scheduling nondeterminism directly.
                // Rotation by (i & 63) keeps each thread's contribution at a
                // distinct bit offset before the fold.
                let mut gpu_hash: u64 = 0;
                for (i, &val) in results.iter().enumerate() {
                    gpu_hash ^= (val as u64).rotate_left((i as u32) & 63);
                }
                gpu_entropy.push(xor_fold_u64(gpu_hash));
            }

            // Extract timing entropy from dispatch latencies.
            let timing_bytes = extract_timing_entropy(&timings, n_samples);

            // XOR GPU execution order entropy with dispatch timing entropy.
            // Both are genuine, independent entropy sources. The min() bounds
            // guard against either vector being shorter than n_samples.
            let mut output: Vec<u8> = Vec::with_capacity(n_samples);
            for i in 0..n_samples.min(timing_bytes.len()).min(gpu_entropy.len()) {
                output.push(timing_bytes[i] ^ gpu_entropy[i]);
            }

            output.truncate(n_samples);
            output
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Metadata sanity checks — no GPU required.
    #[test]
    fn info() {
        let source = GPUDivergenceSource;
        let info = source.info();
        assert_eq!(source.name(), "gpu_divergence");
        assert_eq!(info.category, SourceCategory::GPU);
        assert!(!info.composite);
    }

    /// End-to-end collection smoke test on real hardware.
    #[test]
    #[cfg(target_os = "macos")]
    #[ignore] // Requires GPU
    fn collects_bytes() {
        let source = GPUDivergenceSource;
        if !source.is_available() {
            return;
        }
        let bytes = source.collect(64);
        assert!(!bytes.is_empty());
        assert!(bytes.len() <= 64);
        let distinct: std::collections::HashSet<u8> = bytes.iter().copied().collect();
        assert!(distinct.len() > 1, "Expected variation in collected bytes");
    }
}