vyre 0.4.1

GPU compute intermediate representation with a standard operation library
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
#![forbid(unsafe_code)]
#![warn(missing_docs)]
// Every lint below is allowed for a documented reason. New lints from
// nursery/pedantic/restriction are NOT auto-allowed — broad blanket allows
// were removed deliberately so that future clippy findings surface as CI
// warnings instead of being silently swallowed.
#![allow(
    // Auto-generated op wrappers replay derive attributes by design.
    clippy::duplicated_attributes,
    // GPU buffer layout types (bind-group slot tuples) are inherently complex.
    clippy::type_complexity,
    // Shader-side math and wire-format POD structs do intentional integer
    // casts; the conform gate verifies byte-identity with the CPU reference.
    clippy::cast_lossless,
    clippy::cast_possible_truncation,
    clippy::cast_possible_wrap,
    clippy::cast_precision_loss,
    clippy::cast_sign_loss,
    // Explicit clones on Copy improve readability in serial layers where
    // semantic ownership matters more than cycle count.
    clippy::clone_on_copy,
    // Three-branch comparisons are natural in range-check oracles.
    clippy::comparison_chain,
    // Vyre uses explicit invariant violations (expect/unwrap) with `Fix:`
    // prosenot graceful degradationper the engineering standard.
    clippy::expect_used,
    // Generic collections take external hashers by design.
    clippy::implicit_hasher,
    // SHA/hash compressors use the canonical single-letter state vars
    // (a,b,c,d,e,f,g,h per FIPS 180-4).
    clippy::many_single_char_names,
    // Error prose is centralized in the `Error` enum; per-fn `# Errors`
    // sections duplicate that contract.
    clippy::missing_errors_doc,
    // Panics document invariant violations with `Fix:` prose inline.
    clippy::missing_panics_doc,
    // Template-generated ops don't always merit `#[must_use]`.
    clippy::must_use_candidate,
    // Builder APIs take owned values by design.
    clippy::needless_pass_by_value,
    // Indexed arithmetic is clearer than iterator chains for GPU-shape loops.
    clippy::needless_range_loop,
    // Generated target-text strings use `r##` for quote safety.
    clippy::needless_raw_string_hashes,
    // Type names repeat module names for cross-crate discoverability.
    clippy::module_name_repetitions,
    // `mod X` in `X.rs` is the canonical vyre module layout.
    clippy::module_inception,
    // Math code uses short similar names (a/A, x/X) by convention.
    clippy::similar_names,
    // Internal helpers with stdlib-adjacent names are intentional for clarity.
    clippy::should_implement_trait,
    // Enforcer dispatch arms can share a body but represent distinct cases.
    clippy::match_same_arms,
    // Hot paths in the pipeline assemble strings incrementally.
    clippy::format_push_string,
    // GPU kernel dispatchers take many parameters by design (buffer slots).
    clippy::too_many_arguments,
    // Hash compressors and regex compilers have long inlined bodies.
    clippy::too_many_lines,
    // Trait signatures force `&T` for small Copy types.
    clippy::trivially_copy_pass_by_ref,
    // `Result<T, E>` with a single error variant keeps the API
    // forward-compatible as new error variants land.
    clippy::unnecessary_wraps,
    // Or-patterns are expanded for readability in large match tables.
    clippy::unnested_or_patterns,
    // GPU buffer sizes like `0x12345678` are more readable without `_`
    // separators in shader contexts.
    clippy::unreadable_literal,
    // Prose doc comments use type names that clippy wants backticked; our
    // doc style sentences already read naturally.
    clippy::doc_markdown
)]
#![cfg_attr(not(test), deny(clippy::todo, clippy::unimplemented))]
//! # vyre — LLVM-for-GPU
//!
//! Vyre is a GPU compute substrate centered on the `Program` type. Just as
//! LLVM lets frontends emit a single IR that lowers to many CPU backends,
//! vyre lets frontends emit a single `Program` that lowers through any
//! registered backend or the pure-Rust reference interpreter. The crate root
//! re-exports the frozen public API: the `Program` type, the `VyreBackend`
//! trait, and the standard operation library.
//!
//! Frontends, backends, and conformance tools depend only on the stable
//! types exported here. Changing the target-text lowering path never breaks a
//! frontend; changing a frontend AST never affects backend dispatch logic.
//! This module is the single source of truth for the vyre public API.

/// The vyre Program model.
///
/// This module defines `Program`, the frozen, serializable model that every
/// frontend emits and every backend consumes. It has zero external
/// dependencies so that spec tools can parse it without pulling in GPU
/// libraries.
/// Public API re-export.
pub use vyre_foundation::ir;

// Layer 1 and Layer 2 operation specifications live in vyre-libs.
// The crate root remains the single stable import surface for consumers.

/// Program lowering to the substrate-neutral kernel descriptor.
///
/// Lowering transforms a validated `Program` into
/// [`lower::KernelDescriptor`]. Emit crates then turn that descriptor into
/// target artifacts. Frontends do not depend on this module; it is consumed
/// by backend and emitter implementations.
/// Public API re-export.
pub mod lower {
    /// Canonical Program -> KernelDescriptor lowering entry point.
    pub use vyre_lower::lower::lower;
    pub use vyre_lower::*;
}

/// IR-to-IR optimizer pass framework.
///
/// `optimizer` provides the registered pass scheduler and reference
/// optimization passes used by frontends that want fixpoint IR cleanup before
/// lowering.
/// Public API re-export.
pub use vyre_foundation::optimizer;

/// Wire-format CPU-reference byte ABI contract.
/// Public API re-export.
pub use vyre_foundation::cpu_op;
/// CPU reference implementations shared across backends.
/// Public API re-export.
pub use vyre_foundation::cpu_references;
/// Substrate-neutral memory ordering model.
/// Public API re-export.
pub use vyre_foundation::memory_model;
/// Substrate-neutral memory ordering type.
/// Public API re-export.
pub use vyre_foundation::MemoryOrdering;

/// Distribution-aware runtime algorithm selection.
/// Public API re-export.
pub use vyre_driver::routing;

/// Substrate-neutral execution planning for performance and accuracy tracks.
/// Public API re-export.
pub use vyre_foundation::execution_plan;

/// Unified error types for the entire crate.
/// Public API re-export.
pub use vyre_driver::error;

/// Structured, machine-readable diagnostics.
/// Public API re-export.
pub use vyre_driver::diagnostics;

/// Backend trait surface — `VyreBackend`, `Executable`,
/// `Streamable`, `DispatchConfig`, `BackendError`,
/// `ErrorCode`. The whole backend contract every driver crate
/// implements against.
/// Public API re-export.
/// Public API re-export.
pub use vyre_driver::backend;
/// Re-export of the native scan match result type from the foundation crate.
/// Public API re-export.
/// Public API re-export.
pub use vyre_foundation::match_result;

/// Pipeline-mode dispatch: compile a Program once, dispatch repeatedly.
/// Public API re-export.
/// Public API re-export.
pub use vyre_driver::pipeline;

// Previously: pub mod bytecode — a 637-LOC stack-machine VM publicly
// re-exported from core. Deleted 2026-04-17. The NFA scan micro-interpreter
// that carried the remaining bytecode was deleted 2026-04-19. Rule evaluators
// compose ops in vyre IR directly. No interpreter surface remains in core.

pub use vyre_driver::{
    BackendError, BackendRegistration, CompiledPipeline, DispatchConfig, Error, Executable, Memory,
    MemoryRef, OutputBuffers, TypedDispatchExt, VyreBackend,
};

/// Persistent-thread dispatch policy for dispatch paths.
pub use vyre_driver::persistent::PersistentThreadMode;
/// Speculation policy for dispatch paths.
pub use vyre_driver::speculate::SpeculationMode;

/// Re-export of the core IR program type and validation entry point.
///
/// `Program` is the frozen IR container. `validate` is the function that
/// checks a program for structural and semantic correctness before it is
/// handed to a backend.
pub use ir::{validate, InterpCtx, NodeId, NodeStorage, OpId, Program, Value};

/// Re-export of the native scan match result type.
///
/// `Match` represents a byte-range hit produced by pattern-scanning engines.
pub use vyre_foundation::match_result::Match;

/// Domain-neutral byte-range type.
pub use vyre_foundation::ByteRange;

/// R2: single canonical pre-lowering optimize entry point.
///
/// Bundles the canonical pre-lowering pipeline so every consumer
/// (surgec, pyrograph, warpscan, in-tree benches) wires one function
/// instead of three. Today every consumer separately calls
/// `pre_lowering::optimize`, then `vyre_lower::lower`, then a
/// backend-specific emit. This wrapper keeps the optimization stage —
/// the part that's stable across backends — behind one symbol so
/// adding a new substrate row does not require N consumer changes.
///
/// The lowering and emit stages remain backend-specific and are
/// invoked separately by the chosen `VyreBackend`. This function
/// returns the optimized `Program` ready to hand to any backend's
/// `dispatch` / `compile` path.
///
/// **N9 substrate composition fingerprint cache.** Repeated identical
/// inputs (same `program.fingerprint()`) skip the substrate stack
/// entirely. The cache is process-local, bounded to
/// [`OPTIMIZE_CACHE_CAPACITY`] entries, and uses O(1) fingerprint lookup
/// with FIFO eviction — long-running daemons get the cache without
/// unbounded memory.
/// On a cache hit, `optimize` clones the cached `Program` instead of
/// re-running the (canonicalize + region_inline + scheduler fixpoint
/// + CSE + DCE + phase-4) pipeline. The substrate stack is purely
/// functional in `Program`, so caching by structural fingerprint is
/// safe — same input bytes, same output bytes.
///
/// # Example
///
/// ```no_run
/// use vyre::{optimize, Program};
/// fn run(program: Program) -> Program {
///     optimize(program)
/// }
/// ```
#[must_use]
pub fn optimize(program: Program) -> Program {
    let key = program.fingerprint();
    if let Some(cached) = optimize_cache::get(&key) {
        return cached;
    }
    let optimized = vyre_foundation::optimizer::pre_lowering::optimize(program);
    optimize_cache::put(key, &optimized);
    optimized
}

/// Device-aware public optimizer entry point.
///
/// Runs adapter-shaped workgroup autotuning from a neutral
/// [`DeviceProfile`] before the canonical pre-lowering optimization
/// pipeline. Consumers with a live backend should prefer
/// [`optimize_for_backend`]; consumers with a saved device signature
/// can call this directly.
#[must_use]
pub fn optimize_for_device(program: Program, profile: &vyre_driver::DeviceProfile) -> Program {
    let key = device_optimize_key(&program, profile);
    if let Some(cached) = optimize_cache::get_device(&key) {
        return cached;
    }
    let tuned =
        vyre_foundation::optimizer::passes::autotune::Autotune::transform_for_adapter(
            program,
            &profile.adapter_caps(),
        )
        .program;
    let optimized = optimize(tuned);
    optimize_cache::put_device(key, &optimized);
    optimized
}

/// Device-aware public optimizer entry point for a live backend.
#[must_use]
pub fn optimize_for_backend(program: Program, backend: &dyn vyre_driver::VyreBackend) -> Program {
    let profile = backend.device_profile();
    optimize_for_device(program, &profile)
}

fn device_optimize_key(program: &Program, profile: &vyre_driver::DeviceProfile) -> [u8; 32] {
    let mut hasher = blake3::Hasher::new();
    hasher.update(b"vyre-core-optimize-device-v1\0");
    hasher.update(&program.fingerprint());
    hasher.update(profile.backend.as_bytes());
    hasher.update(&[u8::from(profile.supports_subgroup_ops)]);
    hasher.update(&[u8::from(profile.supports_indirect_dispatch)]);
    hasher.update(&[u8::from(profile.supports_f16)]);
    hasher.update(&[u8::from(profile.supports_bf16)]);
    hasher.update(&[u8::from(profile.supports_tensor_cores)]);
    hasher.update(&profile.max_workgroup_size[0].to_le_bytes());
    hasher.update(&profile.max_workgroup_size[1].to_le_bytes());
    hasher.update(&profile.max_workgroup_size[2].to_le_bytes());
    hasher.update(&profile.max_invocations_per_workgroup.to_le_bytes());
    hasher.update(&profile.max_shared_memory_bytes.to_le_bytes());
    hasher.update(&profile.subgroup_size.to_le_bytes());
    hasher.update(&profile.compute_units.to_le_bytes());
    hasher.update(&profile.ideal_unroll_depth.to_le_bytes());
    hasher.update(&profile.ideal_vector_pack_bits.to_le_bytes());
    *hasher.finalize().as_bytes()
}

/// N9 cache capacity (entries). Sized to hold the working set of a
/// long-running scanner without unbounded growth — each entry is
/// roughly the size of one optimized `Program`. 256 entries is
/// `~10MB` worst-case for typical surgec-shaped Programs.
pub const OPTIMIZE_CACHE_CAPACITY: usize = 256;

/// Process-local fingerprint -> Program cache for [`optimize`].
mod optimize_cache {
    use super::Program;
    use super::OPTIMIZE_CACHE_CAPACITY;
    use std::collections::{HashMap, VecDeque};
    use std::sync::Mutex;

    struct Cache {
        entries: HashMap<[u8; 32], Program>,
        fifo: VecDeque<[u8; 32]>,
        device_entries: HashMap<[u8; 32], Program>,
        device_fifo: VecDeque<[u8; 32]>,
    }

    impl Cache {
        fn new() -> Self {
            Self {
                entries: HashMap::with_capacity(OPTIMIZE_CACHE_CAPACITY),
                fifo: VecDeque::with_capacity(OPTIMIZE_CACHE_CAPACITY),
                device_entries: HashMap::with_capacity(OPTIMIZE_CACHE_CAPACITY),
                device_fifo: VecDeque::with_capacity(OPTIMIZE_CACHE_CAPACITY),
            }
        }
    }

    fn cache() -> &'static Mutex<Cache> {
        use std::sync::OnceLock;
        static CACHE: OnceLock<Mutex<Cache>> = OnceLock::new();
        CACHE.get_or_init(|| Mutex::new(Cache::new()))
    }

    pub(super) fn get(key: &[u8; 32]) -> Option<Program> {
        let cache = cache().lock().ok()?;
        cache.entries.get(key).cloned()
    }

    pub(super) fn put(key: [u8; 32], program: &Program) {
        let Ok(mut cache) = cache().lock() else {
            return;
        };
        if cache.entries.contains_key(&key) {
            return;
        }
        if cache.entries.len() >= OPTIMIZE_CACHE_CAPACITY {
            if let Some(evicted) = cache.fifo.pop_front() {
                cache.entries.remove(&evicted);
            }
        }
        cache.fifo.push_back(key);
        cache.entries.insert(key, program.clone());
    }

    pub(super) fn get_device(key: &[u8; 32]) -> Option<Program> {
        let cache = cache().lock().ok()?;
        cache.device_entries.get(key).cloned()
    }

    pub(super) fn put_device(key: [u8; 32], program: &Program) {
        let Ok(mut cache) = cache().lock() else {
            return;
        };
        if cache.device_entries.contains_key(&key) {
            return;
        }
        if cache.device_entries.len() >= OPTIMIZE_CACHE_CAPACITY {
            if let Some(evicted) = cache.device_fifo.pop_front() {
                cache.device_entries.remove(&evicted);
            }
        }
        cache.device_fifo.push_back(key);
        cache.device_entries.insert(key, program.clone());
    }

    #[cfg(test)]
    pub(super) fn clear() {
        if let Ok(mut cache) = cache().lock() {
            cache.entries.clear();
            cache.fifo.clear();
            cache.device_entries.clear();
            cache.device_fifo.clear();
        }
    }

    #[cfg(test)]
    pub(super) fn len() -> usize {
        cache().lock().map(|c| c.entries.len()).unwrap_or(0)
    }

    #[cfg(test)]
    pub(super) fn len_device() -> usize {
        cache()
            .lock()
            .map(|c| c.device_entries.len())
            .unwrap_or(0)
    }
}

#[cfg(test)]
mod optimize_tests {
    use super::*;
    use std::sync::{Mutex, MutexGuard, OnceLock};
    use vyre_foundation::ir::{BufferDecl, DataType, Expr, Node};

    /// Serialise tests in this module so they don't race on the
    /// process-global cache. Each test takes the guard at entry and
    /// drops it at exit; the eviction test takes ~256 inserts which
    /// otherwise pollutes the count the other tests assert on.
    fn serial() -> MutexGuard<'static, ()> {
        static M: OnceLock<Mutex<()>> = OnceLock::new();
        M.get_or_init(|| Mutex::new(()))
            .lock()
            .unwrap_or_else(|e| e.into_inner())
    }

    fn sample_program() -> Program {
        Program::wrapped(
            vec![BufferDecl::output("out", 0, DataType::U32).with_count(1)],
            [1, 1, 1],
            vec![Node::store("out", Expr::u32(0), Expr::u32(42))],
        )
    }

    #[test]
    fn optimize_is_cached_by_fingerprint() {
        let _g = serial();
        optimize_cache::clear();
        let p1 = sample_program();
        let p2 = sample_program();
        let _ = optimize(p1);
        let before = optimize_cache::len();
        let _ = optimize(p2);
        let after = optimize_cache::len();
        assert_eq!(
            before, after,
            "second optimize on identical fingerprint must hit the cache"
        );
        assert_eq!(before, 1, "cache must contain exactly one entry");
    }

    #[test]
    fn optimize_returns_equivalent_program_on_cache_hit() {
        let _g = serial();
        optimize_cache::clear();
        let p = sample_program();
        let first = optimize(p.clone());
        let second = optimize(p);
        assert_eq!(
            first.fingerprint(),
            second.fingerprint(),
            "cache hit must return a Program with identical fingerprint"
        );
    }

    #[test]
    fn optimize_cache_evicts_at_capacity() {
        let _g = serial();
        optimize_cache::clear();
        // Build OPTIMIZE_CACHE_CAPACITY + 1 distinct programs by
        // varying the stored literal — each gets a unique fingerprint.
        for i in 0..(OPTIMIZE_CACHE_CAPACITY + 1) {
            let prog = Program::wrapped(
                vec![BufferDecl::output("out", 0, DataType::U32).with_count(1)],
                [1, 1, 1],
                vec![Node::store("out", Expr::u32(0), Expr::u32(i as u32))],
            );
            let _ = optimize(prog);
        }
        assert_eq!(
            optimize_cache::len(),
            OPTIMIZE_CACHE_CAPACITY,
            "cache must cap at OPTIMIZE_CACHE_CAPACITY entries"
        );
    }

    #[test]
    fn optimize_cache_deduplicates_entries_by_fingerprint() {
        let _g = serial();
        optimize_cache::clear();
        let p1 = sample_program();
        let p2 = sample_program();
        let _ = optimize(p1);
        let before = optimize_cache::len();
        let _ = optimize(p2);
        let after = optimize_cache::len();
        assert_eq!(
            before,
            after,
            "optimize must reuse cached result on identical fingerprints"
        );
    }

    #[test]
    fn optimize_for_device_uses_device_specific_cache() {
        let _g = serial();
        optimize_cache::clear();
        let mut profile = vyre_driver::DeviceProfile::conservative("test");
        profile.max_workgroup_size = [256, 1, 1];
        profile.max_invocations_per_workgroup = 256;
        let p1 = sample_program();
        let p2 = sample_program();
        let first = optimize_for_device(p1, &profile);
        let second = optimize_for_device(p2, &profile);
        assert_eq!(first.fingerprint(), second.fingerprint());
        assert_eq!(
            optimize_cache::len_device(),
            1,
            "same program+device profile must hit the device optimize cache"
        );
        assert_eq!(
            optimize_cache::len(),
            1,
            "device optimization should still reuse the canonical optimize cache after tuning"
        );
    }
}