// ruvector_mincut_gated_transformer/lib.rs

//! # Mincut Gated Transformer
//!
//! Ultra low latency transformer inference designed for continuous systems.
//! Governed by a coherence controller driven by dynamic minimum cut signals
//! and optionally a spiking scheduler that skips work when nothing meaningful
//! is happening.
//!
//! ## Academic Foundations
//!
//! This crate integrates multiple state-of-the-art optimization techniques:
//!
//! 1. **Mixture-of-Depths** (Raposo et al., 2024) - Dynamic compute allocation with 50% FLOPs reduction
//! 2. **Early Exit** (Elhoushi et al., 2024) - Layer-skipping with 30-50% latency reduction
//! 3. **Sparse Attention** (Jiang et al., 2024) - 90% attention FLOPs reduction for long contexts
//! 4. **Energy-Based Transformers** (Gladstone et al., 2025) - Principled compute-quality tradeoffs
//! 5. **Spike-Driven Inference** (Yao et al., 2023, 2024) - 87× energy reduction via event-driven compute
//! 6. **Spectral Methods** (Kreuzer et al., 2021) - Graph-based coherence via spectral partitioning
//!
//! See `docs/THEORY.md` for detailed academic references and theoretical analysis.
//!
//! ## Primary Outcomes
//!
//! 1. **Deterministic, bounded inference** - Same inputs yield same outputs
//! 2. **Allocation-free hot path** - Zero heap allocations after initialization
//! 3. **Predictable tail latency** - Bounded p99 latency guarantees
//! 4. **Explainable interventions** - Every gate decision produces a witness
//! 5. **Easy integration** - Works with RuVector, ruvector-mincut, and agent orchestration
//!
//! ## Core Concepts
//!
//! The system has three roles:
//!
//! 1. **Transformer Kernel** - Produces logits or scores under fixed compute budgets
//! 2. **Spike Scheduler** (optional) - Decides whether to run and selects compute tier
//! 3. **Mincut Gate** (authoritative) - Decides what state changes are allowed
//!
//! ## Example
//!
//! ```rust,no_run
//! use ruvector_mincut_gated_transformer::{
//!     MincutGatedTransformer, TransformerConfig, GatePolicy,
//!     GatePacket, InferInput, InferOutput,
//! };
//!
//! // Create configuration
//! let config = TransformerConfig::micro();
//! let policy = GatePolicy::default();
//!
//! // Load weights (pseudo-code)
//! # let weights = ruvector_mincut_gated_transformer::QuantizedWeights::empty(&config);
//!
//! // Create transformer
//! let mut transformer = MincutGatedTransformer::new(config, policy, weights).unwrap();
//!
//! // Create gate packet from mincut signals
//! let gate = GatePacket {
//!     lambda: 100,
//!     lambda_prev: 95,
//!     boundary_edges: 5,
//!     boundary_concentration_q15: 8192,
//!     partition_count: 3,
//!     flags: 0,
//! };
//!
//! // Prepare input
//! let input = InferInput {
//!     tokens: Some(&[1, 2, 3, 4]),
//!     embedding_q: None,
//!     embedding_scale: 1.0,
//!     input_signature: None,
//!     gate,
//!     spikes: None,
//! };
//!
//! // Allocate output buffer
//! let mut logits = vec![0i32; 1024];
//! let mut output = InferOutput::new(&mut logits);
//!
//! // Run inference
//! transformer.infer(&input, &mut output).unwrap();
//!
//! // Check witness for allowed actions
//! if output.witness.external_writes_enabled == 1 {
//!     // Safe to persist memory
//! }
//! ```

88#![cfg_attr(feature = "no_std_gateway", no_std)]
89
90#[cfg(feature = "no_std_gateway")]
91extern crate alloc;
92
93pub mod arena;
94pub mod attention;
95pub mod config;
96pub mod early_exit;
97pub mod error;
98pub mod ffn;
99pub mod flash_attention;
100pub mod gate;
101pub mod kernel;
102pub mod kv_cache;
103pub mod mamba;
104pub mod mod_routing;
105pub mod model;
106pub mod packets;
107pub mod q15;
108pub mod rope;
109pub mod speculative;
110pub mod spike;
111pub mod state;
112
113#[cfg(feature = "trace")]
114pub mod trace;
115
116#[cfg(feature = "spectral_pe")]
117pub mod spectral;
118
119#[cfg(feature = "sparse_attention")]
120pub mod sparse_attention;
121
122#[cfg(feature = "energy_gate")]
123pub mod energy_gate;
124
125// Re-exports for convenient access
126pub use arena::{calculate_arena_size, LayerWeights, WeightArena, WeightRef};
127pub use config::{GatePolicy, TransformerConfig};
128pub use early_exit::{CoherenceEarlyExit, EarlyExitConfig, EarlyExitDecision, ExitReason};
129pub use error::{Error, Result};
130pub use flash_attention::{
131    flash_attention_forward, flash_attention_forward_i8, flash_mha, FlashAttentionConfig,
132};
133pub use gate::{GateController, TierDecision};
134pub use kv_cache::{HadamardTransform, QuantBits, QuantizedKVCache};
135pub use mamba::{MambaConfig, MambaLayer, MambaState, MambaWeights};
136pub use mod_routing::{MincutDepthRouter, ModRoutingConfig, RoutingStats, TokenRoute};
137pub use model::{MincutGatedTransformer, QuantizedWeights, WeightsLoader};
138pub use packets::{
139    GateDecision, GatePacket, GateReason, InferInput, InferOutput, InferStats, SpikePacket, Witness,
140};
141pub use q15::{
142    f32_to_q15_batch, q15_batch_add, q15_batch_lerp, q15_batch_mul, q15_dot, q15_to_f32_batch, Q15,
143};
144pub use rope::{RopeConfig, RopeEmbedding, RopeScaling};
145pub use speculative::{
146    generate_tree_attention_mask, DraftToken, DraftTree, SpeculativeConfig, SpeculativeDecoder,
147    VerificationResult,
148};
149pub use spike::SpikeScheduler;
150pub use state::RuntimeState;
151
152#[cfg(feature = "trace")]
153pub use trace::{TraceCounters, TraceSnapshot, TraceState};
154
155#[cfg(feature = "spike_attention")]
156pub use attention::spike_driven::{SpikeDrivenAttention, SpikeDrivenConfig, SpikeTrain};
157
158#[cfg(feature = "spectral_pe")]
159pub use spectral::{
160    lanczos_sparse, power_iteration_sparse, SparseCSR, SpectralPEConfig, SpectralPositionEncoder,
161};
162
163#[cfg(feature = "sparse_attention")]
164pub use sparse_attention::{
165    LambdaDensitySchedule, MincutSparseAttention, SparseMask, SparsityConfig,
166};
167
168#[cfg(feature = "energy_gate")]
169pub use energy_gate::{EnergyGate, EnergyGateConfig, EnergyGradient};
170
171/// Crate version
172pub const VERSION: &str = env!("CARGO_PKG_VERSION");
173
174/// Prelude module for convenient imports
175pub mod prelude {
176    pub use crate::{
177        generate_tree_attention_mask, CoherenceEarlyExit, DraftToken, DraftTree, EarlyExitConfig,
178        EarlyExitDecision, Error, ExitReason, GateDecision, GatePacket, GatePolicy, GateReason,
179        HadamardTransform, InferInput, InferOutput, InferStats, MambaConfig, MambaLayer,
180        MambaState, MambaWeights, MincutDepthRouter, MincutGatedTransformer, ModRoutingConfig,
181        QuantBits, QuantizedKVCache, QuantizedWeights, Result, RopeConfig, RopeEmbedding,
182        RopeScaling, RoutingStats, SpeculativeConfig, SpeculativeDecoder, SpikePacket, TokenRoute,
183        TransformerConfig, VerificationResult, WeightsLoader, Witness,
184    };
185
186    #[cfg(feature = "trace")]
187    pub use crate::{TraceCounters, TraceSnapshot};
188}
189
190/// Supported model configurations
191pub mod configs {
192    use super::TransformerConfig;
193
194    /// Baseline CPU configuration
195    /// - Sequence length: 64
196    /// - Hidden size: 256
197    /// - Heads: 4
198    /// - Layers: 4
199    pub fn baseline() -> TransformerConfig {
200        TransformerConfig::baseline()
201    }
202
203    /// Micro configuration for WASM and edge gateways
204    /// - Sequence length: 32
205    /// - Hidden size: 128
206    /// - Heads: 4
207    /// - Layers: 2
208    pub fn micro() -> TransformerConfig {
209        TransformerConfig::micro()
210    }
211}