rustkernel_core/
traits.rs

1//! Core kernel traits.
2//!
3//! This module defines the fundamental traits that all kernels implement:
4//! - `GpuKernel`: Base trait for all GPU kernels
5//! - `BatchKernel`: Trait for batch (CPU-orchestrated) kernels
6//! - `RingKernelHandler`: Trait for ring (persistent actor) kernels
7//! - `CheckpointableKernel`: Trait for kernels that support checkpoint/restore (0.3.1)
8//!
9//! ## Enterprise Features (0.3.1)
10//!
11//! - Health checking for liveness/readiness probes
12//! - Execution context with auth, tenant, and tracing
13//! - Secure message handling with authentication
14//! - Checkpoint/restore for recovery
15
16use crate::error::Result;
17use crate::kernel::KernelMetadata;
18use async_trait::async_trait;
19use ringkernel_core::{RingContext, RingMessage};
20use serde::{Deserialize, Serialize};
21use std::fmt::Debug;
22use std::time::Duration;
23use uuid::Uuid;
24
25// ============================================================================
26// Health & Status Types
27// ============================================================================
28
29/// Health status for kernel health checks
30#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
31pub enum HealthStatus {
32    /// Kernel is healthy and operational
33    #[default]
34    Healthy,
35    /// Kernel is degraded but still operational
36    Degraded,
37    /// Kernel is unhealthy and should not receive traffic
38    Unhealthy,
39    /// Health status is unknown (check failed)
40    Unknown,
41}
42
43impl std::fmt::Display for HealthStatus {
44    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45        match self {
46            Self::Healthy => write!(f, "healthy"),
47            Self::Degraded => write!(f, "degraded"),
48            Self::Unhealthy => write!(f, "unhealthy"),
49            Self::Unknown => write!(f, "unknown"),
50        }
51    }
52}
53
54// ============================================================================
55// Execution Context Types
56// ============================================================================
57
58/// Execution context for kernel invocations.
59///
60/// Provides authentication, tenant isolation, and distributed tracing context
61/// for kernel execution.
62#[derive(Debug, Clone, Default)]
63pub struct ExecutionContext {
64    /// Request ID for tracing
65    pub request_id: Option<Uuid>,
66    /// Trace ID for distributed tracing
67    pub trace_id: Option<String>,
68    /// Span ID for distributed tracing
69    pub span_id: Option<String>,
70    /// Authenticated user ID (if any)
71    pub user_id: Option<String>,
72    /// Tenant ID for multi-tenancy
73    pub tenant_id: Option<String>,
74    /// Request timeout (if specified)
75    pub timeout: Option<Duration>,
76    /// Additional metadata
77    pub metadata: std::collections::HashMap<String, String>,
78}
79
80impl ExecutionContext {
81    /// Create a new execution context
82    pub fn new() -> Self {
83        Self {
84            request_id: Some(Uuid::new_v4()),
85            ..Default::default()
86        }
87    }
88
89    /// Create context with request ID
90    pub fn with_request_id(mut self, id: Uuid) -> Self {
91        self.request_id = Some(id);
92        self
93    }
94
95    /// Set trace context
96    pub fn with_trace(mut self, trace_id: impl Into<String>, span_id: impl Into<String>) -> Self {
97        self.trace_id = Some(trace_id.into());
98        self.span_id = Some(span_id.into());
99        self
100    }
101
102    /// Set authenticated user
103    pub fn with_user(mut self, user_id: impl Into<String>) -> Self {
104        self.user_id = Some(user_id.into());
105        self
106    }
107
108    /// Set tenant
109    pub fn with_tenant(mut self, tenant_id: impl Into<String>) -> Self {
110        self.tenant_id = Some(tenant_id.into());
111        self
112    }
113
114    /// Set timeout
115    pub fn with_timeout(mut self, timeout: Duration) -> Self {
116        self.timeout = Some(timeout);
117        self
118    }
119
120    /// Add metadata
121    pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
122        self.metadata.insert(key.into(), value.into());
123        self
124    }
125}
126
127/// Secure ring context with authentication.
128///
129/// Wraps `RingContext` with security context for authenticated message handling.
130pub struct SecureRingContext<'ctx, 'ring> {
131    /// The underlying ring context
132    pub ring_ctx: &'ctx mut RingContext<'ring>,
133    /// Execution context with auth info
134    pub exec_ctx: &'ctx ExecutionContext,
135}
136
137impl<'ctx, 'ring> SecureRingContext<'ctx, 'ring> {
138    /// Create a new secure context
139    pub fn new(ring_ctx: &'ctx mut RingContext<'ring>, exec_ctx: &'ctx ExecutionContext) -> Self {
140        Self { ring_ctx, exec_ctx }
141    }
142
143    /// Get the authenticated user ID
144    pub fn user_id(&self) -> Option<&str> {
145        self.exec_ctx.user_id.as_deref()
146    }
147
148    /// Get the tenant ID
149    pub fn tenant_id(&self) -> Option<&str> {
150        self.exec_ctx.tenant_id.as_deref()
151    }
152
153    /// Check if request is authenticated
154    pub fn is_authenticated(&self) -> bool {
155        self.exec_ctx.user_id.is_some()
156    }
157}
158
159// ============================================================================
160// Kernel Configuration
161// ============================================================================
162
163/// Runtime configuration for a kernel instance.
164#[derive(Debug, Clone, Default, Serialize, Deserialize)]
165pub struct KernelConfig {
166    /// Maximum queue depth
167    pub max_queue_depth: Option<usize>,
168    /// Execution timeout
169    pub timeout: Option<Duration>,
170    /// Enable tracing
171    pub tracing_enabled: bool,
172    /// Enable metrics collection
173    pub metrics_enabled: bool,
174    /// Custom configuration values
175    pub custom: std::collections::HashMap<String, serde_json::Value>,
176}
177
178impl KernelConfig {
179    /// Create a new kernel config
180    pub fn new() -> Self {
181        Self::default()
182    }
183
184    /// Set queue depth
185    pub fn with_queue_depth(mut self, depth: usize) -> Self {
186        self.max_queue_depth = Some(depth);
187        self
188    }
189
190    /// Set timeout
191    pub fn with_timeout(mut self, timeout: Duration) -> Self {
192        self.timeout = Some(timeout);
193        self
194    }
195
196    /// Enable tracing
197    pub fn with_tracing(mut self, enabled: bool) -> Self {
198        self.tracing_enabled = enabled;
199        self
200    }
201
202    /// Enable metrics
203    pub fn with_metrics(mut self, enabled: bool) -> Self {
204        self.metrics_enabled = enabled;
205        self
206    }
207
208    /// Set custom value
209    pub fn with_custom(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
210        self.custom.insert(key.into(), value);
211        self
212    }
213}
214
215// ============================================================================
216// Core Kernel Traits
217// ============================================================================
218
219/// Base trait for all GPU kernels.
220///
221/// Provides access to kernel metadata, health checking, and lifecycle management.
222///
223/// ## Enterprise Features (0.3.1)
224///
225/// - `health_check()` - Report kernel health for liveness/readiness probes
226/// - `shutdown()` - Graceful shutdown with resource cleanup
227/// - `refresh_config()` - Hot configuration reload
228pub trait GpuKernel: Send + Sync + Debug {
229    /// Returns the kernel metadata.
230    fn metadata(&self) -> &KernelMetadata;
231
232    /// Validate kernel configuration.
233    ///
234    /// Called before kernel launch to ensure configuration is valid.
235    fn validate(&self) -> Result<()> {
236        Ok(())
237    }
238
239    /// Returns the kernel ID.
240    fn id(&self) -> &str {
241        &self.metadata().id
242    }
243
244    /// Returns true if this kernel requires GPU-native execution.
245    fn requires_gpu_native(&self) -> bool {
246        self.metadata().requires_gpu_native
247    }
248
249    // ========================================================================
250    // Enterprise Features (0.3.1)
251    // ========================================================================
252
253    /// Perform a health check on this kernel.
254    ///
255    /// Used by liveness and readiness probes. Override to implement
256    /// custom health checking logic (e.g., checking GPU memory, connections).
257    ///
258    /// # Returns
259    ///
260    /// The current health status of the kernel.
261    fn health_check(&self) -> HealthStatus {
262        HealthStatus::Healthy
263    }
264
265    /// Graceful shutdown of the kernel.
266    ///
267    /// Called during runtime shutdown to release resources. Override to
268    /// implement custom cleanup (e.g., flushing buffers, closing connections).
269    ///
270    /// Default implementation does nothing.
271    fn shutdown(&self) -> Result<()> {
272        Ok(())
273    }
274
275    /// Refresh kernel configuration at runtime.
276    ///
277    /// Called when configuration is hot-reloaded. Only safe-to-reload
278    /// configuration values should be applied.
279    ///
280    /// # Arguments
281    ///
282    /// * `config` - The new configuration to apply
283    ///
284    /// # Returns
285    ///
286    /// Ok if configuration was applied, Err if configuration is invalid.
287    fn refresh_config(&mut self, _config: &KernelConfig) -> Result<()> {
288        Ok(())
289    }
290}
291
292/// Trait for batch (CPU-orchestrated) kernels.
293///
294/// Batch kernels are launched on-demand with CPU orchestration.
295/// They have 10-50μs launch overhead and state resides in CPU memory.
296///
297/// ## Enterprise Features (0.3.1)
298///
299/// - `execute_with_context()` - Execute with auth, tenant, and tracing context
300/// - `execute_with_timeout()` - Execute with deadline enforcement
301///
302/// # Type Parameters
303///
304/// - `I`: Input type
305/// - `O`: Output type
306#[async_trait]
307pub trait BatchKernel<I, O>: GpuKernel
308where
309    I: Send + Sync,
310    O: Send + Sync,
311{
312    /// Execute the kernel with the given input.
313    ///
314    /// # Arguments
315    ///
316    /// * `input` - The input data for the kernel
317    ///
318    /// # Returns
319    ///
320    /// The kernel output or an error.
321    async fn execute(&self, input: I) -> Result<O>;
322
323    /// Validate the input before execution.
324    ///
325    /// Override to provide custom input validation.
326    fn validate_input(&self, _input: &I) -> Result<()> {
327        Ok(())
328    }
329
330    // ========================================================================
331    // Enterprise Features (0.3.1)
332    // ========================================================================
333
334    /// Execute the kernel with execution context.
335    ///
336    /// Provides authentication, tenant isolation, and distributed tracing
337    /// context for the kernel execution.
338    ///
339    /// # Arguments
340    ///
341    /// * `ctx` - The execution context with auth, tenant, and tracing info
342    /// * `input` - The input data for the kernel
343    ///
344    /// # Returns
345    ///
346    /// The kernel output or an error.
347    ///
348    /// # Default Implementation
349    ///
350    /// Delegates to `execute()` ignoring the context. Override to use context.
351    async fn execute_with_context(&self, ctx: &ExecutionContext, input: I) -> Result<O>
352    where
353        I: 'async_trait,
354    {
355        // Default: ignore context, just execute
356        let _ = ctx;
357        self.execute(input).await
358    }
359
360    /// Execute the kernel with a timeout.
361    ///
362    /// # Arguments
363    ///
364    /// * `input` - The input data for the kernel
365    /// * `timeout` - Maximum execution time
366    ///
367    /// # Returns
368    ///
369    /// The kernel output or a timeout error.
370    async fn execute_with_timeout(&self, input: I, timeout: Duration) -> Result<O>
371    where
372        I: 'async_trait,
373    {
374        match tokio::time::timeout(timeout, self.execute(input)).await {
375            Ok(result) => result,
376            Err(_elapsed) => Err(crate::error::KernelError::Timeout(timeout)),
377        }
378    }
379}
380
381/// Trait for ring (persistent actor) kernels.
382///
383/// Ring kernels are persistent GPU actors with 100-500ns message latency.
384/// State resides permanently in GPU memory.
385///
386/// ## Enterprise Features (0.3.1)
387///
388/// - `handle_secure()` - Handle messages with security context
389///
390/// # Type Parameters
391///
392/// - `M`: Request message type
393/// - `R`: Response message type
394#[async_trait]
395pub trait RingKernelHandler<M, R>: GpuKernel
396where
397    M: RingMessage + Send + Sync,
398    R: RingMessage + Send + Sync,
399{
400    /// Handle an incoming message.
401    ///
402    /// # Arguments
403    ///
404    /// * `ctx` - The ring kernel context with GPU intrinsics
405    /// * `msg` - The incoming message
406    ///
407    /// # Returns
408    ///
409    /// The response message or an error.
410    async fn handle(&self, ctx: &mut RingContext, msg: M) -> Result<R>;
411
412    /// Initialize the kernel state.
413    ///
414    /// Called once when the kernel is first activated.
415    async fn initialize(&self, _ctx: &mut RingContext) -> Result<()> {
416        Ok(())
417    }
418
419    /// Called when the kernel is being shut down.
420    ///
421    /// Use this to clean up resources.
422    async fn ring_shutdown(&self, _ctx: &mut RingContext) -> Result<()> {
423        Ok(())
424    }
425
426    // ========================================================================
427    // Enterprise Features (0.3.1)
428    // ========================================================================
429
430    /// Handle a message with security context.
431    ///
432    /// Provides authentication and tenant isolation for message handling.
433    /// Use this for operations that require authorization checks.
434    ///
435    /// # Arguments
436    ///
437    /// * `ctx` - Secure ring context with auth info
438    /// * `msg` - The incoming message
439    ///
440    /// # Returns
441    ///
442    /// The response message or an error.
443    ///
444    /// # Default Implementation
445    ///
446    /// Delegates to `handle()` ignoring security context. Override to
447    /// implement authorization checks.
448    async fn handle_secure(&self, ctx: &mut SecureRingContext<'_, '_>, msg: M) -> Result<R>
449    where
450        M: 'async_trait,
451        R: 'async_trait,
452    {
453        // Default: ignore security context, delegate to handle
454        self.handle(ctx.ring_ctx, msg).await
455    }
456}
457
458/// Trait for iterative (multi-pass) kernels.
459///
460/// Provides support for algorithms that require multiple iterations
461/// to converge (e.g., PageRank, K-Means).
462///
463/// # Type Parameters
464///
465/// - `S`: State type
466/// - `I`: Input type
467/// - `O`: Output type
468#[async_trait]
469pub trait IterativeKernel<S, I, O>: GpuKernel
470where
471    S: Send + Sync + 'static,
472    I: Send + Sync + 'static,
473    O: Send + Sync + 'static,
474{
475    /// Create the initial state.
476    fn initial_state(&self, input: &I) -> S;
477
478    /// Perform one iteration.
479    ///
480    /// # Arguments
481    ///
482    /// * `state` - The current state (mutable)
483    /// * `input` - The input data
484    ///
485    /// # Returns
486    ///
487    /// The iteration result.
488    async fn iterate(&self, state: &mut S, input: &I) -> Result<IterationResult<O>>;
489
490    /// Check if the algorithm has converged.
491    ///
492    /// # Arguments
493    ///
494    /// * `state` - The current state
495    /// * `threshold` - The convergence threshold
496    ///
497    /// # Returns
498    ///
499    /// `true` if converged, `false` otherwise.
500    fn converged(&self, state: &S, threshold: f64) -> bool;
501
502    /// Maximum number of iterations.
503    fn max_iterations(&self) -> usize {
504        100
505    }
506
507    /// Default convergence threshold.
508    fn default_threshold(&self) -> f64 {
509        1e-6
510    }
511
512    /// Run the iterative algorithm to convergence.
513    async fn run_to_convergence(&self, input: I) -> Result<O> {
514        self.run_to_convergence_with_threshold(input, self.default_threshold())
515            .await
516    }
517
518    /// Run the iterative algorithm with a custom threshold.
519    async fn run_to_convergence_with_threshold(&self, input: I, threshold: f64) -> Result<O> {
520        let mut state = self.initial_state(&input);
521        let max_iter = self.max_iterations();
522
523        for _ in 0..max_iter {
524            let result = self.iterate(&mut state, &input).await?;
525
526            if let IterationResult::Converged(output) = result {
527                return Ok(output);
528            }
529
530            if self.converged(&state, threshold) {
531                if let IterationResult::Continue(output) = result {
532                    return Ok(output);
533                }
534            }
535        }
536
537        // Return final state even if not converged
538        match self.iterate(&mut state, &input).await? {
539            IterationResult::Converged(output) | IterationResult::Continue(output) => Ok(output),
540        }
541    }
542}
543
/// Outcome of one pass of an iterative kernel.
///
/// Both variants carry an output value; the variant only tells the driver
/// whether another pass is wanted.
#[derive(Debug, Clone)]
pub enum IterationResult<O> {
    /// The algorithm has converged; the payload is the final output.
    Converged(O),
    /// The algorithm should keep going; the payload is the intermediate output.
    Continue(O),
}

impl<O> IterationResult<O> {
    /// Returns `true` for `Converged` and `false` for `Continue`.
    #[must_use]
    pub fn is_converged(&self) -> bool {
        match self {
            Self::Converged(_) => true,
            Self::Continue(_) => false,
        }
    }

    /// Consumes the result and returns the carried output, whichever the variant.
    #[must_use]
    pub fn into_output(self) -> O {
        let (Self::Converged(output) | Self::Continue(output)) = self;
        output
    }
}
568
569/// Type-erased batch kernel for registry storage.
570#[async_trait]
571pub trait BatchKernelDyn: GpuKernel {
572    /// Execute with type-erased input/output.
573    async fn execute_dyn(&self, input: &[u8]) -> Result<Vec<u8>>;
574}
575
576/// Type-erased ring kernel for registry storage.
577#[async_trait]
578pub trait RingKernelDyn: GpuKernel {
579    /// Handle with type-erased messages.
580    async fn handle_dyn(&self, ctx: &mut RingContext, msg: &[u8]) -> Result<Vec<u8>>;
581}
582
583// ============================================================================
584// Enterprise Traits (0.3.1)
585// ============================================================================
586
587/// Trait for kernels that support checkpoint/restore.
588///
589/// Enables recovery from failures by saving and restoring kernel state.
590/// Useful for long-running or stateful kernels.
591///
592/// # Type Parameters
593///
594/// - `C`: Checkpoint type (must be serializable)
595#[async_trait]
596pub trait CheckpointableKernel: GpuKernel {
597    /// The checkpoint state type
598    type Checkpoint: Serialize + serde::de::DeserializeOwned + Send + Sync;
599
600    /// Create a checkpoint of current kernel state.
601    ///
602    /// # Returns
603    ///
604    /// A serializable checkpoint that can be used to restore state.
605    async fn checkpoint(&self) -> Result<Self::Checkpoint>;
606
607    /// Restore kernel state from a checkpoint.
608    ///
609    /// # Arguments
610    ///
611    /// * `checkpoint` - Previously saved checkpoint state
612    ///
613    /// # Returns
614    ///
615    /// Ok if state was restored, Err if checkpoint is invalid.
616    async fn restore(&mut self, checkpoint: Self::Checkpoint) -> Result<()>;
617
618    /// Check if checkpointing is currently safe.
619    ///
620    /// Returns false if the kernel is in the middle of an operation
621    /// that cannot be interrupted.
622    fn can_checkpoint(&self) -> bool {
623        true
624    }
625
626    /// Get the size of the checkpoint in bytes (estimate).
627    ///
628    /// Useful for monitoring and capacity planning.
629    fn checkpoint_size_estimate(&self) -> usize {
630        0
631    }
632}
633
634/// Trait for kernels that support graceful degradation.
635///
636/// When resources are constrained, these kernels can operate in
637/// a reduced-functionality mode rather than failing completely.
638pub trait DegradableKernel: GpuKernel {
639    /// Enter degraded mode.
640    ///
641    /// Called when resources are constrained. The kernel should
642    /// reduce functionality while remaining operational.
643    fn enter_degraded_mode(&mut self) -> Result<()>;
644
645    /// Exit degraded mode.
646    ///
647    /// Called when resources are restored. The kernel should
648    /// resume full functionality.
649    fn exit_degraded_mode(&mut self) -> Result<()>;
650
651    /// Check if kernel is in degraded mode.
652    fn is_degraded(&self) -> bool;
653
654    /// Get description of current degradation.
655    fn degradation_info(&self) -> Option<String> {
656        None
657    }
658}
659
#[cfg(test)]
mod tests {
    use super::*;

    /// Both variants report convergence correctly and release their payload.
    #[test]
    fn test_iteration_result() {
        let converged = IterationResult::Converged(42_i32);
        assert!(converged.is_converged());
        assert_eq!(converged.into_output(), 42);

        let continuing = IterationResult::Continue(0_i32);
        assert!(!continuing.is_converged());
    }

    /// The default status is `Healthy` and `Display` yields lowercase names.
    #[test]
    fn test_health_status() {
        assert_eq!(HealthStatus::default(), HealthStatus::Healthy);
        assert_eq!(HealthStatus::Healthy.to_string(), "healthy");
        assert_eq!(HealthStatus::Degraded.to_string(), "degraded");
    }

    /// Builder calls on `ExecutionContext` populate the matching fields.
    #[test]
    fn test_execution_context() {
        let thirty_seconds = Duration::from_secs(30);
        let ctx = ExecutionContext::new()
            .with_user("user123")
            .with_tenant("tenant456")
            .with_timeout(thirty_seconds);

        assert!(ctx.request_id.is_some());
        assert_eq!(ctx.user_id.as_deref(), Some("user123"));
        assert_eq!(ctx.tenant_id.as_deref(), Some("tenant456"));
        assert_eq!(ctx.timeout, Some(thirty_seconds));
    }

    /// Builder calls on `KernelConfig` populate the matching fields.
    #[test]
    fn test_kernel_config() {
        let one_minute = Duration::from_secs(60);
        let config = KernelConfig::new()
            .with_queue_depth(1000)
            .with_timeout(one_minute)
            .with_tracing(true)
            .with_metrics(true);

        assert_eq!(config.max_queue_depth, Some(1000));
        assert_eq!(config.timeout, Some(one_minute));
        assert!(config.tracing_enabled);
        assert!(config.metrics_enabled);
    }
}