rustkernel_core/traits.rs
//! Core kernel traits.
//!
//! This module defines the fundamental traits that kernels implement:
//! - `GpuKernel`: Base trait for all GPU kernels
//! - `BatchKernel`: Trait for batch (CPU-orchestrated) kernels
//! - `RingKernelHandler`: Trait for ring (persistent actor) kernels
//! - `IterativeKernel`: Trait for iterative (multi-pass) kernels
//! - `BatchKernelDyn` / `RingKernelDyn`: Type-erased kernel traits for registry storage
//! - `CheckpointableKernel`: Trait for kernels that support checkpoint/restore (0.3.1)
//! - `DegradableKernel`: Trait for kernels that support graceful degradation (0.3.1)
//!
//! ## Enterprise Features (0.3.1)
//!
//! - Health checking for liveness/readiness probes
//! - Execution context with auth, tenant, and tracing
//! - Secure message handling with authentication
//! - Checkpoint/restore for recovery
15
16use crate::error::Result;
17use crate::kernel::KernelMetadata;
18use async_trait::async_trait;
19use ringkernel_core::{RingContext, RingMessage};
20use serde::{Deserialize, Serialize};
21use std::fmt::Debug;
22use std::time::Duration;
23use uuid::Uuid;
24
25// ============================================================================
26// Health & Status Types
27// ============================================================================
28
29/// Health status for kernel health checks
30#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
31pub enum HealthStatus {
32 /// Kernel is healthy and operational
33 #[default]
34 Healthy,
35 /// Kernel is degraded but still operational
36 Degraded,
37 /// Kernel is unhealthy and should not receive traffic
38 Unhealthy,
39 /// Health status is unknown (check failed)
40 Unknown,
41}
42
43impl std::fmt::Display for HealthStatus {
44 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45 match self {
46 Self::Healthy => write!(f, "healthy"),
47 Self::Degraded => write!(f, "degraded"),
48 Self::Unhealthy => write!(f, "unhealthy"),
49 Self::Unknown => write!(f, "unknown"),
50 }
51 }
52}
53
54// ============================================================================
55// Execution Context Types
56// ============================================================================
57
58/// Execution context for kernel invocations.
59///
60/// Provides authentication, tenant isolation, and distributed tracing context
61/// for kernel execution.
62#[derive(Debug, Clone, Default)]
63pub struct ExecutionContext {
64 /// Request ID for tracing
65 pub request_id: Option<Uuid>,
66 /// Trace ID for distributed tracing
67 pub trace_id: Option<String>,
68 /// Span ID for distributed tracing
69 pub span_id: Option<String>,
70 /// Authenticated user ID (if any)
71 pub user_id: Option<String>,
72 /// Tenant ID for multi-tenancy
73 pub tenant_id: Option<String>,
74 /// Request timeout (if specified)
75 pub timeout: Option<Duration>,
76 /// Additional metadata
77 pub metadata: std::collections::HashMap<String, String>,
78}
79
80impl ExecutionContext {
81 /// Create a new execution context
82 pub fn new() -> Self {
83 Self {
84 request_id: Some(Uuid::new_v4()),
85 ..Default::default()
86 }
87 }
88
89 /// Create context with request ID
90 pub fn with_request_id(mut self, id: Uuid) -> Self {
91 self.request_id = Some(id);
92 self
93 }
94
95 /// Set trace context
96 pub fn with_trace(mut self, trace_id: impl Into<String>, span_id: impl Into<String>) -> Self {
97 self.trace_id = Some(trace_id.into());
98 self.span_id = Some(span_id.into());
99 self
100 }
101
102 /// Set authenticated user
103 pub fn with_user(mut self, user_id: impl Into<String>) -> Self {
104 self.user_id = Some(user_id.into());
105 self
106 }
107
108 /// Set tenant
109 pub fn with_tenant(mut self, tenant_id: impl Into<String>) -> Self {
110 self.tenant_id = Some(tenant_id.into());
111 self
112 }
113
114 /// Set timeout
115 pub fn with_timeout(mut self, timeout: Duration) -> Self {
116 self.timeout = Some(timeout);
117 self
118 }
119
120 /// Add metadata
121 pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
122 self.metadata.insert(key.into(), value.into());
123 self
124 }
125}
126
127/// Secure ring context with authentication.
128///
129/// Wraps `RingContext` with security context for authenticated message handling.
130pub struct SecureRingContext<'ctx, 'ring> {
131 /// The underlying ring context
132 pub ring_ctx: &'ctx mut RingContext<'ring>,
133 /// Execution context with auth info
134 pub exec_ctx: &'ctx ExecutionContext,
135}
136
137impl<'ctx, 'ring> SecureRingContext<'ctx, 'ring> {
138 /// Create a new secure context
139 pub fn new(ring_ctx: &'ctx mut RingContext<'ring>, exec_ctx: &'ctx ExecutionContext) -> Self {
140 Self { ring_ctx, exec_ctx }
141 }
142
143 /// Get the authenticated user ID
144 pub fn user_id(&self) -> Option<&str> {
145 self.exec_ctx.user_id.as_deref()
146 }
147
148 /// Get the tenant ID
149 pub fn tenant_id(&self) -> Option<&str> {
150 self.exec_ctx.tenant_id.as_deref()
151 }
152
153 /// Check if request is authenticated
154 pub fn is_authenticated(&self) -> bool {
155 self.exec_ctx.user_id.is_some()
156 }
157}
158
159// ============================================================================
160// Kernel Configuration
161// ============================================================================
162
163/// Runtime configuration for a kernel instance.
164#[derive(Debug, Clone, Default, Serialize, Deserialize)]
165pub struct KernelConfig {
166 /// Maximum queue depth
167 pub max_queue_depth: Option<usize>,
168 /// Execution timeout
169 pub timeout: Option<Duration>,
170 /// Enable tracing
171 pub tracing_enabled: bool,
172 /// Enable metrics collection
173 pub metrics_enabled: bool,
174 /// Custom configuration values
175 pub custom: std::collections::HashMap<String, serde_json::Value>,
176}
177
178impl KernelConfig {
179 /// Create a new kernel config
180 pub fn new() -> Self {
181 Self::default()
182 }
183
184 /// Set queue depth
185 pub fn with_queue_depth(mut self, depth: usize) -> Self {
186 self.max_queue_depth = Some(depth);
187 self
188 }
189
190 /// Set timeout
191 pub fn with_timeout(mut self, timeout: Duration) -> Self {
192 self.timeout = Some(timeout);
193 self
194 }
195
196 /// Enable tracing
197 pub fn with_tracing(mut self, enabled: bool) -> Self {
198 self.tracing_enabled = enabled;
199 self
200 }
201
202 /// Enable metrics
203 pub fn with_metrics(mut self, enabled: bool) -> Self {
204 self.metrics_enabled = enabled;
205 self
206 }
207
208 /// Set custom value
209 pub fn with_custom(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
210 self.custom.insert(key.into(), value);
211 self
212 }
213}
214
215// ============================================================================
216// Core Kernel Traits
217// ============================================================================
218
219/// Base trait for all GPU kernels.
220///
221/// Provides access to kernel metadata, health checking, and lifecycle management.
222///
223/// ## Enterprise Features (0.3.1)
224///
225/// - `health_check()` - Report kernel health for liveness/readiness probes
226/// - `shutdown()` - Graceful shutdown with resource cleanup
227/// - `refresh_config()` - Hot configuration reload
228pub trait GpuKernel: Send + Sync + Debug {
229 /// Returns the kernel metadata.
230 fn metadata(&self) -> &KernelMetadata;
231
232 /// Validate kernel configuration.
233 ///
234 /// Called before kernel launch to ensure configuration is valid.
235 fn validate(&self) -> Result<()> {
236 Ok(())
237 }
238
239 /// Returns the kernel ID.
240 fn id(&self) -> &str {
241 &self.metadata().id
242 }
243
244 /// Returns true if this kernel requires GPU-native execution.
245 fn requires_gpu_native(&self) -> bool {
246 self.metadata().requires_gpu_native
247 }
248
249 // ========================================================================
250 // Enterprise Features (0.3.1)
251 // ========================================================================
252
253 /// Perform a health check on this kernel.
254 ///
255 /// Used by liveness and readiness probes. Override to implement
256 /// custom health checking logic (e.g., checking GPU memory, connections).
257 ///
258 /// # Returns
259 ///
260 /// The current health status of the kernel.
261 fn health_check(&self) -> HealthStatus {
262 HealthStatus::Healthy
263 }
264
265 /// Graceful shutdown of the kernel.
266 ///
267 /// Called during runtime shutdown to release resources. Override to
268 /// implement custom cleanup (e.g., flushing buffers, closing connections).
269 ///
270 /// Default implementation does nothing.
271 fn shutdown(&self) -> Result<()> {
272 Ok(())
273 }
274
275 /// Refresh kernel configuration at runtime.
276 ///
277 /// Called when configuration is hot-reloaded. Only safe-to-reload
278 /// configuration values should be applied.
279 ///
280 /// # Arguments
281 ///
282 /// * `config` - The new configuration to apply
283 ///
284 /// # Returns
285 ///
286 /// Ok if configuration was applied, Err if configuration is invalid.
287 fn refresh_config(&mut self, _config: &KernelConfig) -> Result<()> {
288 Ok(())
289 }
290}
291
292/// Trait for batch (CPU-orchestrated) kernels.
293///
294/// Batch kernels are launched on-demand with CPU orchestration.
295/// They have 10-50μs launch overhead and state resides in CPU memory.
296///
297/// ## Enterprise Features (0.3.1)
298///
299/// - `execute_with_context()` - Execute with auth, tenant, and tracing context
300/// - `execute_with_timeout()` - Execute with deadline enforcement
301///
302/// # Type Parameters
303///
304/// - `I`: Input type
305/// - `O`: Output type
306#[async_trait]
307pub trait BatchKernel<I, O>: GpuKernel
308where
309 I: Send + Sync,
310 O: Send + Sync,
311{
312 /// Execute the kernel with the given input.
313 ///
314 /// # Arguments
315 ///
316 /// * `input` - The input data for the kernel
317 ///
318 /// # Returns
319 ///
320 /// The kernel output or an error.
321 async fn execute(&self, input: I) -> Result<O>;
322
323 /// Validate the input before execution.
324 ///
325 /// Override to provide custom input validation.
326 fn validate_input(&self, _input: &I) -> Result<()> {
327 Ok(())
328 }
329
330 // ========================================================================
331 // Enterprise Features (0.3.1)
332 // ========================================================================
333
334 /// Execute the kernel with execution context.
335 ///
336 /// Provides authentication, tenant isolation, and distributed tracing
337 /// context for the kernel execution.
338 ///
339 /// # Arguments
340 ///
341 /// * `ctx` - The execution context with auth, tenant, and tracing info
342 /// * `input` - The input data for the kernel
343 ///
344 /// # Returns
345 ///
346 /// The kernel output or an error.
347 ///
348 /// # Default Implementation
349 ///
350 /// Delegates to `execute()` ignoring the context. Override to use context.
351 async fn execute_with_context(&self, ctx: &ExecutionContext, input: I) -> Result<O>
352 where
353 I: 'async_trait,
354 {
355 // Default: ignore context, just execute
356 let _ = ctx;
357 self.execute(input).await
358 }
359
360 /// Execute the kernel with a timeout.
361 ///
362 /// # Arguments
363 ///
364 /// * `input` - The input data for the kernel
365 /// * `timeout` - Maximum execution time
366 ///
367 /// # Returns
368 ///
369 /// The kernel output or a timeout error.
370 async fn execute_with_timeout(&self, input: I, timeout: Duration) -> Result<O>
371 where
372 I: 'async_trait,
373 {
374 match tokio::time::timeout(timeout, self.execute(input)).await {
375 Ok(result) => result,
376 Err(_elapsed) => Err(crate::error::KernelError::Timeout(timeout)),
377 }
378 }
379}
380
381/// Trait for ring (persistent actor) kernels.
382///
383/// Ring kernels are persistent GPU actors with 100-500ns message latency.
384/// State resides permanently in GPU memory.
385///
386/// ## Enterprise Features (0.3.1)
387///
388/// - `handle_secure()` - Handle messages with security context
389///
390/// # Type Parameters
391///
392/// - `M`: Request message type
393/// - `R`: Response message type
394#[async_trait]
395pub trait RingKernelHandler<M, R>: GpuKernel
396where
397 M: RingMessage + Send + Sync,
398 R: RingMessage + Send + Sync,
399{
400 /// Handle an incoming message.
401 ///
402 /// # Arguments
403 ///
404 /// * `ctx` - The ring kernel context with GPU intrinsics
405 /// * `msg` - The incoming message
406 ///
407 /// # Returns
408 ///
409 /// The response message or an error.
410 async fn handle(&self, ctx: &mut RingContext, msg: M) -> Result<R>;
411
412 /// Initialize the kernel state.
413 ///
414 /// Called once when the kernel is first activated.
415 async fn initialize(&self, _ctx: &mut RingContext) -> Result<()> {
416 Ok(())
417 }
418
419 /// Called when the kernel is being shut down.
420 ///
421 /// Use this to clean up resources.
422 async fn ring_shutdown(&self, _ctx: &mut RingContext) -> Result<()> {
423 Ok(())
424 }
425
426 // ========================================================================
427 // Enterprise Features (0.3.1)
428 // ========================================================================
429
430 /// Handle a message with security context.
431 ///
432 /// Provides authentication and tenant isolation for message handling.
433 /// Use this for operations that require authorization checks.
434 ///
435 /// # Arguments
436 ///
437 /// * `ctx` - Secure ring context with auth info
438 /// * `msg` - The incoming message
439 ///
440 /// # Returns
441 ///
442 /// The response message or an error.
443 ///
444 /// # Default Implementation
445 ///
446 /// Delegates to `handle()` ignoring security context. Override to
447 /// implement authorization checks.
448 async fn handle_secure(&self, ctx: &mut SecureRingContext<'_, '_>, msg: M) -> Result<R>
449 where
450 M: 'async_trait,
451 R: 'async_trait,
452 {
453 // Default: ignore security context, delegate to handle
454 self.handle(ctx.ring_ctx, msg).await
455 }
456}
457
458/// Trait for iterative (multi-pass) kernels.
459///
460/// Provides support for algorithms that require multiple iterations
461/// to converge (e.g., PageRank, K-Means).
462///
463/// # Type Parameters
464///
465/// - `S`: State type
466/// - `I`: Input type
467/// - `O`: Output type
468#[async_trait]
469pub trait IterativeKernel<S, I, O>: GpuKernel
470where
471 S: Send + Sync + 'static,
472 I: Send + Sync + 'static,
473 O: Send + Sync + 'static,
474{
475 /// Create the initial state.
476 fn initial_state(&self, input: &I) -> S;
477
478 /// Perform one iteration.
479 ///
480 /// # Arguments
481 ///
482 /// * `state` - The current state (mutable)
483 /// * `input` - The input data
484 ///
485 /// # Returns
486 ///
487 /// The iteration result.
488 async fn iterate(&self, state: &mut S, input: &I) -> Result<IterationResult<O>>;
489
490 /// Check if the algorithm has converged.
491 ///
492 /// # Arguments
493 ///
494 /// * `state` - The current state
495 /// * `threshold` - The convergence threshold
496 ///
497 /// # Returns
498 ///
499 /// `true` if converged, `false` otherwise.
500 fn converged(&self, state: &S, threshold: f64) -> bool;
501
502 /// Maximum number of iterations.
503 fn max_iterations(&self) -> usize {
504 100
505 }
506
507 /// Default convergence threshold.
508 fn default_threshold(&self) -> f64 {
509 1e-6
510 }
511
512 /// Run the iterative algorithm to convergence.
513 async fn run_to_convergence(&self, input: I) -> Result<O> {
514 self.run_to_convergence_with_threshold(input, self.default_threshold())
515 .await
516 }
517
518 /// Run the iterative algorithm with a custom threshold.
519 async fn run_to_convergence_with_threshold(&self, input: I, threshold: f64) -> Result<O> {
520 let mut state = self.initial_state(&input);
521 let max_iter = self.max_iterations();
522
523 for _ in 0..max_iter {
524 let result = self.iterate(&mut state, &input).await?;
525
526 if let IterationResult::Converged(output) = result {
527 return Ok(output);
528 }
529
530 if self.converged(&state, threshold) {
531 if let IterationResult::Continue(output) = result {
532 return Ok(output);
533 }
534 }
535 }
536
537 // Return final state even if not converged
538 match self.iterate(&mut state, &input).await? {
539 IterationResult::Converged(output) | IterationResult::Continue(output) => Ok(output),
540 }
541 }
542}
543
/// Outcome of one pass of an iterative kernel.
///
/// Both variants carry an output value; they differ only in whether the
/// algorithm considers itself finished.
#[derive(Debug, Clone)]
pub enum IterationResult<O> {
    /// The algorithm has converged; the payload is the final output.
    Converged(O),
    /// The algorithm should keep going; the payload is the current
    /// intermediate output.
    Continue(O),
}

impl<O> IterationResult<O> {
    /// Returns `true` for `Converged`, `false` for `Continue`.
    #[must_use]
    pub fn is_converged(&self) -> bool {
        match self {
            Self::Converged(_) => true,
            Self::Continue(_) => false,
        }
    }

    /// Consumes the result and returns the carried output, whichever
    /// variant it is.
    #[must_use]
    pub fn into_output(self) -> O {
        let (Self::Converged(output) | Self::Continue(output)) = self;
        output
    }
}
568
569/// Type-erased batch kernel for registry storage.
570#[async_trait]
571pub trait BatchKernelDyn: GpuKernel {
572 /// Execute with type-erased input/output.
573 async fn execute_dyn(&self, input: &[u8]) -> Result<Vec<u8>>;
574}
575
576/// Type-erased ring kernel for registry storage.
577#[async_trait]
578pub trait RingKernelDyn: GpuKernel {
579 /// Handle with type-erased messages.
580 async fn handle_dyn(&self, ctx: &mut RingContext, msg: &[u8]) -> Result<Vec<u8>>;
581}
582
583// ============================================================================
584// Enterprise Traits (0.3.1)
585// ============================================================================
586
587/// Trait for kernels that support checkpoint/restore.
588///
589/// Enables recovery from failures by saving and restoring kernel state.
590/// Useful for long-running or stateful kernels.
591///
592/// # Type Parameters
593///
594/// - `C`: Checkpoint type (must be serializable)
595#[async_trait]
596pub trait CheckpointableKernel: GpuKernel {
597 /// The checkpoint state type
598 type Checkpoint: Serialize + serde::de::DeserializeOwned + Send + Sync;
599
600 /// Create a checkpoint of current kernel state.
601 ///
602 /// # Returns
603 ///
604 /// A serializable checkpoint that can be used to restore state.
605 async fn checkpoint(&self) -> Result<Self::Checkpoint>;
606
607 /// Restore kernel state from a checkpoint.
608 ///
609 /// # Arguments
610 ///
611 /// * `checkpoint` - Previously saved checkpoint state
612 ///
613 /// # Returns
614 ///
615 /// Ok if state was restored, Err if checkpoint is invalid.
616 async fn restore(&mut self, checkpoint: Self::Checkpoint) -> Result<()>;
617
618 /// Check if checkpointing is currently safe.
619 ///
620 /// Returns false if the kernel is in the middle of an operation
621 /// that cannot be interrupted.
622 fn can_checkpoint(&self) -> bool {
623 true
624 }
625
626 /// Get the size of the checkpoint in bytes (estimate).
627 ///
628 /// Useful for monitoring and capacity planning.
629 fn checkpoint_size_estimate(&self) -> usize {
630 0
631 }
632}
633
634/// Trait for kernels that support graceful degradation.
635///
636/// When resources are constrained, these kernels can operate in
637/// a reduced-functionality mode rather than failing completely.
638pub trait DegradableKernel: GpuKernel {
639 /// Enter degraded mode.
640 ///
641 /// Called when resources are constrained. The kernel should
642 /// reduce functionality while remaining operational.
643 fn enter_degraded_mode(&mut self) -> Result<()>;
644
645 /// Exit degraded mode.
646 ///
647 /// Called when resources are restored. The kernel should
648 /// resume full functionality.
649 fn exit_degraded_mode(&mut self) -> Result<()>;
650
651 /// Check if kernel is in degraded mode.
652 fn is_degraded(&self) -> bool;
653
654 /// Get description of current degradation.
655 fn degradation_info(&self) -> Option<String> {
656 None
657 }
658}
659
#[cfg(test)]
mod tests {
    use super::*;

    /// `Converged` reports convergence and yields its payload; `Continue`
    /// does not report convergence.
    #[test]
    fn test_iteration_result() {
        let finished: IterationResult<i32> = IterationResult::Converged(42);
        assert!(finished.is_converged());
        assert_eq!(finished.into_output(), 42);

        let pending: IterationResult<i32> = IterationResult::Continue(0);
        assert!(!pending.is_converged());
    }

    /// The default status is `Healthy` and statuses display as lowercase
    /// labels.
    #[test]
    fn test_health_status() {
        assert_eq!(HealthStatus::default(), HealthStatus::Healthy);
        assert_eq!(HealthStatus::Healthy.to_string(), "healthy");
        assert_eq!(HealthStatus::Degraded.to_string(), "degraded");
    }

    /// The builder methods populate the matching fields, and `new()`
    /// assigns a request ID.
    #[test]
    fn test_execution_context() {
        let thirty_seconds = Duration::from_secs(30);
        let ctx = ExecutionContext::new()
            .with_user("user123")
            .with_tenant("tenant456")
            .with_timeout(thirty_seconds);

        assert!(ctx.request_id.is_some());
        assert_eq!(ctx.user_id.as_deref(), Some("user123"));
        assert_eq!(ctx.tenant_id.as_deref(), Some("tenant456"));
        assert_eq!(ctx.timeout, Some(thirty_seconds));
    }

    /// The builder methods populate the matching configuration fields.
    #[test]
    fn test_kernel_config() {
        let one_minute = Duration::from_secs(60);
        let config = KernelConfig::new()
            .with_queue_depth(1000)
            .with_timeout(one_minute)
            .with_tracing(true)
            .with_metrics(true);

        assert_eq!(config.max_queue_depth, Some(1000));
        assert_eq!(config.timeout, Some(one_minute));
        assert!(config.tracing_enabled);
        assert!(config.metrics_enabled);
    }
}