pub struct CheckpointManager { /* private fields */ }
Manages periodic checkpointing for persistent GPU actors.
The CheckpointManager orchestrates the checkpoint lifecycle:
- Periodically determines when a snapshot is due
- Issues SnapshotRequests (caller sends as H2K commands)
- Processes SnapshotResponses (caller feeds from K2H responses)
- Persists completed checkpoints to storage
- Enforces retention policy (deletes old checkpoints)
§Usage
use ringkernel_core::checkpoint::{CheckpointConfig, CheckpointManager};
use std::time::Duration;
let config = CheckpointConfig::new(Duration::from_secs(10))
    .with_max_snapshots(3)
    .with_storage_path("/tmp/checkpoints");
let mut manager = CheckpointManager::new(config);
manager.register_actor(0, "wave_sim_0", "fdtd_3d");
// In your poll loop:
for request in manager.poll_due_snapshots() {
    // Send as H2K SnapshotActor command
    h2k_queue.send(H2KMessage::snapshot_actor(
        request.request_id,
        request.actor_slot,
        request.buffer_offset,
    ));
}
// When K2H SnapshotComplete arrives:
manager.complete_snapshot(SnapshotResponse { ... })?;
Implementations§
Source§impl CheckpointManager
impl CheckpointManager
Sourcepub fn new(config: CheckpointConfig) -> Self
pub fn new(config: CheckpointConfig) -> Self
Create a new checkpoint manager with file storage at the configured path.
Sourcepub fn with_storage(
config: CheckpointConfig,
storage: Box<dyn CheckpointStorage>,
) -> Self
pub fn with_storage( config: CheckpointConfig, storage: Box<dyn CheckpointStorage>, ) -> Self
Create a checkpoint manager with a custom storage backend.
Sourcepub fn register_actor(
&mut self,
actor_slot: u32,
kernel_id: impl Into<String>,
kernel_type: impl Into<String>,
)
pub fn register_actor( &mut self, actor_slot: u32, kernel_id: impl Into<String>, kernel_type: impl Into<String>, )
Register an actor for periodic checkpointing.
Sourcepub fn unregister_actor(&mut self, actor_slot: u32)
pub fn unregister_actor(&mut self, actor_slot: u32)
Unregister an actor from checkpointing.
Sourcepub fn is_enabled(&self) -> bool
pub fn is_enabled(&self) -> bool
Check if checkpointing is enabled.
Sourcepub fn config(&self) -> &CheckpointConfig
pub fn config(&self) -> &CheckpointConfig
Get the checkpoint configuration.
Sourcepub fn pending_count(&self) -> usize
pub fn pending_count(&self) -> usize
Get the number of pending snapshot requests.
Sourcepub fn total_completed(&self) -> u64
pub fn total_completed(&self) -> u64
Get total completed snapshots.
Sourcepub fn total_failed(&self) -> u64
pub fn total_failed(&self) -> u64
Get total failed snapshots.
Sourcepub fn poll_due_snapshots(&mut self) -> Vec<SnapshotRequest>
pub fn poll_due_snapshots(&mut self) -> Vec<SnapshotRequest>
Poll for actors that are due for a snapshot.
Returns a list of SnapshotRequests that should be sent to the device
as H2K SnapshotActor commands.
Each actor is only requested once per interval, and only if no prior request for that actor is still pending.
Sourcepub fn complete_snapshot(
&mut self,
response: SnapshotResponse,
) -> Result<Option<String>>
pub fn complete_snapshot( &mut self, response: SnapshotResponse, ) -> Result<Option<String>>
Process a completed snapshot response from the device.
If the snapshot succeeded, the data is persisted to storage and the retention policy is enforced.
Returns the checkpoint name on success.
Sourcepub fn request_snapshot(&mut self, actor_slot: u32) -> Option<SnapshotRequest>
pub fn request_snapshot(&mut self, actor_slot: u32) -> Option<SnapshotRequest>
Manually request a snapshot for a specific actor, bypassing the interval timer.
This is useful for on-demand snapshots (e.g., before a risky operation)
or in tests. Returns None if the actor is not registered.
Sourcepub fn cancel_pending(&mut self, request_id: u64) -> bool
pub fn cancel_pending(&mut self, request_id: u64) -> bool
Cancel a pending snapshot request.
Returns true if the request was found and cancelled.
Sourcepub fn cancel_all_pending(&mut self)
pub fn cancel_all_pending(&mut self)
Cancel all pending snapshot requests.
Sourcepub fn load_latest(&self, actor_slot: u32) -> Result<Option<Checkpoint>>
pub fn load_latest(&self, actor_slot: u32) -> Result<Option<Checkpoint>>
Load the most recent checkpoint for an actor.
Sourcepub fn list_checkpoints(&self, actor_slot: u32) -> Result<Vec<String>>
pub fn list_checkpoints(&self, actor_slot: u32) -> Result<Vec<String>>
List all checkpoint names for an actor.
Sourcepub fn storage(&self) -> &dyn CheckpointStorage
pub fn storage(&self) -> &dyn CheckpointStorage
Get a reference to the storage backend.