Skip to main content

deepstrike_core/runtime/
snapshot.rs

1//! W2-2: First-class `KernelSnapshot` — live kernel state serialization.
2//!
3//! Enables:
4//! - Crash recovery (save state, restart, restore)
5//! - Agent migration (move running agent between hosts)
6//! - Checkpointing for long-running tasks
7//! - Cross-session state persistence
8//!
9//! The snapshot captures the essential kernel state needed to resume execution:
10//! - TaskTable (all TCBs with budget, wait state, proc info)
11//! - Context summary (messages, task state, signals)
12//! - Turn counter and budget totals
13//!
14//! NOT captured (recreated on restore):
15//! - SignalRouter dedup set (cleared on restore)
16//! - Governance pipeline closures (recreated from policy data)
17//! - MilestoneTracker state (recreated from config)
18//! - HandleTable (recreated from context history)
19
20use serde::{Deserialize, Serialize};
21
22use crate::scheduler::tcb::{TaskId, TaskState, Tcb, TaskTable};
23use crate::types::agent::{AgentIsolation, AgentRole, ContextInheritance};
24use crate::types::message::Message;
25
26/// Serializable snapshot of a TCB (Task Control Block).
27#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct TcbSnapshot {
29    pub id: TaskId,
30    pub parent: Option<TaskId>,
31    pub state: TaskState,
32    pub turns: u32,
33    pub total_tokens: u64,
34    pub started_at_ms: Option<u64>,
35    pub max_tokens: u32,
36    pub max_turns: u32,
37    pub max_total_tokens: u64,
38    pub max_wall_ms: Option<u64>,
39    pub wait_reason: Option<String>,  // Serialized as string label
40    pub wait_children: Option<Vec<TaskId>>,  // For SubAgentJoin
41    pub deferred_until: Option<u64>,
42    pub caps: Vec<TaskId>,
43    pub proc: Option<ProcInfoSnapshot>,
44}
45
46/// Snapshot of ProcInfo (sub-agent identity).
47#[derive(Debug, Clone, Serialize, Deserialize)]
48pub struct ProcInfoSnapshot {
49    pub parent_session_id: TaskId,
50    pub role: AgentRole,
51    pub isolation: AgentIsolation,
52    pub context_inheritance: ContextInheritance,
53    pub result: Option<ResultSnapshot>,
54}
55
56/// Snapshot of SubAgentResult.
57#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct ResultSnapshot {
59    pub termination: String,
60}
61
62/// Snapshot of context state (simplified representation).
63#[derive(Debug, Clone, Serialize, Deserialize, Default)]
64pub struct ContextSnapshot {
65    pub system_messages: Vec<Message>,
66    pub knowledge_messages: Vec<Message>,
67    pub task_goal: Option<String>,
68    pub task_plan: Option<String>,
69    pub task_progress: Option<String>,
70    pub task_open_steps: Vec<String>,
71    /// Durable user directives — preserved across snapshot/restore like goal/plan.
72    #[serde(default)]
73    pub task_directives: Vec<String>,
74    pub history_messages: Vec<Message>,
75    pub signals: Vec<String>,
76    pub max_tokens: u32,
77    pub sprint: u32,
78}
79
80impl ContextSnapshot {
81    /// Create a snapshot from context manager state.
82    pub fn from_context(ctx: &crate::context::manager::ContextManager) -> Self {
83        // Convert plan steps to JSON string representation
84        let task_plan = if ctx.partitions.task_state.plan.is_empty() {
85            None
86        } else {
87            serde_json::to_string(&ctx.partitions.task_state.plan).ok()
88        };
89
90        Self {
91            system_messages: ctx.partitions.system.messages.clone(),
92            knowledge_messages: ctx.partitions.knowledge.messages.clone(),
93            task_goal: Some(ctx.partitions.task_state.goal.clone()),
94            task_plan,
95            task_progress: Some(ctx.partitions.task_state.progress.clone()),
96            task_open_steps: ctx.partitions.task_state.open_steps(),
97            task_directives: ctx.partitions.task_state.directives.clone(),
98            history_messages: ctx.partitions.history.messages.clone(),
99            signals: ctx.partitions.signals.clone(),
100            max_tokens: ctx.max_tokens,
101            sprint: ctx.sprint,
102        }
103    }
104}
105
106/// Full kernel snapshot.
107#[derive(Debug, Clone, Serialize, Deserialize)]
108pub struct KernelSnapshot {
109    pub turn: u32,
110    pub total_tokens: u64,
111    pub tasks: Vec<TcbSnapshot>,
112    pub context: ContextSnapshot,
113    pub run_spec: Option<String>,  // JSON-encoded AgentRunSpec
114}
115
116impl KernelSnapshot {
117    /// Create a snapshot from kernel state components.
118    pub fn from_state(
119        turn: u32,
120        total_tokens: u64,
121        tasks: &TaskTable,
122        context: &ContextSnapshot,
123        run_spec: Option<&crate::AgentRunSpec>,
124    ) -> Self {
125        Self {
126            turn,
127            total_tokens,
128            tasks: tasks.all().iter().map(TcbSnapshot::from).collect(),
129            context: context.clone(),
130            run_spec: run_spec.and_then(|s| serde_json::to_string(s).ok()),
131        }
132    }
133
134    /// Convert back to AgentRunSpec if present.
135    pub fn run_spec(&self) -> Option<crate::AgentRunSpec> {
136        self.run_spec.as_ref().and_then(|s| serde_json::from_str(s).ok())
137    }
138
139    /// W2-2: Convert to OsSnapshot-compatible process records for verification with
140    /// `rebuild_os_snapshot_from_events`. This enables cross-validation between the
141    /// live state snapshot and the event-log-derived audit view.
142    pub fn to_os_process_records(&self) -> Vec<crate::runtime::replay::ProcessRecord> {
143        use crate::runtime::replay::ProcessRecord;
144        let mut records = Vec::new();
145        for tcb_snap in &self.tasks {
146            // Only include tasks with ProcInfo (sub-agents), not the root task
147            if let Some(proc) = &tcb_snap.proc {
148                records.push(ProcessRecord {
149                    turn: tcb_snap.turns,
150                    agent_id: tcb_snap.id.to_string(),
151                    parent_session_id: proc.parent_session_id.to_string(),
152                    state: tcb_snap.state.label().to_string(),
153                });
154            }
155        }
156        records
157    }
158
159    /// W2-2: Restore TCB from snapshot. Returns None if the snapshot data is invalid.
160    pub fn restore_tcb(&self, snapshot: &TcbSnapshot) -> Option<Tcb> {
161        use crate::scheduler::policy::SchedulerBudget;
162
163        // Reconstruct BudgetLedger limits
164        let limits = SchedulerBudget {
165            max_tokens: snapshot.max_tokens,
166            max_turns: snapshot.max_turns,
167            max_total_tokens: snapshot.max_total_tokens,
168            max_wall_ms: snapshot.max_wall_ms,
169        };
170
171        // Reconstruct wait reason from label
172        let wait = snapshot.wait_reason.as_ref().and_then(|label| match label.as_str() {
173            "approval" => Some(crate::scheduler::tcb::WaitReason::Approval),
174            "sub_agent_join" => snapshot.wait_children.as_ref().map(|children| {
175                crate::scheduler::tcb::WaitReason::SubAgentJoin(
176                    children.iter().map(|id| id.clone().into()).collect()
177                )
178            }),
179            "tool" => Some(crate::scheduler::tcb::WaitReason::Tool),
180            "milestone" => Some(crate::scheduler::tcb::WaitReason::Milestone),
181            "signal" => Some(crate::scheduler::tcb::WaitReason::Signal),
182            "external" => Some(crate::scheduler::tcb::WaitReason::External),
183            _ => None,
184        });
185
186        // Reconstruct ProcInfo if present
187        let proc = snapshot.proc.as_ref().and_then(|p| {
188            let result = p.result.as_ref().and_then(|r| {
189                // Parse termination string back to TerminationReason
190                match r.termination.as_str() {
191                    "\"Completed\"" | "Completed" => Some(crate::types::result::SubAgentResult {
192                        agent_id: snapshot.id.clone(),
193                        result: crate::types::result::LoopResult {
194                            termination: crate::types::result::TerminationReason::Completed,
195                            final_message: None,
196                            turns_used: 0,
197                            total_tokens_used: 0,
198                            loop_continue: None,
199                            classify_branch: None,
200                            tournament_winner: None,
201                        },
202                    }),
203                    _ => None,
204                }
205            });
206
207            Some(crate::scheduler::tcb::ProcInfo {
208                parent_session_id: p.parent_session_id.clone(),
209                role: p.role,
210                isolation: p.isolation,
211                context_inheritance: p.context_inheritance,
212                result,
213            })
214        });
215
216        Some(Tcb {
217            id: snapshot.id.clone(),
218            parent: snapshot.parent.clone(),
219            state: snapshot.state,
220            budget: crate::scheduler::tcb::BudgetLedger {
221                limits,
222                turns: snapshot.turns,
223                total_tokens: snapshot.total_tokens,
224                started_at_ms: snapshot.started_at_ms,
225            },
226            wait,
227            caps: snapshot.caps.clone(),
228            proc,
229            deferred_until: snapshot.deferred_until,
230        })
231    }
232}
233
234impl From<&Tcb> for TcbSnapshot {
235    fn from(tcb: &Tcb) -> Self {
236        Self {
237            id: tcb.id.clone(),
238            parent: tcb.parent.clone(),
239            state: tcb.state.clone(),
240            turns: tcb.budget.turns,
241            total_tokens: tcb.budget.total_tokens,
242            started_at_ms: tcb.budget.started_at_ms,
243            max_tokens: tcb.budget.limits.max_tokens,
244            max_turns: tcb.budget.limits.max_turns,
245            max_total_tokens: tcb.budget.limits.max_total_tokens,
246            max_wall_ms: tcb.budget.limits.max_wall_ms,
247            wait_reason: tcb.wait.as_ref().map(|w| w.label().to_string()),
248            wait_children: match &tcb.wait {
249                Some(crate::scheduler::tcb::WaitReason::SubAgentJoin(children)) => {
250                    Some(children.clone())
251                }
252                _ => None,
253            },
254            deferred_until: tcb.deferred_until,
255            caps: tcb.caps.clone(),
256            proc: tcb.proc.as_ref().map(|p| ProcInfoSnapshot {
257                parent_session_id: p.parent_session_id.clone(),
258                role: p.role,
259                isolation: p.isolation,
260                context_inheritance: p.context_inheritance,
261                result: p.result.as_ref().map(|r| ResultSnapshot {
262                    termination: format!("{:?}", r.result.termination),
263                }),
264            }),
265        }
266    }
267}
268
#[cfg(test)]
mod tests {
    use super::*;
    use crate::scheduler::policy::SchedulerBudget;
    use crate::scheduler::tcb::WaitReason;

    /// Budget with small explicit limits, shared by the TCB-level tests.
    fn bounded_budget() -> SchedulerBudget {
        SchedulerBudget {
            max_tokens: 128_000,
            max_turns: 10,
            max_total_tokens: 1000,
            max_wall_ms: Some(60000),
        }
    }

    /// Kernel snapshot at turn 1 with 100 tokens, a default context, no run spec, and
    /// the given tasks.
    fn kernel_with(tasks: Vec<TcbSnapshot>) -> KernelSnapshot {
        KernelSnapshot {
            turn: 1,
            total_tokens: 100,
            tasks,
            context: ContextSnapshot::default(),
            run_spec: None,
        }
    }

    /// Converting a TCB copies its id, budget counters and deferral time.
    #[test]
    fn tcb_snapshot_roundtrip() {
        let mut task = Tcb::root("test-task", bounded_budget());
        task.budget.turns = 5;
        task.budget.total_tokens = 500;
        task.deferred_until = Some(1000);

        let image = TcbSnapshot::from(&task);

        assert_eq!(image.id.as_str(), "test-task");
        assert_eq!(image.turns, 5);
        assert_eq!(image.total_tokens, 500);
        assert_eq!(image.deferred_until, Some(1000));
    }

    /// A kernel snapshot survives a JSON encode/decode cycle.
    #[test]
    fn kernel_snapshot_serializes() {
        let encoded = serde_json::to_string(&kernel_with(Vec::new())).expect("serialize");
        let decoded: KernelSnapshot = serde_json::from_str(&encoded).expect("deserialize");

        assert_eq!(decoded.turn, 1);
        assert_eq!(decoded.total_tokens, 100);
    }

    /// Explicitly set fields are kept; the rest come from `Default`.
    #[test]
    fn context_snapshot_captures_fields() {
        let snapshot = ContextSnapshot {
            system_messages: vec![Message::system("You are helpful")],
            task_goal: Some("Build something".to_string()),
            ..Default::default()
        };

        assert_eq!(snapshot.system_messages.len(), 1);
        assert_eq!(snapshot.task_goal.as_deref(), Some("Build something"));
    }

    /// `from_state` records the counters and one entry per task in the table.
    #[test]
    fn snapshot_from_state_captures_tasks() {
        let mut tasks = TaskTable::new();
        for name in ["root", "child"] {
            tasks.insert(Tcb::root(name, SchedulerBudget::default()));
        }

        let kernel =
            KernelSnapshot::from_state(5, 1000, &tasks, &ContextSnapshot::default(), None);

        assert_eq!(kernel.turn, 5);
        assert_eq!(kernel.total_tokens, 1000);
        assert_eq!(kernel.tasks.len(), 2);
    }

    /// `from_context` copies partitions and limits out of a live context manager.
    #[test]
    fn context_snapshot_from_manager() {
        use crate::context::manager::ContextManager;
        use crate::types::message::Message;

        let mut manager = ContextManager::new(1000);
        manager.partitions.system.push(Message::system("You are helpful"), 10);
        manager.partitions.task_state.goal = "Test goal".to_string();

        let snapshot = ContextSnapshot::from_context(&manager);

        assert_eq!(snapshot.system_messages.len(), 1);
        assert_eq!(snapshot.task_goal.as_deref(), Some("Test goal"));
        assert_eq!(snapshot.max_tokens, 1000);
        // Empty plan becomes None
        assert!(snapshot.task_plan.is_none());
    }

    /// Snapshot followed by restore preserves id, state and turn counter.
    #[test]
    fn tcb_restore_roundtrip() {
        let source = Tcb::root("test-task", bounded_budget());
        let image = TcbSnapshot::from(&source);
        let kernel = kernel_with(vec![image.clone()]);

        let rebuilt = kernel.restore_tcb(&image);
        assert!(rebuilt.is_some());

        let rebuilt = rebuilt.unwrap();
        assert_eq!(rebuilt.id.as_str(), "test-task");
        assert_eq!(rebuilt.state, source.state);
        assert_eq!(rebuilt.budget.turns, source.budget.turns);
    }

    /// A `SubAgentJoin` wait comes back with its child ids in the original order.
    #[test]
    fn tcb_restore_with_wait_reason() {
        let mut waiting = Tcb::root("waiting-task", SchedulerBudget::default());
        waiting.wait = Some(WaitReason::SubAgentJoin(vec![
            "child-1".into(),
            "child-2".into(),
        ]));

        let image = TcbSnapshot::from(&waiting);
        let kernel = kernel_with(vec![image.clone()]);

        let rebuilt = kernel.restore_tcb(&image).expect("restore should succeed");
        match rebuilt.wait {
            Some(WaitReason::SubAgentJoin(children)) => {
                assert_eq!(children.len(), 2);
                assert_eq!(children[0].as_str(), "child-1");
                assert_eq!(children[1].as_str(), "child-2");
            }
            other => panic!("Expected SubAgentJoin, got {:?}", other),
        }
    }

    /// Only tasks with `ProcInfo` are turned into process records.
    #[test]
    fn kernel_snapshot_to_os_process_records() {
        // Root task (no ProcInfo - should be filtered out)
        let root = TcbSnapshot {
            id: "root".into(),
            parent: None,
            state: TaskState::Running,
            turns: 5,
            total_tokens: 500,
            started_at_ms: Some(0),
            max_tokens: 128_000,
            max_turns: 100,
            max_total_tokens: 1_000_000,
            max_wall_ms: None,
            wait_reason: None,
            wait_children: None,
            deferred_until: None,
            caps: vec![],
            proc: None,
        };

        // Sub-agent task (has ProcInfo - should be included)
        let child = TcbSnapshot {
            id: "child-1".into(),
            parent: Some("root".into()),
            state: TaskState::Done(crate::types::result::TerminationReason::Completed),
            turns: 3,
            total_tokens: 300,
            started_at_ms: Some(100),
            max_tokens: 64_000,
            max_turns: 50,
            max_total_tokens: 500_000,
            max_wall_ms: None,
            wait_reason: None,
            wait_children: None,
            deferred_until: None,
            caps: vec![],
            proc: Some(ProcInfoSnapshot {
                parent_session_id: "root".into(),
                role: AgentRole::Implement,
                isolation: AgentIsolation::Shared,
                context_inheritance: ContextInheritance::None,
                result: None,
            }),
        };

        let kernel = KernelSnapshot {
            turn: 5,
            total_tokens: 1000,
            tasks: vec![root, child],
            context: ContextSnapshot::default(),
            run_spec: None,
        };

        let records = kernel.to_os_process_records();

        assert_eq!(records.len(), 1); // Root task filtered out
        let record = &records[0];
        assert_eq!(record.agent_id, "child-1");
        assert_eq!(record.parent_session_id, "root");
        assert_eq!(record.turn, 3);
        assert_eq!(record.state, "done"); // Done state -> "done" label
    }

    /// Records derived from a live state machine's snapshot describe the spawned sub-agent.
    #[test]
    fn kernel_snapshot_to_os_records_matches_state_machine() {
        use crate::scheduler::policy::LoopPolicy;
        use crate::scheduler::state_machine::LoopStateMachine;
        use crate::types::agent::{AgentIdentity, AgentRole, AgentRunSpec};

        // Create a state machine and start the parent task
        let policy = LoopPolicy {
            max_tokens: 128_000,
            ..Default::default()
        };
        let mut machine = LoopStateMachine::new(policy);
        machine.start(crate::types::task::RuntimeTask::new("parent task"));

        // Spawn sub-agent
        let child_spec = AgentRunSpec::new(
            AgentIdentity::sub_agent("child", "child-session"),
            AgentRole::Implement,
            "child task",
        );
        let _ = machine.spawn_sub_agent(child_spec, "parent-sess");

        // Take snapshot and convert to OsSnapshot records
        let records = machine.snapshot().to_os_process_records();

        // Should have exactly one sub-agent record
        assert_eq!(records.len(), 1);
        let record = &records[0];
        assert_eq!(record.agent_id, "child");
        assert_eq!(record.parent_session_id, "parent-sess");
        // State should be "running" or "suspended" (depends on when snapshot was taken)
        assert!(
            record.state == "running" || record.state == "suspended",
            "unexpected state: {}",
            record.state
        );
    }
}