1use std::collections::HashMap;
2
3use serde_json::json;
4use serde_json::Value;
5use tokio::fs;
6use uuid::Uuid;
7
8use crate::audit::append_protected_audit_event;
9use crate::automation_v2::governance::*;
10use crate::{now_ms, AppState};
11
12const GOVERNANCE_AUDIT_EVENT_PREFIX: &str = "automation.governance";
13
14fn default_human_provenance(
15 creator_id: Option<String>,
16 source: impl Into<String>,
17) -> AutomationProvenanceRecord {
18 AutomationProvenanceRecord::human(creator_id, source)
19}
20
21fn declared_capabilities_for_automation(
22 automation: &crate::AutomationV2Spec,
23) -> AutomationDeclaredCapabilities {
24 AutomationDeclaredCapabilities::from_metadata(automation.metadata.as_ref())
25}
26
27impl AppState {
28 pub async fn load_automation_governance(&self) -> anyhow::Result<()> {
29 if !self.automation_governance_path.exists() {
30 return Ok(());
31 }
32 let raw = fs::read_to_string(&self.automation_governance_path).await?;
33 let parsed = serde_json::from_str::<GovernanceState>(&raw).unwrap_or_default();
34 *self.automation_governance.write().await = parsed;
35 Ok(())
36 }
37
38 pub async fn persist_automation_governance(&self) -> anyhow::Result<()> {
39 if let Some(parent) = self.automation_governance_path.parent() {
40 fs::create_dir_all(parent).await?;
41 }
42 let payload = {
43 let guard = self.automation_governance.read().await;
44 serde_json::to_string_pretty(&*guard)?
45 };
46 fs::write(&self.automation_governance_path, payload).await?;
47 Ok(())
48 }
49
50 async fn persist_automation_governance_locked(&self) -> anyhow::Result<()> {
51 self.persist_automation_governance().await
52 }
53
54 pub async fn bootstrap_automation_governance(&self) -> anyhow::Result<usize> {
55 let automations = self.list_automations_v2().await;
56 let now = now_ms();
57 let mut inserted = 0usize;
58 {
59 let mut guard = self.automation_governance.write().await;
60 for automation in automations {
61 if guard.records.contains_key(&automation.automation_id) {
62 continue;
63 }
64 guard.records.insert(
65 automation.automation_id.clone(),
66 AutomationGovernanceRecord {
67 automation_id: automation.automation_id.clone(),
68 provenance: default_human_provenance(
69 Some(automation.creator_id.clone()),
70 "migration_or_legacy_default",
71 ),
72 declared_capabilities: declared_capabilities_for_automation(&automation),
73 modify_grants: Vec::new(),
74 capability_grants: Vec::new(),
75 created_at_ms: automation.created_at_ms.max(now),
76 updated_at_ms: now,
77 deleted_at_ms: None,
78 delete_retention_until_ms: None,
79 published_externally: false,
80 creation_paused: false,
81 review_required: false,
82 review_kind: None,
83 review_requested_at_ms: None,
84 review_request_id: None,
85 last_reviewed_at_ms: None,
86 runs_since_review: 0,
87 expires_at_ms: None,
88 expired_at_ms: None,
89 retired_at_ms: None,
90 retire_reason: None,
91 paused_for_lifecycle: false,
92 health_last_checked_at_ms: None,
93 health_findings: Vec::new(),
94 },
95 );
96 inserted += 1;
97 }
98 guard.updated_at_ms = now;
99 }
100 if inserted > 0 {
101 self.persist_automation_governance().await?;
102 }
103 Ok(inserted)
104 }
105
106 pub async fn get_automation_governance(
107 &self,
108 automation_id: &str,
109 ) -> Option<AutomationGovernanceRecord> {
110 self.automation_governance
111 .read()
112 .await
113 .records
114 .get(automation_id)
115 .cloned()
116 }
117
118 pub async fn get_or_bootstrap_automation_governance(
119 &self,
120 automation: &crate::AutomationV2Spec,
121 ) -> AutomationGovernanceRecord {
122 if let Some(record) = self
123 .get_automation_governance(&automation.automation_id)
124 .await
125 {
126 return record;
127 }
128 let record = AutomationGovernanceRecord {
129 automation_id: automation.automation_id.clone(),
130 provenance: default_human_provenance(
131 Some(automation.creator_id.clone()),
132 "legacy_default",
133 ),
134 declared_capabilities: declared_capabilities_for_automation(automation),
135 modify_grants: Vec::new(),
136 capability_grants: Vec::new(),
137 created_at_ms: automation.created_at_ms,
138 updated_at_ms: now_ms(),
139 deleted_at_ms: None,
140 delete_retention_until_ms: None,
141 published_externally: false,
142 creation_paused: false,
143 review_required: false,
144 review_kind: None,
145 review_requested_at_ms: None,
146 review_request_id: None,
147 last_reviewed_at_ms: None,
148 runs_since_review: 0,
149 expires_at_ms: None,
150 expired_at_ms: None,
151 retired_at_ms: None,
152 retire_reason: None,
153 paused_for_lifecycle: false,
154 health_last_checked_at_ms: None,
155 health_findings: Vec::new(),
156 };
157 let _ = self.upsert_automation_governance(record.clone()).await;
158 record
159 }
160
161 pub async fn upsert_automation_governance(
162 &self,
163 mut record: AutomationGovernanceRecord,
164 ) -> anyhow::Result<AutomationGovernanceRecord> {
165 if record.automation_id.trim().is_empty() {
166 anyhow::bail!("automation_id is required");
167 }
168 let now = now_ms();
169 if record.created_at_ms == 0 {
170 record.created_at_ms = now;
171 }
172 record.updated_at_ms = now;
173 {
174 let mut guard = self.automation_governance.write().await;
175 guard
176 .records
177 .insert(record.automation_id.clone(), record.clone());
178 guard.updated_at_ms = now;
179 }
180 self.persist_automation_governance().await?;
181 let _ = append_protected_audit_event(
182 self,
183 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.record.updated"),
184 &tandem_types::TenantContext::local_implicit(),
185 record
186 .provenance
187 .creator
188 .actor_id
189 .clone()
190 .or_else(|| record.provenance.creator.source.clone()),
191 json!({
192 "automationID": record.automation_id,
193 "provenance": record.provenance,
194 "declaredCapabilities": record.declared_capabilities,
195 "publishedExternally": record.published_externally,
196 "creationPaused": record.creation_paused,
197 }),
198 )
199 .await;
200 Ok(record)
201 }
202
203 pub async fn set_automation_governance_provenance(
204 &self,
205 automation_id: &str,
206 provenance: AutomationProvenanceRecord,
207 ) -> anyhow::Result<AutomationGovernanceRecord> {
208 let mut record = self
209 .get_automation_governance(automation_id)
210 .await
211 .unwrap_or_else(|| AutomationGovernanceRecord {
212 automation_id: automation_id.to_string(),
213 provenance: provenance.clone(),
214 declared_capabilities: AutomationDeclaredCapabilities::default(),
215 modify_grants: Vec::new(),
216 capability_grants: Vec::new(),
217 created_at_ms: now_ms(),
218 updated_at_ms: now_ms(),
219 deleted_at_ms: None,
220 delete_retention_until_ms: None,
221 published_externally: false,
222 creation_paused: false,
223 review_required: false,
224 review_kind: None,
225 review_requested_at_ms: None,
226 review_request_id: None,
227 last_reviewed_at_ms: None,
228 runs_since_review: 0,
229 expires_at_ms: None,
230 expired_at_ms: None,
231 retired_at_ms: None,
232 retire_reason: None,
233 paused_for_lifecycle: false,
234 health_last_checked_at_ms: None,
235 health_findings: Vec::new(),
236 });
237 record.provenance = provenance;
238 if record.expires_at_ms.is_none()
239 && record.provenance.creator.kind == GovernanceActorKind::Agent
240 {
241 let default_expires_after_ms = self
242 .automation_governance
243 .read()
244 .await
245 .limits
246 .default_expires_after_ms;
247 if default_expires_after_ms > 0 {
248 record.expires_at_ms = Some(now_ms().saturating_add(default_expires_after_ms));
249 }
250 }
251 let stored = self.upsert_automation_governance(record).await?;
252 if let Some(agent_id) = stored
253 .provenance
254 .creator
255 .actor_id
256 .as_deref()
257 .filter(|_| stored.provenance.creator.kind == GovernanceActorKind::Agent)
258 {
259 let _ = self
260 .record_agent_creation_review_progress(agent_id, &stored.automation_id)
261 .await;
262 }
263 Ok(stored)
264 }
265
266 pub async fn sync_automation_governance_from_spec(
267 &self,
268 automation: &crate::AutomationV2Spec,
269 provenance: Option<AutomationProvenanceRecord>,
270 ) -> anyhow::Result<AutomationGovernanceRecord> {
271 let now = now_ms();
272 let mut record = self
273 .get_automation_governance(&automation.automation_id)
274 .await
275 .unwrap_or_else(|| AutomationGovernanceRecord {
276 automation_id: automation.automation_id.clone(),
277 provenance: provenance.clone().unwrap_or_else(|| {
278 default_human_provenance(Some(automation.creator_id.clone()), "sync_default")
279 }),
280 declared_capabilities: declared_capabilities_for_automation(automation),
281 modify_grants: Vec::new(),
282 capability_grants: Vec::new(),
283 created_at_ms: automation.created_at_ms,
284 updated_at_ms: now,
285 deleted_at_ms: None,
286 delete_retention_until_ms: None,
287 published_externally: false,
288 creation_paused: false,
289 review_required: false,
290 review_kind: None,
291 review_requested_at_ms: None,
292 review_request_id: None,
293 last_reviewed_at_ms: None,
294 runs_since_review: 0,
295 expires_at_ms: None,
296 expired_at_ms: None,
297 retired_at_ms: None,
298 retire_reason: None,
299 paused_for_lifecycle: false,
300 health_last_checked_at_ms: None,
301 health_findings: Vec::new(),
302 });
303 if let Some(provenance) = provenance {
304 record.provenance = provenance;
305 }
306 record.declared_capabilities = declared_capabilities_for_automation(automation);
307 if record.created_at_ms == 0 {
308 record.created_at_ms = automation.created_at_ms;
309 }
310 record.updated_at_ms = now;
311 {
312 let mut guard = self.automation_governance.write().await;
313 guard
314 .records
315 .insert(record.automation_id.clone(), record.clone());
316 guard.updated_at_ms = now;
317 }
318 self.persist_automation_governance().await?;
319 if let Some(agent_id) = record
320 .provenance
321 .creator
322 .actor_id
323 .as_deref()
324 .filter(|_| record.provenance.creator.kind == GovernanceActorKind::Agent)
325 {
326 let _ = self
327 .record_agent_creation_review_progress(agent_id, &record.automation_id)
328 .await;
329 }
330 Ok(record)
331 }
332
333 pub async fn pause_automation_creation_for_agent(
334 &self,
335 agent_id: &str,
336 paused: bool,
337 ) -> anyhow::Result<()> {
338 let mut guard = self.automation_governance.write().await;
339 if paused {
340 if !guard.paused_agents.iter().any(|value| value == agent_id) {
341 guard.paused_agents.push(agent_id.to_string());
342 }
343 } else {
344 guard.paused_agents.retain(|value| value != agent_id);
345 }
346 guard.updated_at_ms = now_ms();
347 drop(guard);
348 self.persist_automation_governance().await?;
349 Ok(())
350 }
351
352 pub async fn can_create_automation_for_actor(
353 &self,
354 actor: &GovernanceActorRef,
355 provenance: &AutomationProvenanceRecord,
356 declared_capabilities: &AutomationDeclaredCapabilities,
357 ) -> Result<(), GovernanceError> {
358 let guard = self.automation_governance.read().await;
359 let limits = &guard.limits;
360 if !limits.creation_enabled {
361 return Err(GovernanceError::forbidden(
362 "AUTOMATION_V2_CREATION_DISABLED",
363 "agent automation creation is disabled for this tenant",
364 ));
365 }
366 if matches!(actor.kind, GovernanceActorKind::Agent) {
367 let agent_id = actor.actor_id.as_deref().unwrap_or_default();
368 if agent_id.is_empty() {
369 return Err(GovernanceError::forbidden(
370 "AUTOMATION_V2_AGENT_ID_REQUIRED",
371 "agent automation creation requires an agent identifier",
372 ));
373 }
374 if guard.is_agent_paused(agent_id) {
375 return Err(GovernanceError::forbidden(
376 "AUTOMATION_V2_AGENT_CREATION_PAUSED",
377 "this agent is paused from creating automations",
378 ));
379 }
380 if guard.is_agent_spend_paused(agent_id)
381 && !guard.has_approved_agent_quota_override(agent_id)
382 {
383 return Err(GovernanceError::too_many_requests(
384 "AUTOMATION_V2_AGENT_SPEND_CAP_EXCEEDED",
385 "this agent is paused after reaching its spend cap",
386 ));
387 }
388 if guard
389 .agent_creation_reviews
390 .get(agent_id)
391 .is_some_and(|summary| summary.review_required)
392 {
393 return Err(GovernanceError::too_many_requests(
394 "AUTOMATION_V2_AGENT_REVIEW_REQUIRED",
395 format!(
396 "agent {} must be reviewed before creating additional automations",
397 agent_id
398 ),
399 ));
400 }
401 self.validate_declared_capabilities_for_agent(
402 &guard,
403 agent_id,
404 declared_capabilities,
405 None,
406 )?;
407 if provenance.depth > limits.lineage_depth_limit {
408 return Err(GovernanceError::forbidden(
409 "AUTOMATION_V2_LINEAGE_DEPTH_EXCEEDED",
410 format!(
411 "lineage depth {} exceeds configured limit {}",
412 provenance.depth, limits.lineage_depth_limit
413 ),
414 ));
415 }
416 let window_start = now_ms().saturating_sub(24 * 60 * 60 * 1000);
417 let created_today = guard
418 .records
419 .values()
420 .filter(|record| {
421 record.deleted_at_ms.is_none()
422 && record.provenance.creator.kind == GovernanceActorKind::Agent
423 && record
424 .provenance
425 .creator
426 .actor_id
427 .as_deref()
428 .is_some_and(|value| value == agent_id)
429 && record.created_at_ms >= window_start
430 })
431 .count() as u64;
432 if created_today >= limits.per_agent_daily_creation_limit {
433 return Err(GovernanceError::too_many_requests(
434 "AUTOMATION_V2_AGENT_DAILY_QUOTA_EXCEEDED",
435 format!(
436 "agent {} has reached the daily automation creation quota",
437 agent_id
438 ),
439 ));
440 }
441 let active_agent_created = guard
442 .records
443 .values()
444 .filter(|record| {
445 record.deleted_at_ms.is_none()
446 && record.provenance.creator.kind == GovernanceActorKind::Agent
447 })
448 .count() as u64;
449 if active_agent_created >= limits.active_agent_automation_cap {
450 return Err(GovernanceError::too_many_requests(
451 "AUTOMATION_V2_AGENT_CAP_EXCEEDED",
452 "tenant has reached the active agent-authored automation cap",
453 ));
454 }
455 }
456 Ok(())
457 }
458
459 fn validate_declared_capabilities_for_agent(
460 &self,
461 guard: &GovernanceState,
462 agent_id: &str,
463 declared_capabilities: &AutomationDeclaredCapabilities,
464 previous_capabilities: Option<&AutomationDeclaredCapabilities>,
465 ) -> Result<(), GovernanceError> {
466 let previous = previous_capabilities.cloned().unwrap_or_default();
467 for capability in declared_capabilities.escalates_from(&previous) {
468 if !guard.has_approved_agent_capability(agent_id, capability) {
469 return Err(GovernanceError::forbidden(
470 "AUTOMATION_V2_CAPABILITY_ESCALATION_FORBIDDEN",
471 format!(
472 "agent {} lacks approval for capability {}",
473 agent_id, capability
474 ),
475 ));
476 }
477 }
478 Ok(())
479 }
480
481 pub async fn can_escalate_declared_capabilities(
482 &self,
483 actor: &GovernanceActorRef,
484 previous: &AutomationDeclaredCapabilities,
485 next: &AutomationDeclaredCapabilities,
486 ) -> Result<(), GovernanceError> {
487 if matches!(actor.kind, GovernanceActorKind::Human) {
488 return Ok(());
489 }
490 let Some(agent_id) = actor.actor_id.as_deref() else {
491 return Err(GovernanceError::forbidden(
492 "AUTOMATION_V2_AGENT_ID_REQUIRED",
493 "agent automation requests require an agent identifier",
494 ));
495 };
496 let guard = self.automation_governance.read().await;
497 self.validate_declared_capabilities_for_agent(&guard, agent_id, next, Some(previous))
498 }
499
500 pub async fn can_mutate_automation(
501 &self,
502 automation_id: &str,
503 actor: &GovernanceActorRef,
504 destructive: bool,
505 ) -> Result<AutomationGovernanceRecord, GovernanceError> {
506 let guard = self.automation_governance.read().await;
507 let Some(record) = guard.records.get(automation_id).cloned() else {
508 return Err(GovernanceError::forbidden(
509 "AUTOMATION_V2_GOVERNANCE_MISSING",
510 "automation governance record not found",
511 ));
512 };
513 if matches!(actor.kind, GovernanceActorKind::Human) {
514 return Ok(record);
515 }
516 let Some(actor_id) = actor.actor_id.as_deref() else {
517 return Err(GovernanceError::forbidden(
518 "AUTOMATION_V2_AGENT_ID_REQUIRED",
519 "agent automation requests require an agent identifier",
520 ));
521 };
522 if record.retired_at_ms.is_some() {
523 return Err(GovernanceError::forbidden(
524 "AUTOMATION_V2_RETIRED",
525 "retired automations are not mutable by agents",
526 ));
527 }
528 if record.expired_at_ms.is_some() && record.paused_for_lifecycle {
529 return Err(GovernanceError::forbidden(
530 "AUTOMATION_V2_EXPIRED",
531 "expired automations are paused pending human review",
532 ));
533 }
534 if record.paused_for_lifecycle {
535 return Err(GovernanceError::forbidden(
536 "AUTOMATION_V2_LIFECYCLE_PAUSED",
537 "paused automations are not mutable by agents",
538 ));
539 }
540 if destructive {
541 if record.provenance.creator.kind != GovernanceActorKind::Agent {
542 return Err(GovernanceError::forbidden(
543 "AUTOMATION_V2_DELETE_HUMAN_CREATED_DENIED",
544 "agents cannot delete human-created automations",
545 ));
546 }
547 if record.provenance.creator.actor_id.as_deref() != Some(actor_id) {
548 return Err(GovernanceError::forbidden(
549 "AUTOMATION_V2_DELETE_NOT_OWNER",
550 "agents can only delete automations they created",
551 ));
552 }
553 return Ok(record);
554 }
555 if record.provenance.creator.kind == GovernanceActorKind::Agent
556 && record.provenance.creator.actor_id.as_deref() == Some(actor_id)
557 {
558 return Ok(record);
559 }
560 if record.has_modify_grant(actor_id) {
561 return Ok(record);
562 }
563 Err(GovernanceError::forbidden(
564 "AUTOMATION_V2_MODIFY_FORBIDDEN",
565 "agent lacks modify rights for this automation",
566 ))
567 }
568
569 pub async fn record_automation_creation(
570 &self,
571 automation: &crate::AutomationV2Spec,
572 provenance: AutomationProvenanceRecord,
573 ) -> anyhow::Result<AutomationGovernanceRecord> {
574 let mut record = AutomationGovernanceRecord {
575 automation_id: automation.automation_id.clone(),
576 provenance,
577 declared_capabilities: declared_capabilities_for_automation(automation),
578 modify_grants: Vec::new(),
579 capability_grants: Vec::new(),
580 created_at_ms: automation.created_at_ms,
581 updated_at_ms: now_ms(),
582 deleted_at_ms: None,
583 delete_retention_until_ms: None,
584 published_externally: false,
585 creation_paused: false,
586 review_required: false,
587 review_kind: None,
588 review_requested_at_ms: None,
589 review_request_id: None,
590 last_reviewed_at_ms: None,
591 runs_since_review: 0,
592 expires_at_ms: None,
593 expired_at_ms: None,
594 retired_at_ms: None,
595 retire_reason: None,
596 paused_for_lifecycle: false,
597 health_last_checked_at_ms: None,
598 health_findings: Vec::new(),
599 };
600 if record.expires_at_ms.is_none()
601 && record.provenance.creator.kind == GovernanceActorKind::Agent
602 {
603 let default_expires_after_ms = self
604 .automation_governance
605 .read()
606 .await
607 .limits
608 .default_expires_after_ms;
609 if default_expires_after_ms > 0 {
610 record.expires_at_ms = Some(now_ms().saturating_add(default_expires_after_ms));
611 }
612 }
613 let stored = self.upsert_automation_governance(record).await?;
614 if let Some(agent_id) = stored
615 .provenance
616 .creator
617 .actor_id
618 .as_deref()
619 .filter(|_| stored.provenance.creator.kind == GovernanceActorKind::Agent)
620 {
621 let _ = self
622 .record_agent_creation_review_progress(agent_id, &stored.automation_id)
623 .await;
624 }
625 Ok(stored)
626 }
627
628 pub async fn grant_automation_modify_access(
629 &self,
630 automation_id: &str,
631 granted_to: GovernanceActorRef,
632 granted_by: GovernanceActorRef,
633 reason: Option<String>,
634 ) -> anyhow::Result<AutomationGrantRecord> {
635 let grant = {
636 let mut guard = self.automation_governance.write().await;
637 let grant = {
638 let Some(record) = guard.records.get_mut(automation_id) else {
639 anyhow::bail!("automation governance record not found");
640 };
641 let grant = AutomationGrantRecord {
642 grant_id: format!("grant-{}", Uuid::new_v4()),
643 automation_id: automation_id.to_string(),
644 grant_kind: AutomationGrantKind::Modify,
645 granted_to,
646 granted_by,
647 capability_key: None,
648 created_at_ms: now_ms(),
649 revoked_at_ms: None,
650 revoke_reason: reason,
651 };
652 record.modify_grants.push(grant.clone());
653 record.updated_at_ms = now_ms();
654 grant
655 };
656 guard.updated_at_ms = now_ms();
657 grant
658 };
659 self.persist_automation_governance().await?;
660 let _ = append_protected_audit_event(
661 self,
662 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.grant.created"),
663 &tandem_types::TenantContext::local_implicit(),
664 grant
665 .granted_by
666 .actor_id
667 .clone()
668 .or_else(|| grant.granted_by.source.clone()),
669 json!({
670 "automationID": automation_id,
671 "grant": grant,
672 }),
673 )
674 .await;
675 Ok(grant)
676 }
677
678 pub async fn revoke_automation_modify_access(
679 &self,
680 automation_id: &str,
681 grant_id: &str,
682 revoked_by: GovernanceActorRef,
683 reason: Option<String>,
684 ) -> anyhow::Result<Option<AutomationGrantRecord>> {
685 let stored = {
686 let mut guard = self.automation_governance.write().await;
687 let stored = {
688 let Some(record) = guard.records.get_mut(automation_id) else {
689 anyhow::bail!("automation governance record not found");
690 };
691 let Some(grant) = record
692 .modify_grants
693 .iter_mut()
694 .find(|grant| grant.grant_id == grant_id && grant.revoked_at_ms.is_none())
695 else {
696 return Ok(None);
697 };
698 grant.revoked_at_ms = Some(now_ms());
699 grant.revoke_reason = reason.clone();
700 record.updated_at_ms = now_ms();
701 grant.clone()
702 };
703 guard.updated_at_ms = now_ms();
704 stored
705 };
706 self.persist_automation_governance().await?;
707 let _ = append_protected_audit_event(
708 self,
709 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.grant.revoked"),
710 &tandem_types::TenantContext::local_implicit(),
711 revoked_by
712 .actor_id
713 .clone()
714 .or_else(|| revoked_by.source.clone()),
715 json!({
716 "automationID": automation_id,
717 "grantID": grant_id,
718 "reason": reason,
719 }),
720 )
721 .await;
722 Ok(Some(stored))
723 }
724
725 pub async fn request_approval(
726 &self,
727 request_type: GovernanceApprovalRequestType,
728 requested_by: GovernanceActorRef,
729 target_resource: GovernanceResourceRef,
730 rationale: String,
731 context: Value,
732 expires_at_ms: Option<u64>,
733 ) -> anyhow::Result<GovernanceApprovalRequest> {
734 let now = now_ms();
735 let approval_ttl_ms = self
736 .automation_governance
737 .read()
738 .await
739 .limits
740 .approval_ttl_ms;
741 let expires_at_ms = expires_at_ms.unwrap_or_else(|| now.saturating_add(approval_ttl_ms));
742 let request = GovernanceApprovalRequest {
743 approval_id: format!("apr_{}", Uuid::new_v4().simple()),
744 request_type,
745 requested_by,
746 target_resource,
747 rationale,
748 context,
749 status: GovernanceApprovalStatus::Pending,
750 expires_at_ms,
751 reviewed_by: None,
752 reviewed_at_ms: None,
753 review_notes: None,
754 created_at_ms: now,
755 updated_at_ms: now,
756 };
757 {
758 let mut guard = self.automation_governance.write().await;
759 guard
760 .approvals
761 .insert(request.approval_id.clone(), request.clone());
762 guard.updated_at_ms = now;
763 }
764 self.persist_automation_governance().await?;
765 let _ = append_protected_audit_event(
766 self,
767 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.approval.requested"),
768 &tandem_types::TenantContext::local_implicit(),
769 request
770 .requested_by
771 .actor_id
772 .clone()
773 .or_else(|| request.requested_by.source.clone()),
774 json!({
775 "approvalID": request.approval_id,
776 "request": request,
777 }),
778 )
779 .await;
780 Ok(request)
781 }
782
783 pub async fn list_approval_requests(
784 &self,
785 request_type: Option<GovernanceApprovalRequestType>,
786 status: Option<GovernanceApprovalStatus>,
787 ) -> Vec<GovernanceApprovalRequest> {
788 let mut rows = self
789 .automation_governance
790 .read()
791 .await
792 .approvals
793 .values()
794 .filter(|request| {
795 request_type
796 .map(|value| request.request_type == value)
797 .unwrap_or(true)
798 && status.map(|value| request.status == value).unwrap_or(true)
799 })
800 .cloned()
801 .collect::<Vec<_>>();
802 rows.sort_by(|a, b| b.updated_at_ms.cmp(&a.updated_at_ms));
803 rows
804 }
805
806 pub async fn decide_approval_request(
807 &self,
808 approval_id: &str,
809 reviewer: GovernanceActorRef,
810 approved: bool,
811 notes: Option<String>,
812 ) -> anyhow::Result<Option<GovernanceApprovalRequest>> {
813 let stored = {
814 let mut guard = self.automation_governance.write().await;
815 let stored = {
816 let Some(request) = guard.approvals.get_mut(approval_id) else {
817 return Ok(None);
818 };
819 if request.status != GovernanceApprovalStatus::Pending {
820 return Ok(Some(request.clone()));
821 }
822 let now = now_ms();
823 request.status = if approved {
824 GovernanceApprovalStatus::Approved
825 } else {
826 GovernanceApprovalStatus::Denied
827 };
828 request.reviewed_by = Some(reviewer.clone());
829 request.reviewed_at_ms = Some(now);
830 request.review_notes = notes.clone();
831 request.updated_at_ms = now;
832 request.clone()
833 };
834 guard.updated_at_ms = now_ms();
835 stored
836 };
837 self.persist_automation_governance().await?;
838 let _ = append_protected_audit_event(
839 self,
840 format!(
841 "{GOVERNANCE_AUDIT_EVENT_PREFIX}.approval.{}",
842 if approved { "approved" } else { "denied" }
843 ),
844 &tandem_types::TenantContext::local_implicit(),
845 reviewer
846 .actor_id
847 .clone()
848 .or_else(|| reviewer.source.clone()),
849 json!({
850 "approvalID": approval_id,
851 "approval": stored,
852 }),
853 )
854 .await;
855 Ok(Some(stored))
856 }
857
858 pub async fn delete_automation_v2_with_governance(
859 &self,
860 automation_id: &str,
861 deleted_by: GovernanceActorRef,
862 ) -> anyhow::Result<Option<crate::AutomationV2Spec>> {
863 let _guard = self.automations_v2_persistence.lock().await;
864 let removed = self.automations_v2.write().await.remove(automation_id);
865 if let Some(automation) = removed.clone() {
866 let now = now_ms();
867 {
868 let mut governance = self.automation_governance.write().await;
869 let record = governance
870 .records
871 .entry(automation_id.to_string())
872 .or_insert_with(|| AutomationGovernanceRecord {
873 automation_id: automation_id.to_string(),
874 provenance: default_human_provenance(
875 Some(automation.creator_id.clone()),
876 "delete_default",
877 ),
878 declared_capabilities: declared_capabilities_for_automation(&automation),
879 modify_grants: Vec::new(),
880 capability_grants: Vec::new(),
881 created_at_ms: automation.created_at_ms,
882 updated_at_ms: now,
883 deleted_at_ms: None,
884 delete_retention_until_ms: None,
885 published_externally: false,
886 creation_paused: false,
887 review_required: false,
888 review_kind: None,
889 review_requested_at_ms: None,
890 review_request_id: None,
891 last_reviewed_at_ms: None,
892 runs_since_review: 0,
893 expires_at_ms: None,
894 expired_at_ms: None,
895 retired_at_ms: None,
896 retire_reason: None,
897 paused_for_lifecycle: false,
898 health_last_checked_at_ms: None,
899 health_findings: Vec::new(),
900 });
901 record.deleted_at_ms = Some(now);
902 record.delete_retention_until_ms =
903 Some(now.saturating_add(7 * 24 * 60 * 60 * 1000));
904 record.updated_at_ms = now;
905 governance.deleted_automations.insert(
906 automation_id.to_string(),
907 DeletedAutomationRecord {
908 automation: automation.clone(),
909 deleted_at_ms: now,
910 deleted_by: deleted_by.clone(),
911 restore_until_ms: now.saturating_add(7 * 24 * 60 * 60 * 1000),
912 },
913 );
914 governance.updated_at_ms = now;
915 }
916 self.persist_automation_governance().await?;
917 self.persist_automations_v2_locked().await?;
918 let _ = append_protected_audit_event(
919 self,
920 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.deleted"),
921 &tandem_types::TenantContext::local_implicit(),
922 deleted_by
923 .actor_id
924 .clone()
925 .or_else(|| deleted_by.source.clone()),
926 json!({
927 "automationID": automation_id,
928 "deletedBy": deleted_by,
929 "deletedAtMs": now,
930 }),
931 )
932 .await;
933 }
934 Ok(removed)
935 }
936
937 pub async fn restore_deleted_automation_v2(
938 &self,
939 automation_id: &str,
940 ) -> anyhow::Result<Option<crate::AutomationV2Spec>> {
941 let restored = {
942 let mut governance = self.automation_governance.write().await;
943 let Some(deleted) = governance.deleted_automations.remove(automation_id) else {
944 return Ok(None);
945 };
946 let automation = deleted.automation.clone();
947 self.automations_v2
948 .write()
949 .await
950 .insert(automation_id.to_string(), automation.clone());
951 if let Some(record) = governance.records.get_mut(automation_id) {
952 record.deleted_at_ms = None;
953 record.delete_retention_until_ms = None;
954 record.updated_at_ms = now_ms();
955 }
956 governance.updated_at_ms = now_ms();
957 automation
958 };
959 self.persist_automation_governance().await?;
960 self.persist_automations_v2().await?;
961 Ok(Some(restored))
962 }
963
964 pub async fn agent_spend_summary(&self, agent_id: &str) -> Option<AgentSpendSummary> {
965 self.automation_governance
966 .read()
967 .await
968 .agent_spend_summary(agent_id)
969 }
970
971 pub async fn list_agent_spend_summaries(&self) -> Vec<AgentSpendSummary> {
972 self.automation_governance
973 .read()
974 .await
975 .agent_spend_summaries()
976 }
977
978 pub async fn agent_creation_review_summary(
979 &self,
980 agent_id: &str,
981 ) -> Option<AgentCreationReviewSummary> {
982 self.automation_governance
983 .read()
984 .await
985 .agent_creation_review_summary(agent_id)
986 }
987
988 pub async fn list_agent_creation_review_summaries(&self) -> Vec<AgentCreationReviewSummary> {
989 self.automation_governance
990 .read()
991 .await
992 .agent_creation_review_summaries()
993 }
994
995 pub async fn record_agent_creation_review_progress(
996 &self,
997 agent_id: &str,
998 automation_id: &str,
999 ) -> anyhow::Result<()> {
1000 let now = now_ms();
1001 let (created_since_review, threshold, should_request) = {
1002 let mut guard = self.automation_governance.write().await;
1003 let threshold = guard.limits.per_agent_creation_review_threshold;
1004 let (created_since_review, should_request) = {
1005 let summary = guard
1006 .agent_creation_reviews
1007 .entry(agent_id.to_string())
1008 .or_insert_with(|| AgentCreationReviewSummary::new(agent_id.to_string(), now));
1009 summary.created_since_review = summary.created_since_review.saturating_add(1);
1010 summary.updated_at_ms = now;
1011 let should_request = threshold > 0
1012 && summary.created_since_review >= threshold
1013 && !summary.review_required;
1014 if should_request {
1015 summary.review_required = true;
1016 summary.review_kind = Some(AutomationLifecycleReviewKind::CreationQuota);
1017 summary.review_requested_at_ms = Some(now);
1018 }
1019 (summary.created_since_review, should_request)
1020 };
1021 guard.updated_at_ms = now;
1022 (created_since_review, threshold, should_request)
1023 };
1024 self.persist_automation_governance().await?;
1025 if should_request {
1026 let _ = self
1027 .request_approval(
1028 GovernanceApprovalRequestType::LifecycleReview,
1029 GovernanceActorRef::system("automation_creation_review"),
1030 GovernanceResourceRef {
1031 resource_type: "agent".to_string(),
1032 id: agent_id.to_string(),
1033 },
1034 format!(
1035 "Human acknowledgment required after agent {agent_id} created {created_since_review} automations"
1036 ),
1037 json!({
1038 "trigger": "creation_quota",
1039 "agentID": agent_id,
1040 "automationID": automation_id,
1041 "createdSinceReview": created_since_review,
1042 "creationReviewThreshold": threshold,
1043 }),
1044 None,
1045 )
1046 .await;
1047 }
1048 Ok(())
1049 }
1050
1051 pub async fn acknowledge_agent_creation_review(
1052 &self,
1053 agent_id: &str,
1054 reviewer: GovernanceActorRef,
1055 notes: Option<String>,
1056 ) -> anyhow::Result<()> {
1057 let now = now_ms();
1058 {
1059 let mut guard = self.automation_governance.write().await;
1060 let summary = guard
1061 .agent_creation_reviews
1062 .entry(agent_id.to_string())
1063 .or_insert_with(|| AgentCreationReviewSummary::new(agent_id.to_string(), now));
1064 summary.created_since_review = 0;
1065 summary.review_required = false;
1066 summary.review_kind = None;
1067 summary.review_requested_at_ms = None;
1068 summary.review_request_id = None;
1069 summary.last_reviewed_at_ms = Some(now);
1070 summary.last_review_notes = notes.clone();
1071 summary.updated_at_ms = now;
1072 guard.updated_at_ms = now;
1073 }
1074 self.persist_automation_governance().await?;
1075 let _ = append_protected_audit_event(
1076 self,
1077 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.review.agent_acknowledged"),
1078 &tandem_types::TenantContext::local_implicit(),
1079 reviewer
1080 .actor_id
1081 .clone()
1082 .or_else(|| reviewer.source.clone()),
1083 json!({
1084 "agentID": agent_id,
1085 "reviewer": reviewer,
1086 "notes": notes,
1087 }),
1088 )
1089 .await;
1090 Ok(())
1091 }
1092
1093 pub async fn acknowledge_automation_review(
1094 &self,
1095 automation_id: &str,
1096 reviewer: GovernanceActorRef,
1097 notes: Option<String>,
1098 ) -> anyhow::Result<Option<AutomationGovernanceRecord>> {
1099 let stored = {
1100 let mut guard = self.automation_governance.write().await;
1101 let stored = {
1102 let Some(record) = guard.records.get_mut(automation_id) else {
1103 return Ok(None);
1104 };
1105 let now = now_ms();
1106 record.review_required = false;
1107 record.review_kind = None;
1108 record.review_requested_at_ms = None;
1109 record.review_request_id = None;
1110 record.last_reviewed_at_ms = Some(now);
1111 record.runs_since_review = 0;
1112 record.health_findings.clear();
1113 record.health_last_checked_at_ms = Some(now);
1114 record.updated_at_ms = now;
1115 record.clone()
1116 };
1117 guard.updated_at_ms = now_ms();
1118 stored
1119 };
1120 self.persist_automation_governance().await?;
1121 let _ = append_protected_audit_event(
1122 self,
1123 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.review.automation_acknowledged"),
1124 &tandem_types::TenantContext::local_implicit(),
1125 reviewer
1126 .actor_id
1127 .clone()
1128 .or_else(|| reviewer.source.clone()),
1129 json!({
1130 "automationID": automation_id,
1131 "reviewer": reviewer,
1132 "notes": notes,
1133 }),
1134 )
1135 .await;
1136 Ok(Some(stored))
1137 }
1138
1139 pub async fn pause_automation_for_dependency_revocation(
1140 &self,
1141 automation_id: &str,
1142 reason: String,
1143 evidence: Value,
1144 ) -> anyhow::Result<()> {
1145 let Some(automation) = self.get_automation_v2(automation_id).await else {
1146 anyhow::bail!("automation not found");
1147 };
1148 let now = now_ms();
1149 let paused_runs = self
1150 .pause_running_automation_v2_runs(
1151 automation_id,
1152 reason.clone(),
1153 crate::AutomationStopKind::GuardrailStopped,
1154 )
1155 .await;
1156
1157 let dependency_context = json!({
1158 "trigger": "dependency_revoked",
1159 "reason": reason.clone(),
1160 "evidence": evidence,
1161 "pausedRunIDs": paused_runs.clone(),
1162 });
1163 let finding = AutomationLifecycleFinding {
1164 finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1165 kind: AutomationLifecycleReviewKind::DependencyRevoked,
1166 severity: AutomationLifecycleFindingSeverity::Critical,
1167 summary: "automation paused after dependency revocation".to_string(),
1168 detail: Some(
1169 "an owned grant or connected MCP capability was removed and the automation was paused pending review"
1170 .to_string(),
1171 ),
1172 observed_at_ms: now,
1173 automation_run_id: None,
1174 approval_id: None,
1175 evidence: Some(dependency_context.clone()),
1176 };
1177
1178 let pending_review_id = {
1179 let guard = self.automation_governance.read().await;
1180 guard
1181 .approvals
1182 .values()
1183 .filter(|request| {
1184 request.request_type == GovernanceApprovalRequestType::LifecycleReview
1185 && request.status == GovernanceApprovalStatus::Pending
1186 && request.target_resource.resource_type == "automation"
1187 && request.target_resource.id == automation_id
1188 })
1189 .max_by_key(|request| request.updated_at_ms)
1190 .map(|request| request.approval_id.clone())
1191 };
1192
1193 {
1194 let mut guard = self.automation_governance.write().await;
1195 let record = guard
1196 .records
1197 .entry(automation_id.to_string())
1198 .or_insert_with(|| AutomationGovernanceRecord {
1199 automation_id: automation_id.to_string(),
1200 provenance: default_human_provenance(
1201 Some(automation.creator_id.clone()),
1202 "dependency_revocation_default",
1203 ),
1204 declared_capabilities: declared_capabilities_for_automation(&automation),
1205 modify_grants: Vec::new(),
1206 capability_grants: Vec::new(),
1207 created_at_ms: automation.created_at_ms,
1208 updated_at_ms: now,
1209 deleted_at_ms: None,
1210 delete_retention_until_ms: None,
1211 published_externally: false,
1212 creation_paused: false,
1213 review_required: false,
1214 review_kind: None,
1215 review_requested_at_ms: None,
1216 review_request_id: None,
1217 last_reviewed_at_ms: None,
1218 runs_since_review: 0,
1219 expires_at_ms: None,
1220 expired_at_ms: None,
1221 retired_at_ms: None,
1222 retire_reason: None,
1223 paused_for_lifecycle: false,
1224 health_last_checked_at_ms: None,
1225 health_findings: Vec::new(),
1226 });
1227 record.declared_capabilities = declared_capabilities_for_automation(&automation);
1228 record.paused_for_lifecycle = true;
1229 record.review_required = true;
1230 record.review_kind = Some(AutomationLifecycleReviewKind::DependencyRevoked);
1231 record.review_requested_at_ms = Some(now);
1232 record.review_request_id = pending_review_id.clone();
1233 record.health_last_checked_at_ms = Some(now);
1234 record.health_findings.push(finding.clone());
1235 record.updated_at_ms = now;
1236 guard.updated_at_ms = now;
1237 }
1238 self.persist_automation_governance().await?;
1239
1240 let mut created_review_id = pending_review_id;
1241 if created_review_id.is_none() {
1242 if let Ok(approval) = self
1243 .request_approval(
1244 GovernanceApprovalRequestType::LifecycleReview,
1245 GovernanceActorRef::system("automation_dependency_revocation"),
1246 GovernanceResourceRef {
1247 resource_type: "automation".to_string(),
1248 id: automation_id.to_string(),
1249 },
1250 format!(
1251 "Human review required after dependency revocation paused automation {automation_id}"
1252 ),
1253 dependency_context.clone(),
1254 None,
1255 )
1256 .await
1257 {
1258 created_review_id = Some(approval.approval_id.clone());
1259 {
1260 let mut guard = self.automation_governance.write().await;
1261 if let Some(record) = guard.records.get_mut(automation_id) {
1262 record.review_request_id = created_review_id.clone();
1263 record.updated_at_ms = now_ms();
1264 }
1265 guard.updated_at_ms = now_ms();
1266 }
1267 self.persist_automation_governance().await?;
1268 }
1269 }
1270
1271 let _ = append_protected_audit_event(
1272 self,
1273 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.dependency_revoked"),
1274 &tandem_types::TenantContext::local_implicit(),
1275 Some("automation_dependency_revocation".to_string()),
1276 json!({
1277 "automationID": automation_id,
1278 "reason": reason,
1279 "pausedRunIDs": paused_runs,
1280 "evidence": dependency_context.clone(),
1281 "reviewRequestID": created_review_id,
1282 }),
1283 )
1284 .await;
1285
1286 Ok(())
1287 }
1288
1289 async fn pause_running_automation_v2_runs(
1290 &self,
1291 automation_id: &str,
1292 reason: String,
1293 stop_kind: crate::AutomationStopKind,
1294 ) -> Vec<String> {
1295 let runs = self.list_automation_v2_runs(Some(automation_id), 100).await;
1296 let mut paused_runs = Vec::new();
1297 for run in runs {
1298 if run.status != crate::AutomationRunStatus::Running {
1299 continue;
1300 }
1301 let session_ids = run.active_session_ids.clone();
1302 let instance_ids = run.active_instance_ids.clone();
1303 let _ = self
1304 .update_automation_v2_run(&run.run_id, |row| {
1305 row.status = crate::AutomationRunStatus::Pausing;
1306 row.pause_reason = Some(reason.clone());
1307 })
1308 .await;
1309 for session_id in &session_ids {
1310 let _ = self.cancellations.cancel(session_id).await;
1311 }
1312 for instance_id in instance_ids {
1313 let _ = self
1314 .agent_teams
1315 .cancel_instance(self, &instance_id, &reason)
1316 .await;
1317 }
1318 self.forget_automation_v2_sessions(&session_ids).await;
1319 let _ = self
1320 .update_automation_v2_run(&run.run_id, |row| {
1321 row.status = crate::AutomationRunStatus::Paused;
1322 row.active_session_ids.clear();
1323 row.active_instance_ids.clear();
1324 row.pause_reason = Some(reason.clone());
1325 row.stop_kind = Some(stop_kind.clone());
1326 row.stop_reason = Some(reason.clone());
1327 crate::app::state::automation::lifecycle::record_automation_lifecycle_event(
1328 row,
1329 "run_paused_governance",
1330 Some(reason.clone()),
1331 Some(stop_kind.clone()),
1332 );
1333 })
1334 .await;
1335 paused_runs.push(run.run_id);
1336 }
1337 paused_runs
1338 }
1339
1340 pub async fn record_automation_review_progress(
1341 &self,
1342 automation_id: &str,
1343 reason: AutomationLifecycleReviewKind,
1344 run_id: Option<String>,
1345 detail: Option<String>,
1346 ) -> anyhow::Result<()> {
1347 let now = now_ms();
1348 let (should_request, review_count) = {
1349 let mut guard = self.automation_governance.write().await;
1350 let threshold = guard.limits.run_review_threshold;
1351 let (should_request, review_count) = {
1352 let Some(record) = guard.records.get_mut(automation_id) else {
1353 return Ok(());
1354 };
1355 record.runs_since_review = record.runs_since_review.saturating_add(1);
1356 record.health_last_checked_at_ms = Some(now);
1357 record.updated_at_ms = now;
1358 let should_request = threshold > 0
1359 && record.runs_since_review >= threshold
1360 && !record.review_required;
1361 if should_request {
1362 record.review_required = true;
1363 record.review_kind = Some(reason);
1364 record.review_requested_at_ms = Some(now);
1365 }
1366 (should_request, record.runs_since_review)
1367 };
1368 guard.updated_at_ms = now;
1369 (should_request, review_count)
1370 };
1371 self.persist_automation_governance().await?;
1372 if should_request {
1373 let _ = self
1374 .request_approval(
1375 GovernanceApprovalRequestType::LifecycleReview,
1376 GovernanceActorRef::system("automation_lifecycle_review"),
1377 GovernanceResourceRef {
1378 resource_type: "automation".to_string(),
1379 id: automation_id.to_string(),
1380 },
1381 format!(
1382 "Human review required after automation {automation_id} completed {review_count} runs without acknowledgment"
1383 ),
1384 json!({
1385 "trigger": "run_drift",
1386 "automationID": automation_id,
1387 "runID": run_id,
1388 "detail": detail,
1389 "runCountSinceReview": review_count,
1390 "reviewKind": "run_drift",
1391 }),
1392 None,
1393 )
1394 .await;
1395 }
1396 Ok(())
1397 }
1398
1399 pub async fn run_automation_governance_health_check(&self) -> anyhow::Result<usize> {
1400 let now = now_ms();
1401 let limits = self.automation_governance.read().await.limits.clone();
1402 let automations = self.list_automations_v2().await;
1403 let mut finding_count = 0usize;
1404
1405 for automation in automations {
1406 let runs = self
1407 .list_automation_v2_runs(
1408 Some(&automation.automation_id),
1409 limits.health_window_run_limit.max(5) as usize,
1410 )
1411 .await;
1412 let terminal_runs = runs
1413 .iter()
1414 .filter(|run| {
1415 matches!(
1416 run.status,
1417 crate::AutomationRunStatus::Completed
1418 | crate::AutomationRunStatus::Blocked
1419 | crate::AutomationRunStatus::Failed
1420 | crate::AutomationRunStatus::Cancelled
1421 )
1422 })
1423 .collect::<Vec<_>>();
1424 let failure_count = terminal_runs
1425 .iter()
1426 .filter(|run| {
1427 matches!(
1428 run.status,
1429 crate::AutomationRunStatus::Failed | crate::AutomationRunStatus::Blocked
1430 )
1431 })
1432 .count();
1433 let empty_output_count = terminal_runs
1434 .iter()
1435 .filter(|run| {
1436 run.status == crate::AutomationRunStatus::Completed
1437 && run.checkpoint.node_outputs.is_empty()
1438 })
1439 .count();
1440 let guardrail_stop_count = terminal_runs
1441 .iter()
1442 .filter(|run| run.stop_kind == Some(crate::AutomationStopKind::GuardrailStopped))
1443 .count();
1444
1445 let mut findings = Vec::new();
1446 let mut automation_expires_at_ms = None;
1447 if !terminal_runs.is_empty() {
1448 let failure_rate = failure_count as f64 / terminal_runs.len() as f64;
1449 if failure_rate >= limits.health_failure_rate_threshold && terminal_runs.len() >= 5
1450 {
1451 findings.push(AutomationLifecycleFinding {
1452 finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1453 kind: AutomationLifecycleReviewKind::HealthDrift,
1454 severity: if failure_rate >= 0.75 {
1455 AutomationLifecycleFindingSeverity::Critical
1456 } else {
1457 AutomationLifecycleFindingSeverity::Warning
1458 },
1459 summary: "high failure rate across recent runs".to_string(),
1460 detail: Some(format!(
1461 "{} of {} recent terminal runs failed or were blocked ({:.0}% failure rate)",
1462 failure_count,
1463 terminal_runs.len(),
1464 failure_rate * 100.0
1465 )),
1466 observed_at_ms: now,
1467 automation_run_id: terminal_runs.last().map(|run| run.run_id.clone()),
1468 approval_id: None,
1469 evidence: Some(json!({
1470 "failureCount": failure_count,
1471 "terminalRunCount": terminal_runs.len(),
1472 "failureRate": failure_rate,
1473 })),
1474 });
1475 }
1476 }
1477 if empty_output_count > 0 {
1478 findings.push(AutomationLifecycleFinding {
1479 finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1480 kind: AutomationLifecycleReviewKind::HealthDrift,
1481 severity: AutomationLifecycleFindingSeverity::Warning,
1482 summary: "completed runs emitted empty outputs".to_string(),
1483 detail: Some(format!(
1484 "{} recent completed runs produced no node outputs",
1485 empty_output_count
1486 )),
1487 observed_at_ms: now,
1488 automation_run_id: terminal_runs.last().map(|run| run.run_id.clone()),
1489 approval_id: None,
1490 evidence: Some(json!({
1491 "emptyOutputCount": empty_output_count,
1492 })),
1493 });
1494 }
1495 if guardrail_stop_count >= limits.health_guardrail_stop_threshold as usize
1496 && limits.health_guardrail_stop_threshold > 0
1497 {
1498 findings.push(AutomationLifecycleFinding {
1499 finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1500 kind: AutomationLifecycleReviewKind::HealthDrift,
1501 severity: AutomationLifecycleFindingSeverity::Warning,
1502 summary: "repeated guardrail stops detected".to_string(),
1503 detail: Some(format!(
1504 "{} recent terminal runs stopped on guardrails",
1505 guardrail_stop_count
1506 )),
1507 observed_at_ms: now,
1508 automation_run_id: terminal_runs.last().map(|run| run.run_id.clone()),
1509 approval_id: None,
1510 evidence: Some(json!({
1511 "guardrailStopCount": guardrail_stop_count,
1512 })),
1513 });
1514 }
1515
1516 let mut should_create_review_request = false;
1517 let mut should_create_retirement_request = false;
1518 let mut should_pause_expired = false;
1519 {
1520 let mut guard = self.automation_governance.write().await;
1521 let has_pending_lifecycle_review = guard.has_pending_approval_request(
1522 GovernanceApprovalRequestType::LifecycleReview,
1523 "automation",
1524 &automation.automation_id,
1525 );
1526 let has_pending_retirement_request = guard.has_pending_approval_request(
1527 GovernanceApprovalRequestType::RetirementAction,
1528 "automation",
1529 &automation.automation_id,
1530 );
1531 let Some(record) = guard.records.get_mut(&automation.automation_id) else {
1532 continue;
1533 };
1534 automation_expires_at_ms = record.expires_at_ms;
1535 record.health_last_checked_at_ms = Some(now);
1536 record.health_findings = findings.clone();
1537 if !findings.is_empty() {
1538 record.review_required = true;
1539 record.review_kind = Some(AutomationLifecycleReviewKind::HealthDrift);
1540 if record.review_requested_at_ms.is_none() {
1541 record.review_requested_at_ms = Some(now);
1542 }
1543 should_create_review_request = !has_pending_lifecycle_review;
1544 }
1545 if let Some(expires_at_ms) = record.expires_at_ms {
1546 if now >= expires_at_ms && record.expired_at_ms.is_none() {
1547 record.expired_at_ms = Some(now);
1548 record.review_required = true;
1549 record.review_kind = Some(AutomationLifecycleReviewKind::Expired);
1550 record.review_requested_at_ms = Some(now);
1551 record.paused_for_lifecycle = true;
1552 should_pause_expired = true;
1553 should_create_retirement_request = !has_pending_retirement_request;
1554 findings.push(AutomationLifecycleFinding {
1555 finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1556 kind: AutomationLifecycleReviewKind::Expired,
1557 severity: AutomationLifecycleFindingSeverity::Critical,
1558 summary: "automation has expired and was paused".to_string(),
1559 detail: Some(format!(
1560 "automation expired at {} and has been paused for human review",
1561 expires_at_ms
1562 )),
1563 observed_at_ms: now,
1564 automation_run_id: terminal_runs.last().map(|run| run.run_id.clone()),
1565 approval_id: None,
1566 evidence: Some(json!({
1567 "expiresAtMs": expires_at_ms,
1568 "expiredAtMs": now,
1569 })),
1570 });
1571 } else if expires_at_ms > now
1572 && expires_at_ms.saturating_sub(now) <= limits.expiration_warning_window_ms
1573 {
1574 record.review_required = true;
1575 record.review_kind = Some(AutomationLifecycleReviewKind::ExpirationSoon);
1576 if record.review_requested_at_ms.is_none() {
1577 record.review_requested_at_ms = Some(now);
1578 }
1579 should_create_retirement_request = !has_pending_retirement_request;
1580 findings.push(AutomationLifecycleFinding {
1581 finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1582 kind: AutomationLifecycleReviewKind::ExpirationSoon,
1583 severity: AutomationLifecycleFindingSeverity::Info,
1584 summary: "automation is approaching its expiration date".to_string(),
1585 detail: Some(format!(
1586 "automation expires in {}ms",
1587 expires_at_ms.saturating_sub(now)
1588 )),
1589 observed_at_ms: now,
1590 automation_run_id: None,
1591 approval_id: None,
1592 evidence: Some(json!({
1593 "expiresAtMs": expires_at_ms,
1594 "warningWindowMs": limits.expiration_warning_window_ms,
1595 })),
1596 });
1597 }
1598 }
1599 record.health_findings = findings.clone();
1600 record.updated_at_ms = now;
1601 guard.updated_at_ms = now;
1602 }
1603 self.persist_automation_governance().await?;
1604
1605 if should_pause_expired && automation.status != crate::AutomationV2Status::Paused {
1606 let mut paused = automation.clone();
1607 paused.status = crate::AutomationV2Status::Paused;
1608 let _ = self.put_automation_v2(paused).await;
1609 let _ = self
1610 .pause_running_automation_v2_runs(
1611 &automation.automation_id,
1612 format!(
1613 "automation expired after reaching {}ms retention",
1614 limits.default_expires_after_ms
1615 ),
1616 crate::AutomationStopKind::GuardrailStopped,
1617 )
1618 .await;
1619 }
1620
1621 if should_create_review_request {
1622 let _ = self
1623 .request_approval(
1624 GovernanceApprovalRequestType::LifecycleReview,
1625 GovernanceActorRef::system("automation_health_check"),
1626 GovernanceResourceRef {
1627 resource_type: "automation".to_string(),
1628 id: automation.automation_id.clone(),
1629 },
1630 format!(
1631 "Human review required after health check detected drift in automation {}",
1632 automation.automation_id
1633 ),
1634 json!({
1635 "trigger": "health_drift",
1636 "automationID": automation.automation_id,
1637 "findingCount": findings.len(),
1638 }),
1639 None,
1640 )
1641 .await;
1642 }
1643
1644 if should_create_retirement_request {
1645 let _ = self
1646 .request_approval(
1647 GovernanceApprovalRequestType::RetirementAction,
1648 GovernanceActorRef::system("automation_expiration"),
1649 GovernanceResourceRef {
1650 resource_type: "automation".to_string(),
1651 id: automation.automation_id.clone(),
1652 },
1653 format!(
1654 "Automation {} is expiring or has expired and needs operator action",
1655 automation.automation_id
1656 ),
1657 json!({
1658 "trigger": if should_pause_expired {
1659 "expired"
1660 } else {
1661 "expiration_soon"
1662 },
1663 "automationID": automation.automation_id,
1664 "expiresAtMs": automation_expires_at_ms,
1665 }),
1666 None,
1667 )
1668 .await;
1669 }
1670
1671 finding_count += findings.len();
1672 }
1673
1674 Ok(finding_count)
1675 }
1676
1677 pub async fn retire_automation_v2(
1678 &self,
1679 automation_id: &str,
1680 actor: GovernanceActorRef,
1681 reason: Option<String>,
1682 ) -> anyhow::Result<Option<crate::AutomationV2Spec>> {
1683 let Some(mut automation) = self.get_automation_v2(automation_id).await else {
1684 return Ok(None);
1685 };
1686 let now = now_ms();
1687 let reason = reason.unwrap_or_else(|| "retired by operator".to_string());
1688 automation.status = crate::AutomationV2Status::Paused;
1689 let stored = self.put_automation_v2(automation).await?;
1690 let _ = self
1691 .pause_running_automation_v2_runs(
1692 automation_id,
1693 reason.clone(),
1694 crate::AutomationStopKind::OperatorStopped,
1695 )
1696 .await;
1697 {
1698 let mut guard = self.automation_governance.write().await;
1699 let record = guard
1700 .records
1701 .entry(automation_id.to_string())
1702 .or_insert_with(|| AutomationGovernanceRecord {
1703 automation_id: automation_id.to_string(),
1704 provenance: default_human_provenance(
1705 Some(stored.creator_id.clone()),
1706 "retire_default",
1707 ),
1708 declared_capabilities: declared_capabilities_for_automation(&stored),
1709 modify_grants: Vec::new(),
1710 capability_grants: Vec::new(),
1711 created_at_ms: stored.created_at_ms,
1712 updated_at_ms: now,
1713 deleted_at_ms: None,
1714 delete_retention_until_ms: None,
1715 published_externally: false,
1716 creation_paused: false,
1717 review_required: false,
1718 review_kind: None,
1719 review_requested_at_ms: None,
1720 review_request_id: None,
1721 last_reviewed_at_ms: None,
1722 runs_since_review: 0,
1723 expires_at_ms: None,
1724 expired_at_ms: None,
1725 retired_at_ms: None,
1726 retire_reason: None,
1727 paused_for_lifecycle: false,
1728 health_last_checked_at_ms: None,
1729 health_findings: Vec::new(),
1730 });
1731 record.retired_at_ms = Some(now);
1732 record.retire_reason = Some(reason.clone());
1733 record.paused_for_lifecycle = true;
1734 record.review_required = false;
1735 record.review_kind = Some(AutomationLifecycleReviewKind::Retired);
1736 record.review_requested_at_ms = Some(now);
1737 record.updated_at_ms = now;
1738 guard.updated_at_ms = now;
1739 }
1740 self.persist_automation_governance().await?;
1741 let _ = append_protected_audit_event(
1742 self,
1743 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.retired"),
1744 &tandem_types::TenantContext::local_implicit(),
1745 actor.actor_id.clone().or_else(|| actor.source.clone()),
1746 json!({
1747 "automationID": automation_id,
1748 "reason": reason,
1749 "actor": actor,
1750 }),
1751 )
1752 .await;
1753 Ok(Some(stored))
1754 }
1755
1756 pub async fn extend_automation_v2_retirement(
1757 &self,
1758 automation_id: &str,
1759 actor: GovernanceActorRef,
1760 expires_at_ms: Option<u64>,
1761 reason: Option<String>,
1762 ) -> anyhow::Result<Option<crate::AutomationV2Spec>> {
1763 let Some(mut automation) = self.get_automation_v2(automation_id).await else {
1764 return Ok(None);
1765 };
1766 let now = now_ms();
1767 let default_expires_after_ms = self
1768 .automation_governance
1769 .read()
1770 .await
1771 .limits
1772 .default_expires_after_ms;
1773 let next_expires_at_ms =
1774 expires_at_ms.unwrap_or_else(|| now.saturating_add(default_expires_after_ms.max(1)));
1775 automation.status = crate::AutomationV2Status::Active;
1776 let stored = self.put_automation_v2(automation).await?;
1777 {
1778 let mut guard = self.automation_governance.write().await;
1779 let record = guard
1780 .records
1781 .entry(automation_id.to_string())
1782 .or_insert_with(|| AutomationGovernanceRecord {
1783 automation_id: automation_id.to_string(),
1784 provenance: default_human_provenance(
1785 Some(stored.creator_id.clone()),
1786 "extend_default",
1787 ),
1788 declared_capabilities: declared_capabilities_for_automation(&stored),
1789 modify_grants: Vec::new(),
1790 capability_grants: Vec::new(),
1791 created_at_ms: stored.created_at_ms,
1792 updated_at_ms: now,
1793 deleted_at_ms: None,
1794 delete_retention_until_ms: None,
1795 published_externally: false,
1796 creation_paused: false,
1797 review_required: false,
1798 review_kind: None,
1799 review_requested_at_ms: None,
1800 review_request_id: None,
1801 last_reviewed_at_ms: None,
1802 runs_since_review: 0,
1803 expires_at_ms: None,
1804 expired_at_ms: None,
1805 retired_at_ms: None,
1806 retire_reason: None,
1807 paused_for_lifecycle: false,
1808 health_last_checked_at_ms: None,
1809 health_findings: Vec::new(),
1810 });
1811 record.expires_at_ms = Some(next_expires_at_ms);
1812 record.expired_at_ms = None;
1813 record.retired_at_ms = None;
1814 record.retire_reason = None;
1815 record.paused_for_lifecycle = false;
1816 record.review_required = false;
1817 record.review_kind = None;
1818 record.review_requested_at_ms = None;
1819 record.review_request_id = None;
1820 record.last_reviewed_at_ms = Some(now);
1821 record.health_last_checked_at_ms = Some(now);
1822 record.updated_at_ms = now;
1823 guard.updated_at_ms = now;
1824 }
1825 self.persist_automation_governance().await?;
1826 let _ = append_protected_audit_event(
1827 self,
1828 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.retirement.extended"),
1829 &tandem_types::TenantContext::local_implicit(),
1830 actor.actor_id.clone().or_else(|| actor.source.clone()),
1831 json!({
1832 "automationID": automation_id,
1833 "expiresAtMs": next_expires_at_ms,
1834 "reason": reason,
1835 "actor": actor,
1836 }),
1837 )
1838 .await;
1839 Ok(Some(stored))
1840 }
1841
1842 pub async fn record_automation_v2_spend(
1843 &self,
1844 run_id: &str,
1845 prompt_tokens: u64,
1846 completion_tokens: u64,
1847 total_tokens: u64,
1848 delta_cost_usd: f64,
1849 ) -> anyhow::Result<()> {
1850 let Some(run_snapshot) = self.get_automation_v2_run(run_id).await else {
1851 return Ok(());
1852 };
1853 let automation = if let Some(snapshot) = run_snapshot.automation_snapshot.clone() {
1854 snapshot
1855 } else {
1856 let Some(automation) = self.get_automation_v2(&run_snapshot.automation_id).await else {
1857 return Ok(());
1858 };
1859 automation
1860 };
1861 let governance = self
1862 .get_or_bootstrap_automation_governance(&automation)
1863 .await;
1864 let agent_ids = governance.agent_lineage_ids();
1865 if agent_ids.is_empty() {
1866 return Ok(());
1867 }
1868
1869 let now = now_ms();
1870 let (weekly_cap, warning_threshold_ratio) = {
1871 let guard = self.automation_governance.read().await;
1872 (
1873 guard.limits.weekly_spend_cap_usd,
1874 guard.limits.spend_warning_threshold_ratio,
1875 )
1876 };
1877
1878 let mut warning_events: Vec<(String, f64, f64)> = Vec::new();
1879 let mut hard_stop_agents: Vec<(String, f64, f64)> = Vec::new();
1880 {
1881 let mut guard = self.automation_governance.write().await;
1882 for agent_id in &agent_ids {
1883 let has_override = guard.has_approved_agent_quota_override(agent_id);
1884 let mut hard_stop_entry: Option<(String, f64, f64)> = None;
1885 let summary = guard
1886 .agent_spend
1887 .entry(agent_id.clone())
1888 .or_insert_with(|| AgentSpendSummary::new(agent_id.clone(), now));
1889 summary.apply_usage(
1890 now,
1891 Some(&automation.automation_id),
1892 Some(run_id),
1893 prompt_tokens,
1894 completion_tokens,
1895 total_tokens,
1896 delta_cost_usd,
1897 );
1898 if let Some(limit) = weekly_cap {
1899 if summary.weekly_warning_threshold_reached(limit, warning_threshold_ratio)
1900 && summary.weekly.soft_warning_at_ms.is_none()
1901 {
1902 summary.weekly.soft_warning_at_ms = Some(now);
1903 warning_events.push((agent_id.clone(), summary.weekly.cost_usd, limit));
1904 }
1905 if summary.weekly_limit_reached(limit)
1906 && summary.weekly.hard_stop_at_ms.is_none()
1907 && !has_override
1908 {
1909 summary.weekly.hard_stop_at_ms = Some(now);
1910 summary.paused_at_ms = Some(now);
1911 summary.pause_reason =
1912 Some(format!("weekly spend cap {:.2} USD reached", limit));
1913 hard_stop_entry = Some((agent_id.clone(), summary.weekly.cost_usd, limit));
1914 }
1915 }
1916 if let Some((agent_id, cost_usd, limit_usd)) = hard_stop_entry {
1917 if !guard
1918 .spend_paused_agents
1919 .iter()
1920 .any(|value| value == &agent_id)
1921 {
1922 guard.spend_paused_agents.push(agent_id.clone());
1923 }
1924 hard_stop_agents.push((agent_id, cost_usd, limit_usd));
1925 }
1926 }
1927 guard.updated_at_ms = now;
1928 }
1929 self.persist_automation_governance().await?;
1930
1931 for (agent_id, cost_usd, limit_usd) in warning_events {
1932 let _ = append_protected_audit_event(
1933 self,
1934 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.spend.warning"),
1935 &tandem_types::TenantContext::local_implicit(),
1936 governance
1937 .provenance
1938 .creator
1939 .actor_id
1940 .clone()
1941 .or_else(|| Some(automation.creator_id.clone())),
1942 json!({
1943 "automationID": automation.automation_id,
1944 "runID": run_id,
1945 "agentID": agent_id,
1946 "weeklyCostUsd": cost_usd,
1947 "weeklySpendCapUsd": limit_usd,
1948 }),
1949 )
1950 .await;
1951 }
1952
1953 let mut requested_approvals = Vec::new();
1954 for (agent_id, cost_usd, limit_usd) in &hard_stop_agents {
1955 let guard = self.automation_governance.read().await;
1956 let has_override = guard.has_pending_agent_quota_override(agent_id)
1957 || guard.has_approved_agent_quota_override(agent_id);
1958 drop(guard);
1959 if has_override {
1960 continue;
1961 }
1962 if let Ok(approval) = self
1963 .request_approval(
1964 GovernanceApprovalRequestType::QuotaOverride,
1965 GovernanceActorRef::system("automation_spend_cap"),
1966 GovernanceResourceRef {
1967 resource_type: "agent".to_string(),
1968 id: agent_id.clone(),
1969 },
1970 format!(
1971 "Approve temporary quota override after agent {agent_id} reached weekly spend cap"
1972 ),
1973 json!({
1974 "automationID": automation.automation_id,
1975 "runID": run_id,
1976 "agentID": agent_id,
1977 "weeklyCostUsd": cost_usd,
1978 "weeklySpendCapUsd": limit_usd,
1979 "reason": "agent weekly spend cap exceeded",
1980 }),
1981 None,
1982 )
1983 .await
1984 {
1985 requested_approvals.push(approval.approval_id);
1986 }
1987 }
1988
1989 if !hard_stop_agents.is_empty() {
1990 let session_ids = run_snapshot.active_session_ids.clone();
1991 for session_id in &session_ids {
1992 let _ = self.cancellations.cancel(session_id).await;
1993 }
1994 self.forget_automation_v2_sessions(&session_ids).await;
1995 let instance_ids = run_snapshot.active_instance_ids.clone();
1996 for instance_id in instance_ids {
1997 let _ = self
1998 .agent_teams
1999 .cancel_instance(self, &instance_id, "paused by spend guardrail")
2000 .await;
2001 }
2002 let paused_agent_labels = hard_stop_agents
2003 .iter()
2004 .map(|(agent_id, cost_usd, limit_usd)| {
2005 format!("{agent_id} ({cost_usd:.4}/{limit_usd:.4} USD)")
2006 })
2007 .collect::<Vec<_>>()
2008 .join(", ");
2009 let detail = format!("weekly spend cap exceeded for {paused_agent_labels}");
2010 let _ = self
2011 .update_automation_v2_run(run_id, |row| {
2012 row.status = crate::AutomationRunStatus::Paused;
2013 row.detail = Some(detail.clone());
2014 row.pause_reason = Some(detail.clone());
2015 row.stop_kind = Some(crate::AutomationStopKind::GuardrailStopped);
2016 row.stop_reason = Some(detail.clone());
2017 row.active_session_ids.clear();
2018 row.latest_session_id = None;
2019 row.active_instance_ids.clear();
2020 crate::app::state::automation::lifecycle::record_automation_lifecycle_event(
2021 row,
2022 "run_paused_spend_cap_exceeded",
2023 Some(detail.clone()),
2024 Some(crate::AutomationStopKind::GuardrailStopped),
2025 );
2026 })
2027 .await;
2028 let _ = append_protected_audit_event(
2029 self,
2030 format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.spend.paused"),
2031 &tandem_types::TenantContext::local_implicit(),
2032 governance
2033 .provenance
2034 .creator
2035 .actor_id
2036 .clone()
2037 .or_else(|| Some(automation.creator_id.clone())),
2038 json!({
2039 "automationID": automation.automation_id,
2040 "runID": run_id,
2041 "pausedAgents": hard_stop_agents
2042 .iter()
2043 .map(|(agent_id, _, _)| agent_id)
2044 .cloned()
2045 .collect::<Vec<_>>(),
2046 "requestedApprovals": requested_approvals,
2047 "detail": detail,
2048 }),
2049 )
2050 .await;
2051 }
2052
2053 Ok(())
2054 }
2055}