Skip to main content

tandem_server/app/state/
governance.rs

1use std::collections::HashMap;
2
3use serde_json::json;
4use serde_json::Value;
5use tokio::fs;
6use uuid::Uuid;
7
8use crate::audit::append_protected_audit_event;
9use crate::automation_v2::governance::*;
10use crate::{now_ms, AppState};
11
12const GOVERNANCE_AUDIT_EVENT_PREFIX: &str = "automation.governance";
13
14fn default_human_provenance(
15    creator_id: Option<String>,
16    source: impl Into<String>,
17) -> AutomationProvenanceRecord {
18    AutomationProvenanceRecord::human(creator_id, source)
19}
20
21fn declared_capabilities_for_automation(
22    automation: &crate::AutomationV2Spec,
23) -> AutomationDeclaredCapabilities {
24    AutomationDeclaredCapabilities::from_metadata(automation.metadata.as_ref())
25}
26
27impl AppState {
28    pub async fn load_automation_governance(&self) -> anyhow::Result<()> {
29        if !self.automation_governance_path.exists() {
30            return Ok(());
31        }
32        let raw = fs::read_to_string(&self.automation_governance_path).await?;
33        let parsed = serde_json::from_str::<GovernanceState>(&raw).unwrap_or_default();
34        *self.automation_governance.write().await = parsed;
35        Ok(())
36    }
37
38    pub async fn persist_automation_governance(&self) -> anyhow::Result<()> {
39        if let Some(parent) = self.automation_governance_path.parent() {
40            fs::create_dir_all(parent).await?;
41        }
42        let payload = {
43            let guard = self.automation_governance.read().await;
44            serde_json::to_string_pretty(&*guard)?
45        };
46        fs::write(&self.automation_governance_path, payload).await?;
47        Ok(())
48    }
49
50    async fn persist_automation_governance_locked(&self) -> anyhow::Result<()> {
51        self.persist_automation_governance().await
52    }
53
54    pub async fn bootstrap_automation_governance(&self) -> anyhow::Result<usize> {
55        let automations = self.list_automations_v2().await;
56        let now = now_ms();
57        let mut inserted = 0usize;
58        {
59            let mut guard = self.automation_governance.write().await;
60            for automation in automations {
61                if guard.records.contains_key(&automation.automation_id) {
62                    continue;
63                }
64                guard.records.insert(
65                    automation.automation_id.clone(),
66                    AutomationGovernanceRecord {
67                        automation_id: automation.automation_id.clone(),
68                        provenance: default_human_provenance(
69                            Some(automation.creator_id.clone()),
70                            "migration_or_legacy_default",
71                        ),
72                        declared_capabilities: declared_capabilities_for_automation(&automation),
73                        modify_grants: Vec::new(),
74                        capability_grants: Vec::new(),
75                        created_at_ms: automation.created_at_ms.max(now),
76                        updated_at_ms: now,
77                        deleted_at_ms: None,
78                        delete_retention_until_ms: None,
79                        published_externally: false,
80                        creation_paused: false,
81                        review_required: false,
82                        review_kind: None,
83                        review_requested_at_ms: None,
84                        review_request_id: None,
85                        last_reviewed_at_ms: None,
86                        runs_since_review: 0,
87                        expires_at_ms: None,
88                        expired_at_ms: None,
89                        retired_at_ms: None,
90                        retire_reason: None,
91                        paused_for_lifecycle: false,
92                        health_last_checked_at_ms: None,
93                        health_findings: Vec::new(),
94                    },
95                );
96                inserted += 1;
97            }
98            guard.updated_at_ms = now;
99        }
100        if inserted > 0 {
101            self.persist_automation_governance().await?;
102        }
103        Ok(inserted)
104    }
105
106    pub async fn get_automation_governance(
107        &self,
108        automation_id: &str,
109    ) -> Option<AutomationGovernanceRecord> {
110        self.automation_governance
111            .read()
112            .await
113            .records
114            .get(automation_id)
115            .cloned()
116    }
117
118    pub async fn get_or_bootstrap_automation_governance(
119        &self,
120        automation: &crate::AutomationV2Spec,
121    ) -> AutomationGovernanceRecord {
122        if let Some(record) = self
123            .get_automation_governance(&automation.automation_id)
124            .await
125        {
126            return record;
127        }
128        let record = AutomationGovernanceRecord {
129            automation_id: automation.automation_id.clone(),
130            provenance: default_human_provenance(
131                Some(automation.creator_id.clone()),
132                "legacy_default",
133            ),
134            declared_capabilities: declared_capabilities_for_automation(automation),
135            modify_grants: Vec::new(),
136            capability_grants: Vec::new(),
137            created_at_ms: automation.created_at_ms,
138            updated_at_ms: now_ms(),
139            deleted_at_ms: None,
140            delete_retention_until_ms: None,
141            published_externally: false,
142            creation_paused: false,
143            review_required: false,
144            review_kind: None,
145            review_requested_at_ms: None,
146            review_request_id: None,
147            last_reviewed_at_ms: None,
148            runs_since_review: 0,
149            expires_at_ms: None,
150            expired_at_ms: None,
151            retired_at_ms: None,
152            retire_reason: None,
153            paused_for_lifecycle: false,
154            health_last_checked_at_ms: None,
155            health_findings: Vec::new(),
156        };
157        let _ = self.upsert_automation_governance(record.clone()).await;
158        record
159    }
160
161    pub async fn upsert_automation_governance(
162        &self,
163        mut record: AutomationGovernanceRecord,
164    ) -> anyhow::Result<AutomationGovernanceRecord> {
165        if record.automation_id.trim().is_empty() {
166            anyhow::bail!("automation_id is required");
167        }
168        let now = now_ms();
169        if record.created_at_ms == 0 {
170            record.created_at_ms = now;
171        }
172        record.updated_at_ms = now;
173        {
174            let mut guard = self.automation_governance.write().await;
175            guard
176                .records
177                .insert(record.automation_id.clone(), record.clone());
178            guard.updated_at_ms = now;
179        }
180        self.persist_automation_governance().await?;
181        let _ = append_protected_audit_event(
182            self,
183            format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.record.updated"),
184            &tandem_types::TenantContext::local_implicit(),
185            record
186                .provenance
187                .creator
188                .actor_id
189                .clone()
190                .or_else(|| record.provenance.creator.source.clone()),
191            json!({
192                "automationID": record.automation_id,
193                "provenance": record.provenance,
194                "declaredCapabilities": record.declared_capabilities,
195                "publishedExternally": record.published_externally,
196                "creationPaused": record.creation_paused,
197            }),
198        )
199        .await;
200        Ok(record)
201    }
202
203    pub async fn set_automation_governance_provenance(
204        &self,
205        automation_id: &str,
206        provenance: AutomationProvenanceRecord,
207    ) -> anyhow::Result<AutomationGovernanceRecord> {
208        let mut record = self
209            .get_automation_governance(automation_id)
210            .await
211            .unwrap_or_else(|| AutomationGovernanceRecord {
212                automation_id: automation_id.to_string(),
213                provenance: provenance.clone(),
214                declared_capabilities: AutomationDeclaredCapabilities::default(),
215                modify_grants: Vec::new(),
216                capability_grants: Vec::new(),
217                created_at_ms: now_ms(),
218                updated_at_ms: now_ms(),
219                deleted_at_ms: None,
220                delete_retention_until_ms: None,
221                published_externally: false,
222                creation_paused: false,
223                review_required: false,
224                review_kind: None,
225                review_requested_at_ms: None,
226                review_request_id: None,
227                last_reviewed_at_ms: None,
228                runs_since_review: 0,
229                expires_at_ms: None,
230                expired_at_ms: None,
231                retired_at_ms: None,
232                retire_reason: None,
233                paused_for_lifecycle: false,
234                health_last_checked_at_ms: None,
235                health_findings: Vec::new(),
236            });
237        record.provenance = provenance;
238        if record.expires_at_ms.is_none()
239            && record.provenance.creator.kind == GovernanceActorKind::Agent
240        {
241            let default_expires_after_ms = self
242                .automation_governance
243                .read()
244                .await
245                .limits
246                .default_expires_after_ms;
247            if default_expires_after_ms > 0 {
248                record.expires_at_ms = Some(now_ms().saturating_add(default_expires_after_ms));
249            }
250        }
251        let stored = self.upsert_automation_governance(record).await?;
252        if let Some(agent_id) = stored
253            .provenance
254            .creator
255            .actor_id
256            .as_deref()
257            .filter(|_| stored.provenance.creator.kind == GovernanceActorKind::Agent)
258        {
259            let _ = self
260                .record_agent_creation_review_progress(agent_id, &stored.automation_id)
261                .await;
262        }
263        Ok(stored)
264    }
265
266    pub async fn sync_automation_governance_from_spec(
267        &self,
268        automation: &crate::AutomationV2Spec,
269        provenance: Option<AutomationProvenanceRecord>,
270    ) -> anyhow::Result<AutomationGovernanceRecord> {
271        let now = now_ms();
272        let mut record = self
273            .get_automation_governance(&automation.automation_id)
274            .await
275            .unwrap_or_else(|| AutomationGovernanceRecord {
276                automation_id: automation.automation_id.clone(),
277                provenance: provenance.clone().unwrap_or_else(|| {
278                    default_human_provenance(Some(automation.creator_id.clone()), "sync_default")
279                }),
280                declared_capabilities: declared_capabilities_for_automation(automation),
281                modify_grants: Vec::new(),
282                capability_grants: Vec::new(),
283                created_at_ms: automation.created_at_ms,
284                updated_at_ms: now,
285                deleted_at_ms: None,
286                delete_retention_until_ms: None,
287                published_externally: false,
288                creation_paused: false,
289                review_required: false,
290                review_kind: None,
291                review_requested_at_ms: None,
292                review_request_id: None,
293                last_reviewed_at_ms: None,
294                runs_since_review: 0,
295                expires_at_ms: None,
296                expired_at_ms: None,
297                retired_at_ms: None,
298                retire_reason: None,
299                paused_for_lifecycle: false,
300                health_last_checked_at_ms: None,
301                health_findings: Vec::new(),
302            });
303        if let Some(provenance) = provenance {
304            record.provenance = provenance;
305        }
306        record.declared_capabilities = declared_capabilities_for_automation(automation);
307        if record.created_at_ms == 0 {
308            record.created_at_ms = automation.created_at_ms;
309        }
310        record.updated_at_ms = now;
311        {
312            let mut guard = self.automation_governance.write().await;
313            guard
314                .records
315                .insert(record.automation_id.clone(), record.clone());
316            guard.updated_at_ms = now;
317        }
318        self.persist_automation_governance().await?;
319        if let Some(agent_id) = record
320            .provenance
321            .creator
322            .actor_id
323            .as_deref()
324            .filter(|_| record.provenance.creator.kind == GovernanceActorKind::Agent)
325        {
326            let _ = self
327                .record_agent_creation_review_progress(agent_id, &record.automation_id)
328                .await;
329        }
330        Ok(record)
331    }
332
333    pub async fn pause_automation_creation_for_agent(
334        &self,
335        agent_id: &str,
336        paused: bool,
337    ) -> anyhow::Result<()> {
338        let mut guard = self.automation_governance.write().await;
339        if paused {
340            if !guard.paused_agents.iter().any(|value| value == agent_id) {
341                guard.paused_agents.push(agent_id.to_string());
342            }
343        } else {
344            guard.paused_agents.retain(|value| value != agent_id);
345        }
346        guard.updated_at_ms = now_ms();
347        drop(guard);
348        self.persist_automation_governance().await?;
349        Ok(())
350    }
351
352    pub async fn can_create_automation_for_actor(
353        &self,
354        actor: &GovernanceActorRef,
355        provenance: &AutomationProvenanceRecord,
356        declared_capabilities: &AutomationDeclaredCapabilities,
357    ) -> Result<(), GovernanceError> {
358        let guard = self.automation_governance.read().await;
359        let limits = &guard.limits;
360        if !limits.creation_enabled {
361            return Err(GovernanceError::forbidden(
362                "AUTOMATION_V2_CREATION_DISABLED",
363                "agent automation creation is disabled for this tenant",
364            ));
365        }
366        if matches!(actor.kind, GovernanceActorKind::Agent) {
367            let agent_id = actor.actor_id.as_deref().unwrap_or_default();
368            if agent_id.is_empty() {
369                return Err(GovernanceError::forbidden(
370                    "AUTOMATION_V2_AGENT_ID_REQUIRED",
371                    "agent automation creation requires an agent identifier",
372                ));
373            }
374            if guard.is_agent_paused(agent_id) {
375                return Err(GovernanceError::forbidden(
376                    "AUTOMATION_V2_AGENT_CREATION_PAUSED",
377                    "this agent is paused from creating automations",
378                ));
379            }
380            if guard.is_agent_spend_paused(agent_id)
381                && !guard.has_approved_agent_quota_override(agent_id)
382            {
383                return Err(GovernanceError::too_many_requests(
384                    "AUTOMATION_V2_AGENT_SPEND_CAP_EXCEEDED",
385                    "this agent is paused after reaching its spend cap",
386                ));
387            }
388            if guard
389                .agent_creation_reviews
390                .get(agent_id)
391                .is_some_and(|summary| summary.review_required)
392            {
393                return Err(GovernanceError::too_many_requests(
394                    "AUTOMATION_V2_AGENT_REVIEW_REQUIRED",
395                    format!(
396                        "agent {} must be reviewed before creating additional automations",
397                        agent_id
398                    ),
399                ));
400            }
401            self.validate_declared_capabilities_for_agent(
402                &guard,
403                agent_id,
404                declared_capabilities,
405                None,
406            )?;
407            if provenance.depth > limits.lineage_depth_limit {
408                return Err(GovernanceError::forbidden(
409                    "AUTOMATION_V2_LINEAGE_DEPTH_EXCEEDED",
410                    format!(
411                        "lineage depth {} exceeds configured limit {}",
412                        provenance.depth, limits.lineage_depth_limit
413                    ),
414                ));
415            }
416            let window_start = now_ms().saturating_sub(24 * 60 * 60 * 1000);
417            let created_today = guard
418                .records
419                .values()
420                .filter(|record| {
421                    record.deleted_at_ms.is_none()
422                        && record.provenance.creator.kind == GovernanceActorKind::Agent
423                        && record
424                            .provenance
425                            .creator
426                            .actor_id
427                            .as_deref()
428                            .is_some_and(|value| value == agent_id)
429                        && record.created_at_ms >= window_start
430                })
431                .count() as u64;
432            if created_today >= limits.per_agent_daily_creation_limit {
433                return Err(GovernanceError::too_many_requests(
434                    "AUTOMATION_V2_AGENT_DAILY_QUOTA_EXCEEDED",
435                    format!(
436                        "agent {} has reached the daily automation creation quota",
437                        agent_id
438                    ),
439                ));
440            }
441            let active_agent_created = guard
442                .records
443                .values()
444                .filter(|record| {
445                    record.deleted_at_ms.is_none()
446                        && record.provenance.creator.kind == GovernanceActorKind::Agent
447                })
448                .count() as u64;
449            if active_agent_created >= limits.active_agent_automation_cap {
450                return Err(GovernanceError::too_many_requests(
451                    "AUTOMATION_V2_AGENT_CAP_EXCEEDED",
452                    "tenant has reached the active agent-authored automation cap",
453                ));
454            }
455        }
456        Ok(())
457    }
458
459    fn validate_declared_capabilities_for_agent(
460        &self,
461        guard: &GovernanceState,
462        agent_id: &str,
463        declared_capabilities: &AutomationDeclaredCapabilities,
464        previous_capabilities: Option<&AutomationDeclaredCapabilities>,
465    ) -> Result<(), GovernanceError> {
466        let previous = previous_capabilities.cloned().unwrap_or_default();
467        for capability in declared_capabilities.escalates_from(&previous) {
468            if !guard.has_approved_agent_capability(agent_id, capability) {
469                return Err(GovernanceError::forbidden(
470                    "AUTOMATION_V2_CAPABILITY_ESCALATION_FORBIDDEN",
471                    format!(
472                        "agent {} lacks approval for capability {}",
473                        agent_id, capability
474                    ),
475                ));
476            }
477        }
478        Ok(())
479    }
480
481    pub async fn can_escalate_declared_capabilities(
482        &self,
483        actor: &GovernanceActorRef,
484        previous: &AutomationDeclaredCapabilities,
485        next: &AutomationDeclaredCapabilities,
486    ) -> Result<(), GovernanceError> {
487        if matches!(actor.kind, GovernanceActorKind::Human) {
488            return Ok(());
489        }
490        let Some(agent_id) = actor.actor_id.as_deref() else {
491            return Err(GovernanceError::forbidden(
492                "AUTOMATION_V2_AGENT_ID_REQUIRED",
493                "agent automation requests require an agent identifier",
494            ));
495        };
496        let guard = self.automation_governance.read().await;
497        self.validate_declared_capabilities_for_agent(&guard, agent_id, next, Some(previous))
498    }
499
500    pub async fn can_mutate_automation(
501        &self,
502        automation_id: &str,
503        actor: &GovernanceActorRef,
504        destructive: bool,
505    ) -> Result<AutomationGovernanceRecord, GovernanceError> {
506        let guard = self.automation_governance.read().await;
507        let Some(record) = guard.records.get(automation_id).cloned() else {
508            return Err(GovernanceError::forbidden(
509                "AUTOMATION_V2_GOVERNANCE_MISSING",
510                "automation governance record not found",
511            ));
512        };
513        if matches!(actor.kind, GovernanceActorKind::Human) {
514            return Ok(record);
515        }
516        let Some(actor_id) = actor.actor_id.as_deref() else {
517            return Err(GovernanceError::forbidden(
518                "AUTOMATION_V2_AGENT_ID_REQUIRED",
519                "agent automation requests require an agent identifier",
520            ));
521        };
522        if record.retired_at_ms.is_some() {
523            return Err(GovernanceError::forbidden(
524                "AUTOMATION_V2_RETIRED",
525                "retired automations are not mutable by agents",
526            ));
527        }
528        if record.expired_at_ms.is_some() && record.paused_for_lifecycle {
529            return Err(GovernanceError::forbidden(
530                "AUTOMATION_V2_EXPIRED",
531                "expired automations are paused pending human review",
532            ));
533        }
534        if record.paused_for_lifecycle {
535            return Err(GovernanceError::forbidden(
536                "AUTOMATION_V2_LIFECYCLE_PAUSED",
537                "paused automations are not mutable by agents",
538            ));
539        }
540        if destructive {
541            if record.provenance.creator.kind != GovernanceActorKind::Agent {
542                return Err(GovernanceError::forbidden(
543                    "AUTOMATION_V2_DELETE_HUMAN_CREATED_DENIED",
544                    "agents cannot delete human-created automations",
545                ));
546            }
547            if record.provenance.creator.actor_id.as_deref() != Some(actor_id) {
548                return Err(GovernanceError::forbidden(
549                    "AUTOMATION_V2_DELETE_NOT_OWNER",
550                    "agents can only delete automations they created",
551                ));
552            }
553            return Ok(record);
554        }
555        if record.provenance.creator.kind == GovernanceActorKind::Agent
556            && record.provenance.creator.actor_id.as_deref() == Some(actor_id)
557        {
558            return Ok(record);
559        }
560        if record.has_modify_grant(actor_id) {
561            return Ok(record);
562        }
563        Err(GovernanceError::forbidden(
564            "AUTOMATION_V2_MODIFY_FORBIDDEN",
565            "agent lacks modify rights for this automation",
566        ))
567    }
568
569    pub async fn record_automation_creation(
570        &self,
571        automation: &crate::AutomationV2Spec,
572        provenance: AutomationProvenanceRecord,
573    ) -> anyhow::Result<AutomationGovernanceRecord> {
574        let mut record = AutomationGovernanceRecord {
575            automation_id: automation.automation_id.clone(),
576            provenance,
577            declared_capabilities: declared_capabilities_for_automation(automation),
578            modify_grants: Vec::new(),
579            capability_grants: Vec::new(),
580            created_at_ms: automation.created_at_ms,
581            updated_at_ms: now_ms(),
582            deleted_at_ms: None,
583            delete_retention_until_ms: None,
584            published_externally: false,
585            creation_paused: false,
586            review_required: false,
587            review_kind: None,
588            review_requested_at_ms: None,
589            review_request_id: None,
590            last_reviewed_at_ms: None,
591            runs_since_review: 0,
592            expires_at_ms: None,
593            expired_at_ms: None,
594            retired_at_ms: None,
595            retire_reason: None,
596            paused_for_lifecycle: false,
597            health_last_checked_at_ms: None,
598            health_findings: Vec::new(),
599        };
600        if record.expires_at_ms.is_none()
601            && record.provenance.creator.kind == GovernanceActorKind::Agent
602        {
603            let default_expires_after_ms = self
604                .automation_governance
605                .read()
606                .await
607                .limits
608                .default_expires_after_ms;
609            if default_expires_after_ms > 0 {
610                record.expires_at_ms = Some(now_ms().saturating_add(default_expires_after_ms));
611            }
612        }
613        let stored = self.upsert_automation_governance(record).await?;
614        if let Some(agent_id) = stored
615            .provenance
616            .creator
617            .actor_id
618            .as_deref()
619            .filter(|_| stored.provenance.creator.kind == GovernanceActorKind::Agent)
620        {
621            let _ = self
622                .record_agent_creation_review_progress(agent_id, &stored.automation_id)
623                .await;
624        }
625        Ok(stored)
626    }
627
628    pub async fn grant_automation_modify_access(
629        &self,
630        automation_id: &str,
631        granted_to: GovernanceActorRef,
632        granted_by: GovernanceActorRef,
633        reason: Option<String>,
634    ) -> anyhow::Result<AutomationGrantRecord> {
635        let grant = {
636            let mut guard = self.automation_governance.write().await;
637            let grant = {
638                let Some(record) = guard.records.get_mut(automation_id) else {
639                    anyhow::bail!("automation governance record not found");
640                };
641                let grant = AutomationGrantRecord {
642                    grant_id: format!("grant-{}", Uuid::new_v4()),
643                    automation_id: automation_id.to_string(),
644                    grant_kind: AutomationGrantKind::Modify,
645                    granted_to,
646                    granted_by,
647                    capability_key: None,
648                    created_at_ms: now_ms(),
649                    revoked_at_ms: None,
650                    revoke_reason: reason,
651                };
652                record.modify_grants.push(grant.clone());
653                record.updated_at_ms = now_ms();
654                grant
655            };
656            guard.updated_at_ms = now_ms();
657            grant
658        };
659        self.persist_automation_governance().await?;
660        let _ = append_protected_audit_event(
661            self,
662            format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.grant.created"),
663            &tandem_types::TenantContext::local_implicit(),
664            grant
665                .granted_by
666                .actor_id
667                .clone()
668                .or_else(|| grant.granted_by.source.clone()),
669            json!({
670                "automationID": automation_id,
671                "grant": grant,
672            }),
673        )
674        .await;
675        Ok(grant)
676    }
677
678    pub async fn revoke_automation_modify_access(
679        &self,
680        automation_id: &str,
681        grant_id: &str,
682        revoked_by: GovernanceActorRef,
683        reason: Option<String>,
684    ) -> anyhow::Result<Option<AutomationGrantRecord>> {
685        let stored = {
686            let mut guard = self.automation_governance.write().await;
687            let stored = {
688                let Some(record) = guard.records.get_mut(automation_id) else {
689                    anyhow::bail!("automation governance record not found");
690                };
691                let Some(grant) = record
692                    .modify_grants
693                    .iter_mut()
694                    .find(|grant| grant.grant_id == grant_id && grant.revoked_at_ms.is_none())
695                else {
696                    return Ok(None);
697                };
698                grant.revoked_at_ms = Some(now_ms());
699                grant.revoke_reason = reason.clone();
700                record.updated_at_ms = now_ms();
701                grant.clone()
702            };
703            guard.updated_at_ms = now_ms();
704            stored
705        };
706        self.persist_automation_governance().await?;
707        let _ = append_protected_audit_event(
708            self,
709            format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.grant.revoked"),
710            &tandem_types::TenantContext::local_implicit(),
711            revoked_by
712                .actor_id
713                .clone()
714                .or_else(|| revoked_by.source.clone()),
715            json!({
716                "automationID": automation_id,
717                "grantID": grant_id,
718                "reason": reason,
719            }),
720        )
721        .await;
722        Ok(Some(stored))
723    }
724
725    pub async fn request_approval(
726        &self,
727        request_type: GovernanceApprovalRequestType,
728        requested_by: GovernanceActorRef,
729        target_resource: GovernanceResourceRef,
730        rationale: String,
731        context: Value,
732        expires_at_ms: Option<u64>,
733    ) -> anyhow::Result<GovernanceApprovalRequest> {
734        let now = now_ms();
735        let approval_ttl_ms = self
736            .automation_governance
737            .read()
738            .await
739            .limits
740            .approval_ttl_ms;
741        let expires_at_ms = expires_at_ms.unwrap_or_else(|| now.saturating_add(approval_ttl_ms));
742        let request = GovernanceApprovalRequest {
743            approval_id: format!("apr_{}", Uuid::new_v4().simple()),
744            request_type,
745            requested_by,
746            target_resource,
747            rationale,
748            context,
749            status: GovernanceApprovalStatus::Pending,
750            expires_at_ms,
751            reviewed_by: None,
752            reviewed_at_ms: None,
753            review_notes: None,
754            created_at_ms: now,
755            updated_at_ms: now,
756        };
757        {
758            let mut guard = self.automation_governance.write().await;
759            guard
760                .approvals
761                .insert(request.approval_id.clone(), request.clone());
762            guard.updated_at_ms = now;
763        }
764        self.persist_automation_governance().await?;
765        let _ = append_protected_audit_event(
766            self,
767            format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.approval.requested"),
768            &tandem_types::TenantContext::local_implicit(),
769            request
770                .requested_by
771                .actor_id
772                .clone()
773                .or_else(|| request.requested_by.source.clone()),
774            json!({
775                "approvalID": request.approval_id,
776                "request": request,
777            }),
778        )
779        .await;
780        Ok(request)
781    }
782
783    pub async fn list_approval_requests(
784        &self,
785        request_type: Option<GovernanceApprovalRequestType>,
786        status: Option<GovernanceApprovalStatus>,
787    ) -> Vec<GovernanceApprovalRequest> {
788        let mut rows = self
789            .automation_governance
790            .read()
791            .await
792            .approvals
793            .values()
794            .filter(|request| {
795                request_type
796                    .map(|value| request.request_type == value)
797                    .unwrap_or(true)
798                    && status.map(|value| request.status == value).unwrap_or(true)
799            })
800            .cloned()
801            .collect::<Vec<_>>();
802        rows.sort_by(|a, b| b.updated_at_ms.cmp(&a.updated_at_ms));
803        rows
804    }
805
806    pub async fn decide_approval_request(
807        &self,
808        approval_id: &str,
809        reviewer: GovernanceActorRef,
810        approved: bool,
811        notes: Option<String>,
812    ) -> anyhow::Result<Option<GovernanceApprovalRequest>> {
813        let stored = {
814            let mut guard = self.automation_governance.write().await;
815            let stored = {
816                let Some(request) = guard.approvals.get_mut(approval_id) else {
817                    return Ok(None);
818                };
819                if request.status != GovernanceApprovalStatus::Pending {
820                    return Ok(Some(request.clone()));
821                }
822                let now = now_ms();
823                request.status = if approved {
824                    GovernanceApprovalStatus::Approved
825                } else {
826                    GovernanceApprovalStatus::Denied
827                };
828                request.reviewed_by = Some(reviewer.clone());
829                request.reviewed_at_ms = Some(now);
830                request.review_notes = notes.clone();
831                request.updated_at_ms = now;
832                request.clone()
833            };
834            guard.updated_at_ms = now_ms();
835            stored
836        };
837        self.persist_automation_governance().await?;
838        let _ = append_protected_audit_event(
839            self,
840            format!(
841                "{GOVERNANCE_AUDIT_EVENT_PREFIX}.approval.{}",
842                if approved { "approved" } else { "denied" }
843            ),
844            &tandem_types::TenantContext::local_implicit(),
845            reviewer
846                .actor_id
847                .clone()
848                .or_else(|| reviewer.source.clone()),
849            json!({
850                "approvalID": approval_id,
851                "approval": stored,
852            }),
853        )
854        .await;
855        Ok(Some(stored))
856    }
857
858    pub async fn delete_automation_v2_with_governance(
859        &self,
860        automation_id: &str,
861        deleted_by: GovernanceActorRef,
862    ) -> anyhow::Result<Option<crate::AutomationV2Spec>> {
863        let _guard = self.automations_v2_persistence.lock().await;
864        let removed = self.automations_v2.write().await.remove(automation_id);
865        if let Some(automation) = removed.clone() {
866            let now = now_ms();
867            {
868                let mut governance = self.automation_governance.write().await;
869                let record = governance
870                    .records
871                    .entry(automation_id.to_string())
872                    .or_insert_with(|| AutomationGovernanceRecord {
873                        automation_id: automation_id.to_string(),
874                        provenance: default_human_provenance(
875                            Some(automation.creator_id.clone()),
876                            "delete_default",
877                        ),
878                        declared_capabilities: declared_capabilities_for_automation(&automation),
879                        modify_grants: Vec::new(),
880                        capability_grants: Vec::new(),
881                        created_at_ms: automation.created_at_ms,
882                        updated_at_ms: now,
883                        deleted_at_ms: None,
884                        delete_retention_until_ms: None,
885                        published_externally: false,
886                        creation_paused: false,
887                        review_required: false,
888                        review_kind: None,
889                        review_requested_at_ms: None,
890                        review_request_id: None,
891                        last_reviewed_at_ms: None,
892                        runs_since_review: 0,
893                        expires_at_ms: None,
894                        expired_at_ms: None,
895                        retired_at_ms: None,
896                        retire_reason: None,
897                        paused_for_lifecycle: false,
898                        health_last_checked_at_ms: None,
899                        health_findings: Vec::new(),
900                    });
901                record.deleted_at_ms = Some(now);
902                record.delete_retention_until_ms =
903                    Some(now.saturating_add(7 * 24 * 60 * 60 * 1000));
904                record.updated_at_ms = now;
905                governance.deleted_automations.insert(
906                    automation_id.to_string(),
907                    DeletedAutomationRecord {
908                        automation: automation.clone(),
909                        deleted_at_ms: now,
910                        deleted_by: deleted_by.clone(),
911                        restore_until_ms: now.saturating_add(7 * 24 * 60 * 60 * 1000),
912                    },
913                );
914                governance.updated_at_ms = now;
915            }
916            self.persist_automation_governance().await?;
917            self.persist_automations_v2_locked().await?;
918            let _ = append_protected_audit_event(
919                self,
920                format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.deleted"),
921                &tandem_types::TenantContext::local_implicit(),
922                deleted_by
923                    .actor_id
924                    .clone()
925                    .or_else(|| deleted_by.source.clone()),
926                json!({
927                    "automationID": automation_id,
928                    "deletedBy": deleted_by,
929                    "deletedAtMs": now,
930                }),
931            )
932            .await;
933        }
934        Ok(removed)
935    }
936
937    pub async fn restore_deleted_automation_v2(
938        &self,
939        automation_id: &str,
940    ) -> anyhow::Result<Option<crate::AutomationV2Spec>> {
941        let restored = {
942            let mut governance = self.automation_governance.write().await;
943            let Some(deleted) = governance.deleted_automations.remove(automation_id) else {
944                return Ok(None);
945            };
946            let automation = deleted.automation.clone();
947            self.automations_v2
948                .write()
949                .await
950                .insert(automation_id.to_string(), automation.clone());
951            if let Some(record) = governance.records.get_mut(automation_id) {
952                record.deleted_at_ms = None;
953                record.delete_retention_until_ms = None;
954                record.updated_at_ms = now_ms();
955            }
956            governance.updated_at_ms = now_ms();
957            automation
958        };
959        self.persist_automation_governance().await?;
960        self.persist_automations_v2().await?;
961        Ok(Some(restored))
962    }
963
964    pub async fn agent_spend_summary(&self, agent_id: &str) -> Option<AgentSpendSummary> {
965        self.automation_governance
966            .read()
967            .await
968            .agent_spend_summary(agent_id)
969    }
970
971    pub async fn list_agent_spend_summaries(&self) -> Vec<AgentSpendSummary> {
972        self.automation_governance
973            .read()
974            .await
975            .agent_spend_summaries()
976    }
977
978    pub async fn agent_creation_review_summary(
979        &self,
980        agent_id: &str,
981    ) -> Option<AgentCreationReviewSummary> {
982        self.automation_governance
983            .read()
984            .await
985            .agent_creation_review_summary(agent_id)
986    }
987
988    pub async fn list_agent_creation_review_summaries(&self) -> Vec<AgentCreationReviewSummary> {
989        self.automation_governance
990            .read()
991            .await
992            .agent_creation_review_summaries()
993    }
994
995    pub async fn record_agent_creation_review_progress(
996        &self,
997        agent_id: &str,
998        automation_id: &str,
999    ) -> anyhow::Result<()> {
1000        let now = now_ms();
1001        let (created_since_review, threshold, should_request) = {
1002            let mut guard = self.automation_governance.write().await;
1003            let threshold = guard.limits.per_agent_creation_review_threshold;
1004            let (created_since_review, should_request) = {
1005                let summary = guard
1006                    .agent_creation_reviews
1007                    .entry(agent_id.to_string())
1008                    .or_insert_with(|| AgentCreationReviewSummary::new(agent_id.to_string(), now));
1009                summary.created_since_review = summary.created_since_review.saturating_add(1);
1010                summary.updated_at_ms = now;
1011                let should_request = threshold > 0
1012                    && summary.created_since_review >= threshold
1013                    && !summary.review_required;
1014                if should_request {
1015                    summary.review_required = true;
1016                    summary.review_kind = Some(AutomationLifecycleReviewKind::CreationQuota);
1017                    summary.review_requested_at_ms = Some(now);
1018                }
1019                (summary.created_since_review, should_request)
1020            };
1021            guard.updated_at_ms = now;
1022            (created_since_review, threshold, should_request)
1023        };
1024        self.persist_automation_governance().await?;
1025        if should_request {
1026            let _ = self
1027                .request_approval(
1028                    GovernanceApprovalRequestType::LifecycleReview,
1029                    GovernanceActorRef::system("automation_creation_review"),
1030                    GovernanceResourceRef {
1031                        resource_type: "agent".to_string(),
1032                        id: agent_id.to_string(),
1033                    },
1034                    format!(
1035                        "Human acknowledgment required after agent {agent_id} created {created_since_review} automations"
1036                    ),
1037                    json!({
1038                        "trigger": "creation_quota",
1039                        "agentID": agent_id,
1040                        "automationID": automation_id,
1041                        "createdSinceReview": created_since_review,
1042                        "creationReviewThreshold": threshold,
1043                    }),
1044                    None,
1045                )
1046                .await;
1047        }
1048        Ok(())
1049    }
1050
1051    pub async fn acknowledge_agent_creation_review(
1052        &self,
1053        agent_id: &str,
1054        reviewer: GovernanceActorRef,
1055        notes: Option<String>,
1056    ) -> anyhow::Result<()> {
1057        let now = now_ms();
1058        {
1059            let mut guard = self.automation_governance.write().await;
1060            let summary = guard
1061                .agent_creation_reviews
1062                .entry(agent_id.to_string())
1063                .or_insert_with(|| AgentCreationReviewSummary::new(agent_id.to_string(), now));
1064            summary.created_since_review = 0;
1065            summary.review_required = false;
1066            summary.review_kind = None;
1067            summary.review_requested_at_ms = None;
1068            summary.review_request_id = None;
1069            summary.last_reviewed_at_ms = Some(now);
1070            summary.last_review_notes = notes.clone();
1071            summary.updated_at_ms = now;
1072            guard.updated_at_ms = now;
1073        }
1074        self.persist_automation_governance().await?;
1075        let _ = append_protected_audit_event(
1076            self,
1077            format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.review.agent_acknowledged"),
1078            &tandem_types::TenantContext::local_implicit(),
1079            reviewer
1080                .actor_id
1081                .clone()
1082                .or_else(|| reviewer.source.clone()),
1083            json!({
1084                "agentID": agent_id,
1085                "reviewer": reviewer,
1086                "notes": notes,
1087            }),
1088        )
1089        .await;
1090        Ok(())
1091    }
1092
1093    pub async fn acknowledge_automation_review(
1094        &self,
1095        automation_id: &str,
1096        reviewer: GovernanceActorRef,
1097        notes: Option<String>,
1098    ) -> anyhow::Result<Option<AutomationGovernanceRecord>> {
1099        let stored = {
1100            let mut guard = self.automation_governance.write().await;
1101            let stored = {
1102                let Some(record) = guard.records.get_mut(automation_id) else {
1103                    return Ok(None);
1104                };
1105                let now = now_ms();
1106                record.review_required = false;
1107                record.review_kind = None;
1108                record.review_requested_at_ms = None;
1109                record.review_request_id = None;
1110                record.last_reviewed_at_ms = Some(now);
1111                record.runs_since_review = 0;
1112                record.health_findings.clear();
1113                record.health_last_checked_at_ms = Some(now);
1114                record.updated_at_ms = now;
1115                record.clone()
1116            };
1117            guard.updated_at_ms = now_ms();
1118            stored
1119        };
1120        self.persist_automation_governance().await?;
1121        let _ = append_protected_audit_event(
1122            self,
1123            format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.review.automation_acknowledged"),
1124            &tandem_types::TenantContext::local_implicit(),
1125            reviewer
1126                .actor_id
1127                .clone()
1128                .or_else(|| reviewer.source.clone()),
1129            json!({
1130                "automationID": automation_id,
1131                "reviewer": reviewer,
1132                "notes": notes,
1133            }),
1134        )
1135        .await;
1136        Ok(Some(stored))
1137    }
1138
1139    pub async fn pause_automation_for_dependency_revocation(
1140        &self,
1141        automation_id: &str,
1142        reason: String,
1143        evidence: Value,
1144    ) -> anyhow::Result<()> {
1145        let Some(automation) = self.get_automation_v2(automation_id).await else {
1146            anyhow::bail!("automation not found");
1147        };
1148        let now = now_ms();
1149        let paused_runs = self
1150            .pause_running_automation_v2_runs(
1151                automation_id,
1152                reason.clone(),
1153                crate::AutomationStopKind::GuardrailStopped,
1154            )
1155            .await;
1156
1157        let dependency_context = json!({
1158            "trigger": "dependency_revoked",
1159            "reason": reason.clone(),
1160            "evidence": evidence,
1161            "pausedRunIDs": paused_runs.clone(),
1162        });
1163        let finding = AutomationLifecycleFinding {
1164            finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1165            kind: AutomationLifecycleReviewKind::DependencyRevoked,
1166            severity: AutomationLifecycleFindingSeverity::Critical,
1167            summary: "automation paused after dependency revocation".to_string(),
1168            detail: Some(
1169                "an owned grant or connected MCP capability was removed and the automation was paused pending review"
1170                    .to_string(),
1171            ),
1172            observed_at_ms: now,
1173            automation_run_id: None,
1174            approval_id: None,
1175            evidence: Some(dependency_context.clone()),
1176        };
1177
1178        let pending_review_id = {
1179            let guard = self.automation_governance.read().await;
1180            guard
1181                .approvals
1182                .values()
1183                .filter(|request| {
1184                    request.request_type == GovernanceApprovalRequestType::LifecycleReview
1185                        && request.status == GovernanceApprovalStatus::Pending
1186                        && request.target_resource.resource_type == "automation"
1187                        && request.target_resource.id == automation_id
1188                })
1189                .max_by_key(|request| request.updated_at_ms)
1190                .map(|request| request.approval_id.clone())
1191        };
1192
1193        {
1194            let mut guard = self.automation_governance.write().await;
1195            let record = guard
1196                .records
1197                .entry(automation_id.to_string())
1198                .or_insert_with(|| AutomationGovernanceRecord {
1199                    automation_id: automation_id.to_string(),
1200                    provenance: default_human_provenance(
1201                        Some(automation.creator_id.clone()),
1202                        "dependency_revocation_default",
1203                    ),
1204                    declared_capabilities: declared_capabilities_for_automation(&automation),
1205                    modify_grants: Vec::new(),
1206                    capability_grants: Vec::new(),
1207                    created_at_ms: automation.created_at_ms,
1208                    updated_at_ms: now,
1209                    deleted_at_ms: None,
1210                    delete_retention_until_ms: None,
1211                    published_externally: false,
1212                    creation_paused: false,
1213                    review_required: false,
1214                    review_kind: None,
1215                    review_requested_at_ms: None,
1216                    review_request_id: None,
1217                    last_reviewed_at_ms: None,
1218                    runs_since_review: 0,
1219                    expires_at_ms: None,
1220                    expired_at_ms: None,
1221                    retired_at_ms: None,
1222                    retire_reason: None,
1223                    paused_for_lifecycle: false,
1224                    health_last_checked_at_ms: None,
1225                    health_findings: Vec::new(),
1226                });
1227            record.declared_capabilities = declared_capabilities_for_automation(&automation);
1228            record.paused_for_lifecycle = true;
1229            record.review_required = true;
1230            record.review_kind = Some(AutomationLifecycleReviewKind::DependencyRevoked);
1231            record.review_requested_at_ms = Some(now);
1232            record.review_request_id = pending_review_id.clone();
1233            record.health_last_checked_at_ms = Some(now);
1234            record.health_findings.push(finding.clone());
1235            record.updated_at_ms = now;
1236            guard.updated_at_ms = now;
1237        }
1238        self.persist_automation_governance().await?;
1239
1240        let mut created_review_id = pending_review_id;
1241        if created_review_id.is_none() {
1242            if let Ok(approval) = self
1243                .request_approval(
1244                    GovernanceApprovalRequestType::LifecycleReview,
1245                    GovernanceActorRef::system("automation_dependency_revocation"),
1246                    GovernanceResourceRef {
1247                        resource_type: "automation".to_string(),
1248                        id: automation_id.to_string(),
1249                    },
1250                    format!(
1251                        "Human review required after dependency revocation paused automation {automation_id}"
1252                    ),
1253                    dependency_context.clone(),
1254                    None,
1255                )
1256                .await
1257            {
1258                created_review_id = Some(approval.approval_id.clone());
1259                {
1260                    let mut guard = self.automation_governance.write().await;
1261                    if let Some(record) = guard.records.get_mut(automation_id) {
1262                        record.review_request_id = created_review_id.clone();
1263                        record.updated_at_ms = now_ms();
1264                    }
1265                    guard.updated_at_ms = now_ms();
1266                }
1267                self.persist_automation_governance().await?;
1268            }
1269        }
1270
1271        let _ = append_protected_audit_event(
1272            self,
1273            format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.dependency_revoked"),
1274            &tandem_types::TenantContext::local_implicit(),
1275            Some("automation_dependency_revocation".to_string()),
1276            json!({
1277                "automationID": automation_id,
1278                "reason": reason,
1279                "pausedRunIDs": paused_runs,
1280                "evidence": dependency_context.clone(),
1281                "reviewRequestID": created_review_id,
1282            }),
1283        )
1284        .await;
1285
1286        Ok(())
1287    }
1288
1289    async fn pause_running_automation_v2_runs(
1290        &self,
1291        automation_id: &str,
1292        reason: String,
1293        stop_kind: crate::AutomationStopKind,
1294    ) -> Vec<String> {
1295        let runs = self.list_automation_v2_runs(Some(automation_id), 100).await;
1296        let mut paused_runs = Vec::new();
1297        for run in runs {
1298            if run.status != crate::AutomationRunStatus::Running {
1299                continue;
1300            }
1301            let session_ids = run.active_session_ids.clone();
1302            let instance_ids = run.active_instance_ids.clone();
1303            let _ = self
1304                .update_automation_v2_run(&run.run_id, |row| {
1305                    row.status = crate::AutomationRunStatus::Pausing;
1306                    row.pause_reason = Some(reason.clone());
1307                })
1308                .await;
1309            for session_id in &session_ids {
1310                let _ = self.cancellations.cancel(session_id).await;
1311            }
1312            for instance_id in instance_ids {
1313                let _ = self
1314                    .agent_teams
1315                    .cancel_instance(self, &instance_id, &reason)
1316                    .await;
1317            }
1318            self.forget_automation_v2_sessions(&session_ids).await;
1319            let _ = self
1320                .update_automation_v2_run(&run.run_id, |row| {
1321                    row.status = crate::AutomationRunStatus::Paused;
1322                    row.active_session_ids.clear();
1323                    row.active_instance_ids.clear();
1324                    row.pause_reason = Some(reason.clone());
1325                    row.stop_kind = Some(stop_kind.clone());
1326                    row.stop_reason = Some(reason.clone());
1327                    crate::app::state::automation::lifecycle::record_automation_lifecycle_event(
1328                        row,
1329                        "run_paused_governance",
1330                        Some(reason.clone()),
1331                        Some(stop_kind.clone()),
1332                    );
1333                })
1334                .await;
1335            paused_runs.push(run.run_id);
1336        }
1337        paused_runs
1338    }
1339
1340    pub async fn record_automation_review_progress(
1341        &self,
1342        automation_id: &str,
1343        reason: AutomationLifecycleReviewKind,
1344        run_id: Option<String>,
1345        detail: Option<String>,
1346    ) -> anyhow::Result<()> {
1347        let now = now_ms();
1348        let (should_request, review_count) = {
1349            let mut guard = self.automation_governance.write().await;
1350            let threshold = guard.limits.run_review_threshold;
1351            let (should_request, review_count) = {
1352                let Some(record) = guard.records.get_mut(automation_id) else {
1353                    return Ok(());
1354                };
1355                record.runs_since_review = record.runs_since_review.saturating_add(1);
1356                record.health_last_checked_at_ms = Some(now);
1357                record.updated_at_ms = now;
1358                let should_request = threshold > 0
1359                    && record.runs_since_review >= threshold
1360                    && !record.review_required;
1361                if should_request {
1362                    record.review_required = true;
1363                    record.review_kind = Some(reason);
1364                    record.review_requested_at_ms = Some(now);
1365                }
1366                (should_request, record.runs_since_review)
1367            };
1368            guard.updated_at_ms = now;
1369            (should_request, review_count)
1370        };
1371        self.persist_automation_governance().await?;
1372        if should_request {
1373            let _ = self
1374                .request_approval(
1375                    GovernanceApprovalRequestType::LifecycleReview,
1376                    GovernanceActorRef::system("automation_lifecycle_review"),
1377                    GovernanceResourceRef {
1378                        resource_type: "automation".to_string(),
1379                        id: automation_id.to_string(),
1380                    },
1381                    format!(
1382                        "Human review required after automation {automation_id} completed {review_count} runs without acknowledgment"
1383                    ),
1384                    json!({
1385                        "trigger": "run_drift",
1386                        "automationID": automation_id,
1387                        "runID": run_id,
1388                        "detail": detail,
1389                        "runCountSinceReview": review_count,
1390                        "reviewKind": "run_drift",
1391                    }),
1392                    None,
1393                )
1394                .await;
1395        }
1396        Ok(())
1397    }
1398
1399    pub async fn run_automation_governance_health_check(&self) -> anyhow::Result<usize> {
1400        let now = now_ms();
1401        let limits = self.automation_governance.read().await.limits.clone();
1402        let automations = self.list_automations_v2().await;
1403        let mut finding_count = 0usize;
1404
1405        for automation in automations {
1406            let runs = self
1407                .list_automation_v2_runs(
1408                    Some(&automation.automation_id),
1409                    limits.health_window_run_limit.max(5) as usize,
1410                )
1411                .await;
1412            let terminal_runs = runs
1413                .iter()
1414                .filter(|run| {
1415                    matches!(
1416                        run.status,
1417                        crate::AutomationRunStatus::Completed
1418                            | crate::AutomationRunStatus::Blocked
1419                            | crate::AutomationRunStatus::Failed
1420                            | crate::AutomationRunStatus::Cancelled
1421                    )
1422                })
1423                .collect::<Vec<_>>();
1424            let failure_count = terminal_runs
1425                .iter()
1426                .filter(|run| {
1427                    matches!(
1428                        run.status,
1429                        crate::AutomationRunStatus::Failed | crate::AutomationRunStatus::Blocked
1430                    )
1431                })
1432                .count();
1433            let empty_output_count = terminal_runs
1434                .iter()
1435                .filter(|run| {
1436                    run.status == crate::AutomationRunStatus::Completed
1437                        && run.checkpoint.node_outputs.is_empty()
1438                })
1439                .count();
1440            let guardrail_stop_count = terminal_runs
1441                .iter()
1442                .filter(|run| run.stop_kind == Some(crate::AutomationStopKind::GuardrailStopped))
1443                .count();
1444
1445            let mut findings = Vec::new();
1446            let mut automation_expires_at_ms = None;
1447            if !terminal_runs.is_empty() {
1448                let failure_rate = failure_count as f64 / terminal_runs.len() as f64;
1449                if failure_rate >= limits.health_failure_rate_threshold && terminal_runs.len() >= 5
1450                {
1451                    findings.push(AutomationLifecycleFinding {
1452                        finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1453                        kind: AutomationLifecycleReviewKind::HealthDrift,
1454                        severity: if failure_rate >= 0.75 {
1455                            AutomationLifecycleFindingSeverity::Critical
1456                        } else {
1457                            AutomationLifecycleFindingSeverity::Warning
1458                        },
1459                        summary: "high failure rate across recent runs".to_string(),
1460                        detail: Some(format!(
1461                            "{} of {} recent terminal runs failed or were blocked ({:.0}% failure rate)",
1462                            failure_count,
1463                            terminal_runs.len(),
1464                            failure_rate * 100.0
1465                        )),
1466                        observed_at_ms: now,
1467                        automation_run_id: terminal_runs.last().map(|run| run.run_id.clone()),
1468                        approval_id: None,
1469                        evidence: Some(json!({
1470                            "failureCount": failure_count,
1471                            "terminalRunCount": terminal_runs.len(),
1472                            "failureRate": failure_rate,
1473                        })),
1474                    });
1475                }
1476            }
1477            if empty_output_count > 0 {
1478                findings.push(AutomationLifecycleFinding {
1479                    finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1480                    kind: AutomationLifecycleReviewKind::HealthDrift,
1481                    severity: AutomationLifecycleFindingSeverity::Warning,
1482                    summary: "completed runs emitted empty outputs".to_string(),
1483                    detail: Some(format!(
1484                        "{} recent completed runs produced no node outputs",
1485                        empty_output_count
1486                    )),
1487                    observed_at_ms: now,
1488                    automation_run_id: terminal_runs.last().map(|run| run.run_id.clone()),
1489                    approval_id: None,
1490                    evidence: Some(json!({
1491                        "emptyOutputCount": empty_output_count,
1492                    })),
1493                });
1494            }
1495            if guardrail_stop_count >= limits.health_guardrail_stop_threshold as usize
1496                && limits.health_guardrail_stop_threshold > 0
1497            {
1498                findings.push(AutomationLifecycleFinding {
1499                    finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1500                    kind: AutomationLifecycleReviewKind::HealthDrift,
1501                    severity: AutomationLifecycleFindingSeverity::Warning,
1502                    summary: "repeated guardrail stops detected".to_string(),
1503                    detail: Some(format!(
1504                        "{} recent terminal runs stopped on guardrails",
1505                        guardrail_stop_count
1506                    )),
1507                    observed_at_ms: now,
1508                    automation_run_id: terminal_runs.last().map(|run| run.run_id.clone()),
1509                    approval_id: None,
1510                    evidence: Some(json!({
1511                        "guardrailStopCount": guardrail_stop_count,
1512                    })),
1513                });
1514            }
1515
1516            let mut should_create_review_request = false;
1517            let mut should_create_retirement_request = false;
1518            let mut should_pause_expired = false;
1519            {
1520                let mut guard = self.automation_governance.write().await;
1521                let has_pending_lifecycle_review = guard.has_pending_approval_request(
1522                    GovernanceApprovalRequestType::LifecycleReview,
1523                    "automation",
1524                    &automation.automation_id,
1525                );
1526                let has_pending_retirement_request = guard.has_pending_approval_request(
1527                    GovernanceApprovalRequestType::RetirementAction,
1528                    "automation",
1529                    &automation.automation_id,
1530                );
1531                let Some(record) = guard.records.get_mut(&automation.automation_id) else {
1532                    continue;
1533                };
1534                automation_expires_at_ms = record.expires_at_ms;
1535                record.health_last_checked_at_ms = Some(now);
1536                record.health_findings = findings.clone();
1537                if !findings.is_empty() {
1538                    record.review_required = true;
1539                    record.review_kind = Some(AutomationLifecycleReviewKind::HealthDrift);
1540                    if record.review_requested_at_ms.is_none() {
1541                        record.review_requested_at_ms = Some(now);
1542                    }
1543                    should_create_review_request = !has_pending_lifecycle_review;
1544                }
1545                if let Some(expires_at_ms) = record.expires_at_ms {
1546                    if now >= expires_at_ms && record.expired_at_ms.is_none() {
1547                        record.expired_at_ms = Some(now);
1548                        record.review_required = true;
1549                        record.review_kind = Some(AutomationLifecycleReviewKind::Expired);
1550                        record.review_requested_at_ms = Some(now);
1551                        record.paused_for_lifecycle = true;
1552                        should_pause_expired = true;
1553                        should_create_retirement_request = !has_pending_retirement_request;
1554                        findings.push(AutomationLifecycleFinding {
1555                            finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1556                            kind: AutomationLifecycleReviewKind::Expired,
1557                            severity: AutomationLifecycleFindingSeverity::Critical,
1558                            summary: "automation has expired and was paused".to_string(),
1559                            detail: Some(format!(
1560                                "automation expired at {} and has been paused for human review",
1561                                expires_at_ms
1562                            )),
1563                            observed_at_ms: now,
1564                            automation_run_id: terminal_runs.last().map(|run| run.run_id.clone()),
1565                            approval_id: None,
1566                            evidence: Some(json!({
1567                                "expiresAtMs": expires_at_ms,
1568                                "expiredAtMs": now,
1569                            })),
1570                        });
1571                    } else if expires_at_ms > now
1572                        && expires_at_ms.saturating_sub(now) <= limits.expiration_warning_window_ms
1573                    {
1574                        record.review_required = true;
1575                        record.review_kind = Some(AutomationLifecycleReviewKind::ExpirationSoon);
1576                        if record.review_requested_at_ms.is_none() {
1577                            record.review_requested_at_ms = Some(now);
1578                        }
1579                        should_create_retirement_request = !has_pending_retirement_request;
1580                        findings.push(AutomationLifecycleFinding {
1581                            finding_id: format!("finding-{}", uuid::Uuid::new_v4().simple()),
1582                            kind: AutomationLifecycleReviewKind::ExpirationSoon,
1583                            severity: AutomationLifecycleFindingSeverity::Info,
1584                            summary: "automation is approaching its expiration date".to_string(),
1585                            detail: Some(format!(
1586                                "automation expires in {}ms",
1587                                expires_at_ms.saturating_sub(now)
1588                            )),
1589                            observed_at_ms: now,
1590                            automation_run_id: None,
1591                            approval_id: None,
1592                            evidence: Some(json!({
1593                                "expiresAtMs": expires_at_ms,
1594                                "warningWindowMs": limits.expiration_warning_window_ms,
1595                            })),
1596                        });
1597                    }
1598                }
1599                record.health_findings = findings.clone();
1600                record.updated_at_ms = now;
1601                guard.updated_at_ms = now;
1602            }
1603            self.persist_automation_governance().await?;
1604
1605            if should_pause_expired && automation.status != crate::AutomationV2Status::Paused {
1606                let mut paused = automation.clone();
1607                paused.status = crate::AutomationV2Status::Paused;
1608                let _ = self.put_automation_v2(paused).await;
1609                let _ = self
1610                    .pause_running_automation_v2_runs(
1611                        &automation.automation_id,
1612                        format!(
1613                            "automation expired after reaching {}ms retention",
1614                            limits.default_expires_after_ms
1615                        ),
1616                        crate::AutomationStopKind::GuardrailStopped,
1617                    )
1618                    .await;
1619            }
1620
1621            if should_create_review_request {
1622                let _ = self
1623                    .request_approval(
1624                        GovernanceApprovalRequestType::LifecycleReview,
1625                        GovernanceActorRef::system("automation_health_check"),
1626                        GovernanceResourceRef {
1627                            resource_type: "automation".to_string(),
1628                            id: automation.automation_id.clone(),
1629                        },
1630                        format!(
1631                            "Human review required after health check detected drift in automation {}",
1632                            automation.automation_id
1633                        ),
1634                        json!({
1635                            "trigger": "health_drift",
1636                            "automationID": automation.automation_id,
1637                            "findingCount": findings.len(),
1638                        }),
1639                        None,
1640                    )
1641                    .await;
1642            }
1643
1644            if should_create_retirement_request {
1645                let _ = self
1646                    .request_approval(
1647                        GovernanceApprovalRequestType::RetirementAction,
1648                        GovernanceActorRef::system("automation_expiration"),
1649                        GovernanceResourceRef {
1650                            resource_type: "automation".to_string(),
1651                            id: automation.automation_id.clone(),
1652                        },
1653                        format!(
1654                            "Automation {} is expiring or has expired and needs operator action",
1655                            automation.automation_id
1656                        ),
1657                        json!({
1658                            "trigger": if should_pause_expired {
1659                                "expired"
1660                            } else {
1661                                "expiration_soon"
1662                            },
1663                            "automationID": automation.automation_id,
1664                            "expiresAtMs": automation_expires_at_ms,
1665                        }),
1666                        None,
1667                    )
1668                    .await;
1669            }
1670
1671            finding_count += findings.len();
1672        }
1673
1674        Ok(finding_count)
1675    }
1676
1677    pub async fn retire_automation_v2(
1678        &self,
1679        automation_id: &str,
1680        actor: GovernanceActorRef,
1681        reason: Option<String>,
1682    ) -> anyhow::Result<Option<crate::AutomationV2Spec>> {
1683        let Some(mut automation) = self.get_automation_v2(automation_id).await else {
1684            return Ok(None);
1685        };
1686        let now = now_ms();
1687        let reason = reason.unwrap_or_else(|| "retired by operator".to_string());
1688        automation.status = crate::AutomationV2Status::Paused;
1689        let stored = self.put_automation_v2(automation).await?;
1690        let _ = self
1691            .pause_running_automation_v2_runs(
1692                automation_id,
1693                reason.clone(),
1694                crate::AutomationStopKind::OperatorStopped,
1695            )
1696            .await;
1697        {
1698            let mut guard = self.automation_governance.write().await;
1699            let record = guard
1700                .records
1701                .entry(automation_id.to_string())
1702                .or_insert_with(|| AutomationGovernanceRecord {
1703                    automation_id: automation_id.to_string(),
1704                    provenance: default_human_provenance(
1705                        Some(stored.creator_id.clone()),
1706                        "retire_default",
1707                    ),
1708                    declared_capabilities: declared_capabilities_for_automation(&stored),
1709                    modify_grants: Vec::new(),
1710                    capability_grants: Vec::new(),
1711                    created_at_ms: stored.created_at_ms,
1712                    updated_at_ms: now,
1713                    deleted_at_ms: None,
1714                    delete_retention_until_ms: None,
1715                    published_externally: false,
1716                    creation_paused: false,
1717                    review_required: false,
1718                    review_kind: None,
1719                    review_requested_at_ms: None,
1720                    review_request_id: None,
1721                    last_reviewed_at_ms: None,
1722                    runs_since_review: 0,
1723                    expires_at_ms: None,
1724                    expired_at_ms: None,
1725                    retired_at_ms: None,
1726                    retire_reason: None,
1727                    paused_for_lifecycle: false,
1728                    health_last_checked_at_ms: None,
1729                    health_findings: Vec::new(),
1730                });
1731            record.retired_at_ms = Some(now);
1732            record.retire_reason = Some(reason.clone());
1733            record.paused_for_lifecycle = true;
1734            record.review_required = false;
1735            record.review_kind = Some(AutomationLifecycleReviewKind::Retired);
1736            record.review_requested_at_ms = Some(now);
1737            record.updated_at_ms = now;
1738            guard.updated_at_ms = now;
1739        }
1740        self.persist_automation_governance().await?;
1741        let _ = append_protected_audit_event(
1742            self,
1743            format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.retired"),
1744            &tandem_types::TenantContext::local_implicit(),
1745            actor.actor_id.clone().or_else(|| actor.source.clone()),
1746            json!({
1747                "automationID": automation_id,
1748                "reason": reason,
1749                "actor": actor,
1750            }),
1751        )
1752        .await;
1753        Ok(Some(stored))
1754    }
1755
1756    pub async fn extend_automation_v2_retirement(
1757        &self,
1758        automation_id: &str,
1759        actor: GovernanceActorRef,
1760        expires_at_ms: Option<u64>,
1761        reason: Option<String>,
1762    ) -> anyhow::Result<Option<crate::AutomationV2Spec>> {
1763        let Some(mut automation) = self.get_automation_v2(automation_id).await else {
1764            return Ok(None);
1765        };
1766        let now = now_ms();
1767        let default_expires_after_ms = self
1768            .automation_governance
1769            .read()
1770            .await
1771            .limits
1772            .default_expires_after_ms;
1773        let next_expires_at_ms =
1774            expires_at_ms.unwrap_or_else(|| now.saturating_add(default_expires_after_ms.max(1)));
1775        automation.status = crate::AutomationV2Status::Active;
1776        let stored = self.put_automation_v2(automation).await?;
1777        {
1778            let mut guard = self.automation_governance.write().await;
1779            let record = guard
1780                .records
1781                .entry(automation_id.to_string())
1782                .or_insert_with(|| AutomationGovernanceRecord {
1783                    automation_id: automation_id.to_string(),
1784                    provenance: default_human_provenance(
1785                        Some(stored.creator_id.clone()),
1786                        "extend_default",
1787                    ),
1788                    declared_capabilities: declared_capabilities_for_automation(&stored),
1789                    modify_grants: Vec::new(),
1790                    capability_grants: Vec::new(),
1791                    created_at_ms: stored.created_at_ms,
1792                    updated_at_ms: now,
1793                    deleted_at_ms: None,
1794                    delete_retention_until_ms: None,
1795                    published_externally: false,
1796                    creation_paused: false,
1797                    review_required: false,
1798                    review_kind: None,
1799                    review_requested_at_ms: None,
1800                    review_request_id: None,
1801                    last_reviewed_at_ms: None,
1802                    runs_since_review: 0,
1803                    expires_at_ms: None,
1804                    expired_at_ms: None,
1805                    retired_at_ms: None,
1806                    retire_reason: None,
1807                    paused_for_lifecycle: false,
1808                    health_last_checked_at_ms: None,
1809                    health_findings: Vec::new(),
1810                });
1811            record.expires_at_ms = Some(next_expires_at_ms);
1812            record.expired_at_ms = None;
1813            record.retired_at_ms = None;
1814            record.retire_reason = None;
1815            record.paused_for_lifecycle = false;
1816            record.review_required = false;
1817            record.review_kind = None;
1818            record.review_requested_at_ms = None;
1819            record.review_request_id = None;
1820            record.last_reviewed_at_ms = Some(now);
1821            record.health_last_checked_at_ms = Some(now);
1822            record.updated_at_ms = now;
1823            guard.updated_at_ms = now;
1824        }
1825        self.persist_automation_governance().await?;
1826        let _ = append_protected_audit_event(
1827            self,
1828            format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.retirement.extended"),
1829            &tandem_types::TenantContext::local_implicit(),
1830            actor.actor_id.clone().or_else(|| actor.source.clone()),
1831            json!({
1832                "automationID": automation_id,
1833                "expiresAtMs": next_expires_at_ms,
1834                "reason": reason,
1835                "actor": actor,
1836            }),
1837        )
1838        .await;
1839        Ok(Some(stored))
1840    }
1841
1842    pub async fn record_automation_v2_spend(
1843        &self,
1844        run_id: &str,
1845        prompt_tokens: u64,
1846        completion_tokens: u64,
1847        total_tokens: u64,
1848        delta_cost_usd: f64,
1849    ) -> anyhow::Result<()> {
1850        let Some(run_snapshot) = self.get_automation_v2_run(run_id).await else {
1851            return Ok(());
1852        };
1853        let automation = if let Some(snapshot) = run_snapshot.automation_snapshot.clone() {
1854            snapshot
1855        } else {
1856            let Some(automation) = self.get_automation_v2(&run_snapshot.automation_id).await else {
1857                return Ok(());
1858            };
1859            automation
1860        };
1861        let governance = self
1862            .get_or_bootstrap_automation_governance(&automation)
1863            .await;
1864        let agent_ids = governance.agent_lineage_ids();
1865        if agent_ids.is_empty() {
1866            return Ok(());
1867        }
1868
1869        let now = now_ms();
1870        let (weekly_cap, warning_threshold_ratio) = {
1871            let guard = self.automation_governance.read().await;
1872            (
1873                guard.limits.weekly_spend_cap_usd,
1874                guard.limits.spend_warning_threshold_ratio,
1875            )
1876        };
1877
1878        let mut warning_events: Vec<(String, f64, f64)> = Vec::new();
1879        let mut hard_stop_agents: Vec<(String, f64, f64)> = Vec::new();
1880        {
1881            let mut guard = self.automation_governance.write().await;
1882            for agent_id in &agent_ids {
1883                let has_override = guard.has_approved_agent_quota_override(agent_id);
1884                let mut hard_stop_entry: Option<(String, f64, f64)> = None;
1885                let summary = guard
1886                    .agent_spend
1887                    .entry(agent_id.clone())
1888                    .or_insert_with(|| AgentSpendSummary::new(agent_id.clone(), now));
1889                summary.apply_usage(
1890                    now,
1891                    Some(&automation.automation_id),
1892                    Some(run_id),
1893                    prompt_tokens,
1894                    completion_tokens,
1895                    total_tokens,
1896                    delta_cost_usd,
1897                );
1898                if let Some(limit) = weekly_cap {
1899                    if summary.weekly_warning_threshold_reached(limit, warning_threshold_ratio)
1900                        && summary.weekly.soft_warning_at_ms.is_none()
1901                    {
1902                        summary.weekly.soft_warning_at_ms = Some(now);
1903                        warning_events.push((agent_id.clone(), summary.weekly.cost_usd, limit));
1904                    }
1905                    if summary.weekly_limit_reached(limit)
1906                        && summary.weekly.hard_stop_at_ms.is_none()
1907                        && !has_override
1908                    {
1909                        summary.weekly.hard_stop_at_ms = Some(now);
1910                        summary.paused_at_ms = Some(now);
1911                        summary.pause_reason =
1912                            Some(format!("weekly spend cap {:.2} USD reached", limit));
1913                        hard_stop_entry = Some((agent_id.clone(), summary.weekly.cost_usd, limit));
1914                    }
1915                }
1916                if let Some((agent_id, cost_usd, limit_usd)) = hard_stop_entry {
1917                    if !guard
1918                        .spend_paused_agents
1919                        .iter()
1920                        .any(|value| value == &agent_id)
1921                    {
1922                        guard.spend_paused_agents.push(agent_id.clone());
1923                    }
1924                    hard_stop_agents.push((agent_id, cost_usd, limit_usd));
1925                }
1926            }
1927            guard.updated_at_ms = now;
1928        }
1929        self.persist_automation_governance().await?;
1930
1931        for (agent_id, cost_usd, limit_usd) in warning_events {
1932            let _ = append_protected_audit_event(
1933                self,
1934                format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.spend.warning"),
1935                &tandem_types::TenantContext::local_implicit(),
1936                governance
1937                    .provenance
1938                    .creator
1939                    .actor_id
1940                    .clone()
1941                    .or_else(|| Some(automation.creator_id.clone())),
1942                json!({
1943                    "automationID": automation.automation_id,
1944                    "runID": run_id,
1945                    "agentID": agent_id,
1946                    "weeklyCostUsd": cost_usd,
1947                    "weeklySpendCapUsd": limit_usd,
1948                }),
1949            )
1950            .await;
1951        }
1952
1953        let mut requested_approvals = Vec::new();
1954        for (agent_id, cost_usd, limit_usd) in &hard_stop_agents {
1955            let guard = self.automation_governance.read().await;
1956            let has_override = guard.has_pending_agent_quota_override(agent_id)
1957                || guard.has_approved_agent_quota_override(agent_id);
1958            drop(guard);
1959            if has_override {
1960                continue;
1961            }
1962            if let Ok(approval) = self
1963                .request_approval(
1964                    GovernanceApprovalRequestType::QuotaOverride,
1965                    GovernanceActorRef::system("automation_spend_cap"),
1966                    GovernanceResourceRef {
1967                        resource_type: "agent".to_string(),
1968                        id: agent_id.clone(),
1969                    },
1970                    format!(
1971                        "Approve temporary quota override after agent {agent_id} reached weekly spend cap"
1972                    ),
1973                    json!({
1974                        "automationID": automation.automation_id,
1975                        "runID": run_id,
1976                        "agentID": agent_id,
1977                        "weeklyCostUsd": cost_usd,
1978                        "weeklySpendCapUsd": limit_usd,
1979                        "reason": "agent weekly spend cap exceeded",
1980                    }),
1981                    None,
1982                )
1983                .await
1984            {
1985                requested_approvals.push(approval.approval_id);
1986            }
1987        }
1988
1989        if !hard_stop_agents.is_empty() {
1990            let session_ids = run_snapshot.active_session_ids.clone();
1991            for session_id in &session_ids {
1992                let _ = self.cancellations.cancel(session_id).await;
1993            }
1994            self.forget_automation_v2_sessions(&session_ids).await;
1995            let instance_ids = run_snapshot.active_instance_ids.clone();
1996            for instance_id in instance_ids {
1997                let _ = self
1998                    .agent_teams
1999                    .cancel_instance(self, &instance_id, "paused by spend guardrail")
2000                    .await;
2001            }
2002            let paused_agent_labels = hard_stop_agents
2003                .iter()
2004                .map(|(agent_id, cost_usd, limit_usd)| {
2005                    format!("{agent_id} ({cost_usd:.4}/{limit_usd:.4} USD)")
2006                })
2007                .collect::<Vec<_>>()
2008                .join(", ");
2009            let detail = format!("weekly spend cap exceeded for {paused_agent_labels}");
2010            let _ = self
2011                .update_automation_v2_run(run_id, |row| {
2012                    row.status = crate::AutomationRunStatus::Paused;
2013                    row.detail = Some(detail.clone());
2014                    row.pause_reason = Some(detail.clone());
2015                    row.stop_kind = Some(crate::AutomationStopKind::GuardrailStopped);
2016                    row.stop_reason = Some(detail.clone());
2017                    row.active_session_ids.clear();
2018                    row.latest_session_id = None;
2019                    row.active_instance_ids.clear();
2020                    crate::app::state::automation::lifecycle::record_automation_lifecycle_event(
2021                        row,
2022                        "run_paused_spend_cap_exceeded",
2023                        Some(detail.clone()),
2024                        Some(crate::AutomationStopKind::GuardrailStopped),
2025                    );
2026                })
2027                .await;
2028            let _ = append_protected_audit_event(
2029                self,
2030                format!("{GOVERNANCE_AUDIT_EVENT_PREFIX}.spend.paused"),
2031                &tandem_types::TenantContext::local_implicit(),
2032                governance
2033                    .provenance
2034                    .creator
2035                    .actor_id
2036                    .clone()
2037                    .or_else(|| Some(automation.creator_id.clone())),
2038                json!({
2039                    "automationID": automation.automation_id,
2040                    "runID": run_id,
2041                    "pausedAgents": hard_stop_agents
2042                        .iter()
2043                        .map(|(agent_id, _, _)| agent_id)
2044                        .cloned()
2045                        .collect::<Vec<_>>(),
2046                    "requestedApprovals": requested_approvals,
2047                    "detail": detail,
2048                }),
2049            )
2050            .await;
2051        }
2052
2053        Ok(())
2054    }
2055}