Skip to main content

tandem_server/
optimization.rs

1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use sha2::{Digest, Sha256};
4use std::path::{Path, PathBuf};
5
6use crate::{AutomationV2RunRecord, AutomationV2Spec};
7
8#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
9#[serde(rename_all = "snake_case")]
10pub enum OptimizationTargetKind {
11    WorkflowV2PromptObjectiveOptimization,
12}
13
14#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
15#[serde(rename_all = "snake_case")]
16pub enum OptimizationCampaignStatus {
17    Draft,
18    Running,
19    AwaitingPromotionApproval,
20    PausedManual,
21    PausedBudget,
22    PausedEvaluatorUnstable,
23    Completed,
24    Failed,
25}
26
27#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
28#[serde(rename_all = "snake_case")]
29pub enum OptimizationExperimentStatus {
30    Draft,
31    Completed,
32    PromotionRecommended,
33    PromotionApproved,
34    PromotionRejected,
35    Discarded,
36    Failed,
37}
38
39#[derive(Debug, Clone, Serialize, Deserialize, Default)]
40pub struct OptimizationArtifactRefs {
41    pub objective_ref: String,
42    pub eval_ref: String,
43    pub mutation_policy_ref: String,
44    pub scope_ref: String,
45    pub budget_ref: String,
46    #[serde(default, skip_serializing_if = "Option::is_none")]
47    pub research_log_ref: Option<String>,
48    #[serde(default, skip_serializing_if = "Option::is_none")]
49    pub summary_ref: Option<String>,
50}
51
52#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct OptimizationFrozenArtifact {
54    pub artifact_ref: String,
55    pub resolved_path: String,
56    pub sha256: String,
57    pub size_bytes: u64,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct OptimizationFrozenArtifacts {
62    pub objective: OptimizationFrozenArtifact,
63    pub eval: OptimizationFrozenArtifact,
64    pub mutation_policy: OptimizationFrozenArtifact,
65    pub scope: OptimizationFrozenArtifact,
66    pub budget: OptimizationFrozenArtifact,
67}
68
69#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
70pub struct OptimizationExecutionOverride {
71    pub provider_id: String,
72    pub model_id: String,
73}
74
75#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
76#[serde(rename_all = "snake_case")]
77pub enum OptimizationMetricKind {
78    ArtifactValidatorPassRate,
79    UnmetRequirementCount,
80}
81
82#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
83#[serde(rename_all = "snake_case")]
84pub enum OptimizationGuardrailKind {
85    BlockedNodeRate,
86    BudgetCeilings,
87}
88
89#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
90#[serde(rename_all = "snake_case")]
91pub enum OptimizationMutableField {
92    Objective,
93    OutputContractSummaryGuidance,
94    TimeoutMs,
95    RetryPolicyMaxAttempts,
96    RetryPolicyRetries,
97}
98
99#[derive(Debug, Clone, Serialize, Deserialize)]
100pub struct OptimizationEvalSpec {
101    pub pack_ref: String,
102    pub primary_metric: OptimizationMetricKind,
103    pub secondary_metric: OptimizationMetricKind,
104    #[serde(default)]
105    pub hard_guardrails: Vec<OptimizationGuardrailKind>,
106    pub campaign_start_baseline_runs: u32,
107    pub baseline_replay_every_candidates: u32,
108    pub baseline_replay_every_minutes: u32,
109}
110
111#[derive(Debug, Clone, Serialize, Deserialize)]
112pub struct OptimizationMutationPolicy {
113    pub max_nodes_changed_per_candidate: u32,
114    pub max_field_families_changed_per_candidate: u32,
115    #[serde(default)]
116    pub allowed_text_fields: Vec<OptimizationMutableField>,
117    #[serde(default)]
118    pub allowed_knob_fields: Vec<OptimizationMutableField>,
119    pub max_text_delta_chars: u32,
120    pub max_text_delta_ratio: f64,
121    pub timeout_delta_percent: f64,
122    pub timeout_delta_ms: u64,
123    pub timeout_min_ms: u64,
124    pub timeout_max_ms: u64,
125    pub retry_delta: i32,
126    pub retry_min: i32,
127    pub retry_max: i32,
128    #[serde(default)]
129    pub allow_text_and_knob_bundle: bool,
130}
131
132#[derive(Debug, Clone, Serialize, Deserialize)]
133pub struct OptimizationSafetyScope {
134    pub candidate_snapshot_only: bool,
135    pub allow_live_source_mutation: bool,
136    pub allow_external_side_effects_in_eval: bool,
137    pub promotion_requires_operator_approval: bool,
138    #[serde(default)]
139    pub forbidden_fields: Vec<String>,
140}
141
142#[derive(Debug, Clone, Serialize, Deserialize)]
143pub struct OptimizationBudgetPolicy {
144    pub max_experiments: u32,
145    pub max_runtime_minutes: u32,
146    pub max_consecutive_failures: u32,
147    #[serde(default, skip_serializing_if = "Option::is_none")]
148    pub max_total_tokens: Option<u64>,
149    #[serde(default, skip_serializing_if = "Option::is_none")]
150    pub max_total_cost_usd: Option<f64>,
151}
152
153#[derive(Debug, Clone, Serialize, Deserialize)]
154pub struct OptimizationPhase1Config {
155    pub objective_markdown: String,
156    pub eval: OptimizationEvalSpec,
157    pub mutation_policy: OptimizationMutationPolicy,
158    pub scope: OptimizationSafetyScope,
159    pub budget: OptimizationBudgetPolicy,
160}
161
162#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
163pub struct OptimizationPhase1Metrics {
164    pub artifact_validator_pass_rate: f64,
165    pub unmet_requirement_count: f64,
166    pub blocked_node_rate: f64,
167    pub budget_within_limits: bool,
168}
169
170#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
171pub struct OptimizationBaselineReplayRecord {
172    pub replay_id: String,
173    #[serde(default, skip_serializing_if = "Option::is_none")]
174    pub automation_run_id: Option<String>,
175    pub phase1_metrics: OptimizationPhase1Metrics,
176    #[serde(default)]
177    pub validator_case_outcomes: std::collections::BTreeMap<String, String>,
178    #[serde(default)]
179    pub experiment_count_at_recording: u64,
180    pub recorded_at_ms: u64,
181}
182
183#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
184#[serde(rename_all = "snake_case")]
185pub enum OptimizationPromotionDecisionKind {
186    Promote,
187    Discard,
188    NeedsOperatorReview,
189}
190
191#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
192pub struct OptimizationPromotionDecision {
193    pub decision: OptimizationPromotionDecisionKind,
194    pub reason: String,
195}
196
197#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
198pub struct OptimizationValidatedMutation {
199    pub node_id: String,
200    pub field: OptimizationMutableField,
201    pub summary: String,
202}
203
204#[derive(Debug, Clone, Serialize, Deserialize)]
205pub struct OptimizationCampaignRecord {
206    pub optimization_id: String,
207    pub name: String,
208    pub target_kind: OptimizationTargetKind,
209    pub status: OptimizationCampaignStatus,
210    pub source_workflow_id: String,
211    pub source_workflow_name: String,
212    pub source_workflow_snapshot: AutomationV2Spec,
213    pub source_workflow_snapshot_hash: String,
214    pub baseline_snapshot: AutomationV2Spec,
215    pub baseline_snapshot_hash: String,
216    #[serde(default, skip_serializing_if = "Option::is_none")]
217    pub execution_override: Option<OptimizationExecutionOverride>,
218    pub artifacts: OptimizationArtifactRefs,
219    pub frozen_artifacts: OptimizationFrozenArtifacts,
220    #[serde(default, skip_serializing_if = "Option::is_none")]
221    pub phase1: Option<OptimizationPhase1Config>,
222    #[serde(default, skip_serializing_if = "Option::is_none")]
223    pub baseline_metrics: Option<OptimizationPhase1Metrics>,
224    #[serde(default)]
225    pub baseline_replays: Vec<OptimizationBaselineReplayRecord>,
226    #[serde(default)]
227    pub pending_baseline_run_ids: Vec<String>,
228    #[serde(default, skip_serializing_if = "Option::is_none")]
229    pub pending_promotion_experiment_id: Option<String>,
230    #[serde(default, skip_serializing_if = "Option::is_none")]
231    pub last_pause_reason: Option<String>,
232    pub created_at_ms: u64,
233    pub updated_at_ms: u64,
234    #[serde(default, skip_serializing_if = "Option::is_none")]
235    pub metadata: Option<Value>,
236}
237
238#[derive(Debug, Clone, Serialize, Deserialize)]
239pub struct OptimizationExperimentRecord {
240    pub experiment_id: String,
241    pub optimization_id: String,
242    pub status: OptimizationExperimentStatus,
243    pub candidate_snapshot: AutomationV2Spec,
244    pub candidate_snapshot_hash: String,
245    pub baseline_snapshot_hash: String,
246    #[serde(default, skip_serializing_if = "Option::is_none")]
247    pub mutation_summary: Option<String>,
248    #[serde(default, skip_serializing_if = "Option::is_none")]
249    pub metrics: Option<Value>,
250    #[serde(default, skip_serializing_if = "Option::is_none")]
251    pub phase1_metrics: Option<OptimizationPhase1Metrics>,
252    #[serde(default, skip_serializing_if = "Option::is_none")]
253    pub promotion_recommendation: Option<String>,
254    #[serde(default, skip_serializing_if = "Option::is_none")]
255    pub promotion_decision: Option<OptimizationPromotionDecision>,
256    pub created_at_ms: u64,
257    pub updated_at_ms: u64,
258    #[serde(default, skip_serializing_if = "Option::is_none")]
259    pub metadata: Option<Value>,
260}
261
262pub fn optimization_snapshot_hash(snapshot: &AutomationV2Spec) -> String {
263    let canonical = serde_json::to_vec(snapshot).unwrap_or_default();
264    let mut hasher = Sha256::new();
265    hasher.update(canonical);
266    format!("{:x}", hasher.finalize())
267}
268
269pub fn apply_optimization_execution_override(
270    workflow: &AutomationV2Spec,
271    execution_override: &OptimizationExecutionOverride,
272) -> AutomationV2Spec {
273    let mut snapshot = workflow.clone();
274    for agent in &mut snapshot.agents {
275        let mut policy = agent
276            .model_policy
277            .clone()
278            .and_then(|value| value.as_object().cloned())
279            .unwrap_or_default();
280        let fixed_model = serde_json::json!({
281            "provider_id": execution_override.provider_id,
282            "model_id": execution_override.model_id,
283        });
284        policy.insert("default_model".to_string(), fixed_model.clone());
285        if let Some(role_models) = policy.get_mut("role_models").and_then(Value::as_object_mut) {
286            for role_model in role_models.values_mut() {
287                *role_model = fixed_model.clone();
288            }
289        }
290        agent.model_policy = Some(Value::Object(policy));
291    }
292    snapshot
293}
294
295pub fn freeze_optimization_artifact(
296    workspace_root: &str,
297    artifact_ref: &str,
298) -> Result<OptimizationFrozenArtifact, String> {
299    let trimmed = artifact_ref.trim();
300    if trimmed.is_empty() {
301        return Err("artifact ref is required".to_string());
302    }
303    let workspace = PathBuf::from(workspace_root);
304    let candidate = PathBuf::from(trimmed);
305    let resolved = if candidate.is_absolute() {
306        candidate
307    } else {
308        workspace.join(candidate)
309    };
310    if !tandem_core::is_within_workspace_root(&resolved, &workspace) {
311        return Err(format!(
312            "artifact `{trimmed}` must stay within workspace root `{workspace_root}`"
313        ));
314    }
315    let metadata = std::fs::metadata(&resolved)
316        .map_err(|_| format!("artifact `{trimmed}` does not exist or is unreadable"))?;
317    if !metadata.is_file() {
318        return Err(format!("artifact `{trimmed}` must be a file"));
319    }
320    let bytes =
321        std::fs::read(&resolved).map_err(|_| format!("artifact `{trimmed}` could not be read"))?;
322    let mut hasher = Sha256::new();
323    hasher.update(&bytes);
324    let resolved_path = resolved
325        .canonicalize()
326        .unwrap_or_else(|_| Path::new(&resolved).to_path_buf());
327    Ok(OptimizationFrozenArtifact {
328        artifact_ref: trimmed.to_string(),
329        resolved_path: resolved_path.to_string_lossy().to_string(),
330        sha256: format!("{:x}", hasher.finalize()),
331        size_bytes: metadata.len(),
332    })
333}
334
335fn read_optimization_artifact_text(
336    artifact: &OptimizationFrozenArtifact,
337    label: &str,
338) -> Result<String, String> {
339    std::fs::read_to_string(&artifact.resolved_path).map_err(|_| {
340        format!(
341            "{label} artifact `{}` could not be read as UTF-8 text",
342            artifact.artifact_ref
343        )
344    })
345}
346
347fn parse_yaml_artifact<T: for<'de> Deserialize<'de>>(
348    artifact: &OptimizationFrozenArtifact,
349    label: &str,
350) -> Result<T, String> {
351    let raw = read_optimization_artifact_text(artifact, label)?;
352    serde_yaml::from_str::<T>(&raw).map_err(|error| {
353        format!(
354            "failed to parse {label} artifact `{}`: {error}",
355            artifact.artifact_ref
356        )
357    })
358}
359
360fn validate_phase1_eval_spec(eval: &OptimizationEvalSpec) -> Result<(), String> {
361    if eval.pack_ref.trim().is_empty() {
362        return Err("eval pack_ref is required".to_string());
363    }
364    if eval.primary_metric != OptimizationMetricKind::ArtifactValidatorPassRate {
365        return Err("phase 1 eval.primary_metric must be artifact_validator_pass_rate".to_string());
366    }
367    if eval.secondary_metric != OptimizationMetricKind::UnmetRequirementCount {
368        return Err("phase 1 eval.secondary_metric must be unmet_requirement_count".to_string());
369    }
370    if eval.hard_guardrails.len() != 2
371        || !eval
372            .hard_guardrails
373            .contains(&OptimizationGuardrailKind::BlockedNodeRate)
374        || !eval
375            .hard_guardrails
376            .contains(&OptimizationGuardrailKind::BudgetCeilings)
377    {
378        return Err(
379            "phase 1 eval.hard_guardrails must be exactly blocked_node_rate and budget_ceilings"
380                .to_string(),
381        );
382    }
383    if eval.campaign_start_baseline_runs != 2 {
384        return Err("phase 1 campaign_start_baseline_runs must be 2".to_string());
385    }
386    if eval.baseline_replay_every_candidates != 5 {
387        return Err("phase 1 baseline_replay_every_candidates must be 5".to_string());
388    }
389    if eval.baseline_replay_every_minutes != 30 {
390        return Err("phase 1 baseline_replay_every_minutes must be 30".to_string());
391    }
392    Ok(())
393}
394
395fn validate_phase1_mutation_policy(policy: &OptimizationMutationPolicy) -> Result<(), String> {
396    if policy.max_nodes_changed_per_candidate != 1 {
397        return Err("phase 1 max_nodes_changed_per_candidate must be 1".to_string());
398    }
399    if policy.max_field_families_changed_per_candidate != 1 {
400        return Err("phase 1 max_field_families_changed_per_candidate must be 1".to_string());
401    }
402    if policy.allowed_text_fields.is_empty() && policy.allowed_knob_fields.is_empty() {
403        return Err("phase 1 mutation policy must allow at least one mutable field".to_string());
404    }
405    if policy.allowed_text_fields.iter().any(|field| {
406        !matches!(
407            field,
408            OptimizationMutableField::Objective
409                | OptimizationMutableField::OutputContractSummaryGuidance
410        )
411    }) {
412        return Err(
413            "phase 1 allowed_text_fields may only include objective or output_contract_summary_guidance"
414                .to_string(),
415        );
416    }
417    if policy.allowed_knob_fields.iter().any(|field| {
418        !matches!(
419            field,
420            OptimizationMutableField::TimeoutMs
421                | OptimizationMutableField::RetryPolicyMaxAttempts
422                | OptimizationMutableField::RetryPolicyRetries
423        )
424    }) {
425        return Err(
426            "phase 1 allowed_knob_fields may only include timeout_ms, retry_policy_max_attempts, or retry_policy_retries"
427                .to_string(),
428        );
429    }
430    if policy.max_text_delta_chars == 0 || policy.max_text_delta_chars > 300 {
431        return Err("phase 1 max_text_delta_chars must be between 1 and 300".to_string());
432    }
433    if !(0.0 < policy.max_text_delta_ratio && policy.max_text_delta_ratio <= 0.25) {
434        return Err("phase 1 max_text_delta_ratio must be > 0 and <= 0.25".to_string());
435    }
436    if !(0.0 < policy.timeout_delta_percent && policy.timeout_delta_percent <= 0.15) {
437        return Err("phase 1 timeout_delta_percent must be > 0 and <= 0.15".to_string());
438    }
439    if policy.timeout_delta_ms == 0 || policy.timeout_delta_ms > 30_000 {
440        return Err("phase 1 timeout_delta_ms must be between 1 and 30000".to_string());
441    }
442    if policy.timeout_min_ms < 30_000
443        || policy.timeout_max_ms > 600_000
444        || policy.timeout_min_ms >= policy.timeout_max_ms
445    {
446        return Err(
447            "phase 1 timeout bounds must stay within 30000..600000 ms and min < max".to_string(),
448        );
449    }
450    if policy.retry_delta <= 0 || policy.retry_delta > 1 {
451        return Err("phase 1 retry_delta must be 1".to_string());
452    }
453    if policy.retry_min < 0 || policy.retry_max > 3 || policy.retry_min > policy.retry_max {
454        return Err("phase 1 retry bounds must stay within 0..3".to_string());
455    }
456    if policy.allow_text_and_knob_bundle {
457        return Err("phase 1 may not bundle text and knob mutations".to_string());
458    }
459    Ok(())
460}
461
462fn validate_phase1_scope(scope: &OptimizationSafetyScope) -> Result<(), String> {
463    if !scope.candidate_snapshot_only {
464        return Err("phase 1 scope must require candidate_snapshot_only".to_string());
465    }
466    if scope.allow_live_source_mutation {
467        return Err("phase 1 scope must forbid live source mutation".to_string());
468    }
469    if scope.allow_external_side_effects_in_eval {
470        return Err("phase 1 scope must forbid external side effects in eval".to_string());
471    }
472    if !scope.promotion_requires_operator_approval {
473        return Err("phase 1 scope must require operator approval for promotion".to_string());
474    }
475    Ok(())
476}
477
478fn validate_phase1_budget(budget: &OptimizationBudgetPolicy) -> Result<(), String> {
479    if budget.max_experiments == 0 || budget.max_experiments > 100 {
480        return Err("phase 1 budget.max_experiments must be between 1 and 100".to_string());
481    }
482    if budget.max_runtime_minutes == 0 || budget.max_runtime_minutes > 1_440 {
483        return Err("phase 1 budget.max_runtime_minutes must be between 1 and 1440".to_string());
484    }
485    if budget.max_consecutive_failures == 0 || budget.max_consecutive_failures > 10 {
486        return Err("phase 1 budget.max_consecutive_failures must be between 1 and 10".to_string());
487    }
488    if let Some(max_cost_usd) = budget.max_total_cost_usd {
489        if !max_cost_usd.is_finite() || max_cost_usd <= 0.0 {
490            return Err("phase 1 budget.max_total_cost_usd must be positive".to_string());
491        }
492    }
493    if let Some(max_total_tokens) = budget.max_total_tokens {
494        if max_total_tokens == 0 {
495            return Err("phase 1 budget.max_total_tokens must be positive".to_string());
496        }
497    }
498    Ok(())
499}
500
501pub fn load_optimization_phase1_config(
502    frozen_artifacts: &OptimizationFrozenArtifacts,
503) -> Result<OptimizationPhase1Config, String> {
504    let objective_markdown =
505        read_optimization_artifact_text(&frozen_artifacts.objective, "objective")?;
506    if objective_markdown.trim().is_empty() {
507        return Err("objective artifact must not be empty".to_string());
508    }
509    let eval = parse_yaml_artifact::<OptimizationEvalSpec>(&frozen_artifacts.eval, "eval")?;
510    let mutation_policy = parse_yaml_artifact::<OptimizationMutationPolicy>(
511        &frozen_artifacts.mutation_policy,
512        "mutation policy",
513    )?;
514    let scope = parse_yaml_artifact::<OptimizationSafetyScope>(&frozen_artifacts.scope, "scope")?;
515    let budget =
516        parse_yaml_artifact::<OptimizationBudgetPolicy>(&frozen_artifacts.budget, "budget")?;
517    validate_phase1_eval_spec(&eval)?;
518    validate_phase1_mutation_policy(&mutation_policy)?;
519    validate_phase1_scope(&scope)?;
520    validate_phase1_budget(&budget)?;
521    Ok(OptimizationPhase1Config {
522        objective_markdown: objective_markdown.trim().to_string(),
523        eval,
524        mutation_policy,
525        scope,
526        budget,
527    })
528}
529
530fn workflow_has_simple_retry_field(
531    workflow: &AutomationV2Spec,
532    field: OptimizationMutableField,
533) -> bool {
534    workflow.flow.nodes.iter().any(|node| {
535        node.retry_policy
536            .as_ref()
537            .and_then(Value::as_object)
538            .and_then(|obj| match field {
539                OptimizationMutableField::RetryPolicyMaxAttempts => obj.get("max_attempts"),
540                OptimizationMutableField::RetryPolicyRetries => obj.get("retries"),
541                _ => None,
542            })
543            .and_then(Value::as_i64)
544            .is_some()
545    })
546}
547
548pub fn validate_phase1_workflow_target(
549    workflow: &AutomationV2Spec,
550    phase1: &OptimizationPhase1Config,
551) -> Result<(), String> {
552    if workflow.flow.nodes.is_empty() {
553        return Err("phase 1 workflow target must contain at least one node".to_string());
554    }
555    if !workflow.flow.nodes.iter().any(|node| {
556        node.output_contract
557            .as_ref()
558            .and_then(|contract| contract.validator)
559            .is_some()
560    }) {
561        return Err(
562            "phase 1 workflow target must contain at least one validator-backed output contract"
563                .to_string(),
564        );
565    }
566    let has_objective_field = phase1
567        .mutation_policy
568        .allowed_text_fields
569        .contains(&OptimizationMutableField::Objective);
570    let has_summary_guidance_field = phase1
571        .mutation_policy
572        .allowed_text_fields
573        .contains(&OptimizationMutableField::OutputContractSummaryGuidance)
574        && workflow.flow.nodes.iter().any(|node| {
575            node.output_contract
576                .as_ref()
577                .and_then(|contract| contract.summary_guidance.as_ref())
578                .is_some()
579        });
580    let has_timeout_field = phase1
581        .mutation_policy
582        .allowed_knob_fields
583        .contains(&OptimizationMutableField::TimeoutMs)
584        && workflow
585            .flow
586            .nodes
587            .iter()
588            .any(|node| node.timeout_ms.is_some());
589    let has_retry_field = phase1
590        .mutation_policy
591        .allowed_knob_fields
592        .iter()
593        .copied()
594        .filter(|field| {
595            matches!(
596                field,
597                OptimizationMutableField::RetryPolicyMaxAttempts
598                    | OptimizationMutableField::RetryPolicyRetries
599            )
600        })
601        .any(|field| workflow_has_simple_retry_field(workflow, field));
602    if !(has_objective_field || has_summary_guidance_field || has_timeout_field || has_retry_field)
603    {
604        return Err(
605            "phase 1 workflow target does not expose any mutable fields allowed by the mutation policy"
606                .to_string(),
607        );
608    }
609    Ok(())
610}
611
612fn mutable_field_label(field: OptimizationMutableField) -> &'static str {
613    match field {
614        OptimizationMutableField::Objective => "objective",
615        OptimizationMutableField::OutputContractSummaryGuidance => {
616            "output_contract.summary_guidance"
617        }
618        OptimizationMutableField::TimeoutMs => "timeout_ms",
619        OptimizationMutableField::RetryPolicyMaxAttempts => "retry_policy.max_attempts",
620        OptimizationMutableField::RetryPolicyRetries => "retry_policy.retries",
621    }
622}
623
624fn json_value<T: Serialize>(value: &T) -> Value {
625    serde_json::to_value(value).unwrap_or(Value::Null)
626}
627
628fn normalized_workflow_without_flow(snapshot: &AutomationV2Spec) -> Value {
629    let mut value = json_value(snapshot);
630    if let Some(obj) = value.as_object_mut() {
631        obj.remove("flow");
632    }
633    value
634}
635
636fn normalized_node_static_fields(node: &crate::AutomationFlowNode) -> Value {
637    let mut value = json_value(node);
638    if let Some(obj) = value.as_object_mut() {
639        obj.remove("objective");
640        obj.remove("output_contract");
641        obj.remove("retry_policy");
642        obj.remove("timeout_ms");
643    }
644    value
645}
646
647fn normalized_output_contract(contract: &Option<crate::AutomationFlowOutputContract>) -> Value {
648    let mut value = json_value(contract);
649    if let Some(obj) = value.as_object_mut() {
650        obj.remove("summary_guidance");
651    }
652    value
653}
654
655fn normalized_retry_policy(policy: &Option<Value>) -> Value {
656    let mut value = policy.clone().unwrap_or(Value::Null);
657    if let Some(obj) = value.as_object_mut() {
658        obj.remove("max_attempts");
659        obj.remove("retries");
660    }
661    value
662}
663
664fn retry_field_value(policy: &Option<Value>, field: OptimizationMutableField) -> Option<i64> {
665    policy
666        .as_ref()
667        .and_then(Value::as_object)
668        .and_then(|obj| match field {
669            OptimizationMutableField::RetryPolicyMaxAttempts => obj.get("max_attempts"),
670            OptimizationMutableField::RetryPolicyRetries => obj.get("retries"),
671            _ => None,
672        })
673        .and_then(Value::as_i64)
674}
675
/// Size of the differing middle section of two strings, in `char`s.
///
/// Strips the longest common prefix, then the longest common suffix of what
/// remains, and returns the number of leftover chars in `before` plus the
/// number of leftover chars in `after`.
fn text_delta_chars(before: &str, after: &str) -> usize {
    let old: Vec<char> = before.chars().collect();
    let new: Vec<char> = after.chars().collect();

    let shared_prefix = old
        .iter()
        .zip(new.iter())
        .take_while(|(lhs, rhs)| lhs == rhs)
        .count();

    // The suffix may not reach back into the region the prefix already claimed.
    let suffix_budget = old.len().min(new.len()) - shared_prefix;
    let shared_suffix = old
        .iter()
        .rev()
        .zip(new.iter().rev())
        .take(suffix_budget)
        .take_while(|(lhs, rhs)| lhs == rhs)
        .count();

    let shared = shared_prefix + shared_suffix;
    (old.len() - shared) + (new.len() - shared)
}
697
698fn validate_text_mutation(
699    node_id: &str,
700    field: OptimizationMutableField,
701    before: &str,
702    after: &str,
703    policy: &OptimizationMutationPolicy,
704) -> Result<OptimizationValidatedMutation, String> {
705    if after.trim().is_empty() {
706        return Err(format!(
707            "node `{node_id}` {} must not become empty",
708            mutable_field_label(field)
709        ));
710    }
711    let delta_chars = text_delta_chars(before, after);
712    let baseline_len = before.chars().count().max(1);
713    let delta_ratio = delta_chars as f64 / baseline_len as f64;
714    if delta_chars == 0 {
715        return Err(format!(
716            "node `{node_id}` {} must change",
717            mutable_field_label(field)
718        ));
719    }
720    if delta_chars > policy.max_text_delta_chars as usize {
721        return Err(format!(
722            "node `{node_id}` {} exceeds phase 1 max_text_delta_chars",
723            mutable_field_label(field)
724        ));
725    }
726    if delta_ratio > policy.max_text_delta_ratio {
727        return Err(format!(
728            "node `{node_id}` {} exceeds phase 1 max_text_delta_ratio",
729            mutable_field_label(field)
730        ));
731    }
732    Ok(OptimizationValidatedMutation {
733        node_id: node_id.to_string(),
734        field,
735        summary: format!(
736            "mutate node `{node_id}` {} delta_chars={} delta_ratio={delta_ratio:.3}",
737            mutable_field_label(field),
738            delta_chars
739        ),
740    })
741}
742
743fn validate_timeout_mutation(
744    node_id: &str,
745    before: Option<u64>,
746    after: Option<u64>,
747    policy: &OptimizationMutationPolicy,
748) -> Result<OptimizationValidatedMutation, String> {
749    let before = before.ok_or_else(|| {
750        format!("node `{node_id}` timeout_ms is not mutable in phase 1 because it is absent")
751    })?;
752    let after = after
753        .ok_or_else(|| format!("node `{node_id}` timeout_ms may not be removed in phase 1"))?;
754    if after < policy.timeout_min_ms || after > policy.timeout_max_ms {
755        return Err(format!(
756            "node `{node_id}` timeout_ms must stay within {}..={} ms",
757            policy.timeout_min_ms, policy.timeout_max_ms
758        ));
759    }
760    let delta = after.abs_diff(before);
761    let allowed_percent_delta = ((before as f64) * policy.timeout_delta_percent).ceil() as u64;
762    if delta == 0 {
763        return Err(format!("node `{node_id}` timeout_ms must change"));
764    }
765    if delta > policy.timeout_delta_ms || delta > allowed_percent_delta {
766        return Err(format!(
767            "node `{node_id}` timeout_ms exceeds phase 1 timeout delta limits"
768        ));
769    }
770    Ok(OptimizationValidatedMutation {
771        node_id: node_id.to_string(),
772        field: OptimizationMutableField::TimeoutMs,
773        summary: format!(
774            "mutate node `{node_id}` timeout_ms from {before} to {after} (delta={delta})"
775        ),
776    })
777}
778
779fn validate_retry_mutation(
780    node_id: &str,
781    field: OptimizationMutableField,
782    before: Option<i64>,
783    after: Option<i64>,
784    policy: &OptimizationMutationPolicy,
785) -> Result<OptimizationValidatedMutation, String> {
786    let before = before.ok_or_else(|| {
787        format!(
788            "node `{node_id}` {} is not mutable in phase 1 because it is absent or non-integer",
789            mutable_field_label(field)
790        )
791    })?;
792    let after = after.ok_or_else(|| {
793        format!(
794            "node `{node_id}` {} may not be removed in phase 1",
795            mutable_field_label(field)
796        )
797    })?;
798    if after < policy.retry_min as i64 || after > policy.retry_max as i64 {
799        return Err(format!(
800            "node `{node_id}` {} must stay within {}..={}",
801            mutable_field_label(field),
802            policy.retry_min,
803            policy.retry_max
804        ));
805    }
806    let delta = (after - before).abs();
807    if delta == 0 {
808        return Err(format!(
809            "node `{node_id}` {} must change",
810            mutable_field_label(field)
811        ));
812    }
813    if delta > policy.retry_delta.abs() as i64 {
814        return Err(format!(
815            "node `{node_id}` {} exceeds phase 1 retry delta limit",
816            mutable_field_label(field)
817        ));
818    }
819    Ok(OptimizationValidatedMutation {
820        node_id: node_id.to_string(),
821        field,
822        summary: format!(
823            "mutate node `{node_id}` {} from {before} to {after}",
824            mutable_field_label(field)
825        ),
826    })
827}
828
829pub fn validate_phase1_candidate_mutation(
830    baseline: &AutomationV2Spec,
831    candidate: &AutomationV2Spec,
832    phase1: &OptimizationPhase1Config,
833) -> Result<OptimizationValidatedMutation, String> {
834    if normalized_workflow_without_flow(baseline) != normalized_workflow_without_flow(candidate) {
835        return Err(
836            "phase 1 candidate may not mutate workflow fields outside flow.nodes".to_string(),
837        );
838    }
839    if baseline.flow.nodes.len() != candidate.flow.nodes.len() {
840        return Err("phase 1 candidate may not add or remove workflow nodes".to_string());
841    }
842    let mut changes = Vec::new();
843    for (baseline_node, candidate_node) in
844        baseline.flow.nodes.iter().zip(candidate.flow.nodes.iter())
845    {
846        if baseline_node.node_id != candidate_node.node_id {
847            return Err("phase 1 candidate may not reorder or replace workflow nodes".to_string());
848        }
849        if normalized_node_static_fields(baseline_node)
850            != normalized_node_static_fields(candidate_node)
851        {
852            return Err(format!(
853                "phase 1 candidate may not mutate node `{}` outside the allowed field families",
854                baseline_node.node_id
855            ));
856        }
857        if normalized_output_contract(&baseline_node.output_contract)
858            != normalized_output_contract(&candidate_node.output_contract)
859        {
860            return Err(format!(
861                "phase 1 candidate may not mutate node `{}` output_contract outside summary_guidance",
862                baseline_node.node_id
863            ));
864        }
865        if normalized_retry_policy(&baseline_node.retry_policy)
866            != normalized_retry_policy(&candidate_node.retry_policy)
867        {
868            return Err(format!(
869                "phase 1 candidate may not mutate node `{}` retry_policy outside max_attempts/retries",
870                baseline_node.node_id
871            ));
872        }
873        if baseline_node.objective != candidate_node.objective {
874            if !phase1
875                .mutation_policy
876                .allowed_text_fields
877                .contains(&OptimizationMutableField::Objective)
878            {
879                return Err(format!(
880                    "node `{}` objective is not allowed by the phase 1 mutation policy",
881                    baseline_node.node_id
882                ));
883            }
884            changes.push(validate_text_mutation(
885                &baseline_node.node_id,
886                OptimizationMutableField::Objective,
887                &baseline_node.objective,
888                &candidate_node.objective,
889                &phase1.mutation_policy,
890            )?);
891        }
892        let baseline_summary = baseline_node
893            .output_contract
894            .as_ref()
895            .and_then(|contract| contract.summary_guidance.as_deref());
896        let candidate_summary = candidate_node
897            .output_contract
898            .as_ref()
899            .and_then(|contract| contract.summary_guidance.as_deref());
900        if baseline_summary != candidate_summary {
901            if !phase1
902                .mutation_policy
903                .allowed_text_fields
904                .contains(&OptimizationMutableField::OutputContractSummaryGuidance)
905            {
906                return Err(format!(
907                    "node `{}` output_contract.summary_guidance is not allowed by the phase 1 mutation policy",
908                    baseline_node.node_id
909                ));
910            }
911            let before = baseline_summary.ok_or_else(|| {
912                format!(
913                    "node `{}` output_contract.summary_guidance may not be created in phase 1",
914                    baseline_node.node_id
915                )
916            })?;
917            let after = candidate_summary.ok_or_else(|| {
918                format!(
919                    "node `{}` output_contract.summary_guidance may not be removed in phase 1",
920                    baseline_node.node_id
921                )
922            })?;
923            changes.push(validate_text_mutation(
924                &baseline_node.node_id,
925                OptimizationMutableField::OutputContractSummaryGuidance,
926                before,
927                after,
928                &phase1.mutation_policy,
929            )?);
930        }
931        if baseline_node.timeout_ms != candidate_node.timeout_ms {
932            if !phase1
933                .mutation_policy
934                .allowed_knob_fields
935                .contains(&OptimizationMutableField::TimeoutMs)
936            {
937                return Err(format!(
938                    "node `{}` timeout_ms is not allowed by the phase 1 mutation policy",
939                    baseline_node.node_id
940                ));
941            }
942            changes.push(validate_timeout_mutation(
943                &baseline_node.node_id,
944                baseline_node.timeout_ms,
945                candidate_node.timeout_ms,
946                &phase1.mutation_policy,
947            )?);
948        }
949        for field in [
950            OptimizationMutableField::RetryPolicyMaxAttempts,
951            OptimizationMutableField::RetryPolicyRetries,
952        ] {
953            let before = retry_field_value(&baseline_node.retry_policy, field);
954            let after = retry_field_value(&candidate_node.retry_policy, field);
955            if before != after {
956                if !phase1.mutation_policy.allowed_knob_fields.contains(&field) {
957                    return Err(format!(
958                        "node `{}` {} is not allowed by the phase 1 mutation policy",
959                        baseline_node.node_id,
960                        mutable_field_label(field)
961                    ));
962                }
963                changes.push(validate_retry_mutation(
964                    &baseline_node.node_id,
965                    field,
966                    before,
967                    after,
968                    &phase1.mutation_policy,
969                )?);
970            }
971        }
972    }
973    if changes.is_empty() {
974        return Err("phase 1 candidate must change exactly one allowed field family".to_string());
975    }
976    let changed_nodes = changes
977        .iter()
978        .map(|change| change.node_id.as_str())
979        .collect::<std::collections::BTreeSet<_>>();
980    if changed_nodes.len() > phase1.mutation_policy.max_nodes_changed_per_candidate as usize {
981        return Err("phase 1 candidate may only change one node per experiment".to_string());
982    }
983    if changes.len()
984        > phase1
985            .mutation_policy
986            .max_field_families_changed_per_candidate as usize
987    {
988        return Err(
989            "phase 1 candidate may only change one field family per experiment".to_string(),
990        );
991    }
992    Ok(changes.into_iter().next().expect("non-empty change set"))
993}
994
995fn metric_f64(metrics: &Value, key: &str) -> Option<f64> {
996    metrics.get(key).and_then(Value::as_f64)
997}
998
999pub fn parse_phase1_metrics(metrics: &Value) -> Result<OptimizationPhase1Metrics, String> {
1000    let artifact_validator_pass_rate = metric_f64(metrics, "artifact_validator_pass_rate")
1001        .or_else(|| metric_f64(metrics, "validator_pass_rate"))
1002        .ok_or_else(|| "phase 1 metrics require artifact_validator_pass_rate".to_string())?;
1003    let unmet_requirement_count = metric_f64(metrics, "unmet_requirement_count")
1004        .ok_or_else(|| "phase 1 metrics require unmet_requirement_count".to_string())?;
1005    let blocked_node_rate = metric_f64(metrics, "blocked_node_rate")
1006        .ok_or_else(|| "phase 1 metrics require blocked_node_rate".to_string())?;
1007    let budget_within_limits = metrics
1008        .get("budget_within_limits")
1009        .and_then(Value::as_bool)
1010        .ok_or_else(|| "phase 1 metrics require budget_within_limits".to_string())?;
1011    Ok(OptimizationPhase1Metrics {
1012        artifact_validator_pass_rate,
1013        unmet_requirement_count,
1014        blocked_node_rate,
1015        budget_within_limits,
1016    })
1017}
1018
1019pub fn derive_phase1_metrics_from_run(
1020    run: &AutomationV2RunRecord,
1021    baseline_snapshot: &AutomationV2Spec,
1022    phase1: &OptimizationPhase1Config,
1023) -> Result<OptimizationPhase1Metrics, String> {
1024    let total_nodes = baseline_snapshot.flow.nodes.len().max(1) as f64;
1025    let validator_outputs = run
1026        .checkpoint
1027        .node_outputs
1028        .values()
1029        .filter_map(Value::as_object)
1030        .filter_map(|row| row.get("validator_summary"))
1031        .filter_map(Value::as_object)
1032        .collect::<Vec<_>>();
1033    if validator_outputs.is_empty() {
1034        return Err("automation run does not contain validator-backed outputs".to_string());
1035    }
1036    let passed_count = validator_outputs
1037        .iter()
1038        .filter(|summary| {
1039            summary
1040                .get("outcome")
1041                .and_then(Value::as_str)
1042                .is_some_and(|value| value.eq_ignore_ascii_case("passed"))
1043        })
1044        .count() as f64;
1045    let unmet_requirement_count = validator_outputs
1046        .iter()
1047        .map(|summary| {
1048            summary
1049                .get("unmet_requirements")
1050                .and_then(Value::as_array)
1051                .map(|rows| rows.len() as f64)
1052                .unwrap_or(0.0)
1053        })
1054        .sum::<f64>();
1055    let runtime_ms = run
1056        .finished_at_ms
1057        .or(Some(run.updated_at_ms))
1058        .unwrap_or(run.updated_at_ms)
1059        .saturating_sub(run.started_at_ms.unwrap_or(run.created_at_ms));
1060    let within_tokens = phase1
1061        .budget
1062        .max_total_tokens
1063        .is_none_or(|limit| run.total_tokens <= limit);
1064    let within_cost = phase1
1065        .budget
1066        .max_total_cost_usd
1067        .is_none_or(|limit| run.estimated_cost_usd <= limit);
1068    let within_runtime =
1069        runtime_ms <= (phase1.budget.max_runtime_minutes as u64).saturating_mul(60_000);
1070    Ok(OptimizationPhase1Metrics {
1071        artifact_validator_pass_rate: passed_count / validator_outputs.len() as f64,
1072        unmet_requirement_count,
1073        blocked_node_rate: run.checkpoint.blocked_nodes.len() as f64 / total_nodes,
1074        budget_within_limits: within_tokens && within_cost && within_runtime,
1075    })
1076}
1077
1078pub fn derive_phase1_validator_case_outcomes_from_run(
1079    run: &AutomationV2RunRecord,
1080) -> std::collections::BTreeMap<String, String> {
1081    run.checkpoint
1082        .node_outputs
1083        .iter()
1084        .filter_map(|(node_id, output)| {
1085            let outcome = output
1086                .as_object()
1087                .and_then(|row| row.get("validator_summary"))
1088                .and_then(Value::as_object)
1089                .and_then(|summary| summary.get("outcome"))
1090                .and_then(Value::as_str)
1091                .map(str::trim)
1092                .filter(|value| !value.is_empty())?;
1093            Some((node_id.clone(), outcome.to_ascii_lowercase()))
1094        })
1095        .collect()
1096}
1097
1098pub fn evaluate_phase1_promotion(
1099    baseline: &OptimizationPhase1Metrics,
1100    candidate: &OptimizationPhase1Metrics,
1101) -> OptimizationPromotionDecision {
1102    if !candidate.budget_within_limits {
1103        return OptimizationPromotionDecision {
1104            decision: OptimizationPromotionDecisionKind::Discard,
1105            reason: "candidate exceeded phase 1 budget ceilings".to_string(),
1106        };
1107    }
1108    if candidate.blocked_node_rate > baseline.blocked_node_rate {
1109        return OptimizationPromotionDecision {
1110            decision: OptimizationPromotionDecisionKind::Discard,
1111            reason: "candidate increased blocked_node_rate".to_string(),
1112        };
1113    }
1114    if candidate.artifact_validator_pass_rate > baseline.artifact_validator_pass_rate {
1115        return OptimizationPromotionDecision {
1116            decision: OptimizationPromotionDecisionKind::Promote,
1117            reason: "candidate improved artifact_validator_pass_rate".to_string(),
1118        };
1119    }
1120    if (candidate.artifact_validator_pass_rate - baseline.artifact_validator_pass_rate).abs()
1121        <= f64::EPSILON
1122        && candidate.unmet_requirement_count < baseline.unmet_requirement_count
1123    {
1124        return OptimizationPromotionDecision {
1125            decision: OptimizationPromotionDecisionKind::Promote,
1126            reason: "candidate improved unmet_requirement_count on a primary-metric tie"
1127                .to_string(),
1128        };
1129    }
1130    OptimizationPromotionDecision {
1131        decision: OptimizationPromotionDecisionKind::Discard,
1132        reason: "candidate did not beat the current phase 1 baseline".to_string(),
1133    }
1134}
1135
1136pub fn establish_phase1_baseline(
1137    replays: &[OptimizationBaselineReplayRecord],
1138    phase1: &OptimizationPhase1Config,
1139) -> Result<OptimizationPhase1Metrics, String> {
1140    let required_runs = phase1.eval.campaign_start_baseline_runs.max(1) as usize;
1141    if replays.len() < required_runs {
1142        return Err(format!(
1143            "phase 1 baseline establishment requires at least {required_runs} replay runs"
1144        ));
1145    }
1146    let relevant = &replays[replays.len() - required_runs..];
1147    if relevant
1148        .iter()
1149        .any(|replay| !replay.phase1_metrics.budget_within_limits)
1150    {
1151        return Err("phase 1 baseline replay exceeded budget ceilings".to_string());
1152    }
1153    let validator_min = relevant
1154        .iter()
1155        .map(|replay| replay.phase1_metrics.artifact_validator_pass_rate)
1156        .fold(f64::INFINITY, f64::min);
1157    let validator_max = relevant
1158        .iter()
1159        .map(|replay| replay.phase1_metrics.artifact_validator_pass_rate)
1160        .fold(f64::NEG_INFINITY, f64::max);
1161    if validator_max - validator_min > 0.05 {
1162        return Err(
1163            "phase 1 baseline replay drift exceeded 5 percentage points for artifact_validator_pass_rate"
1164                .to_string(),
1165        );
1166    }
1167    let blocked_min = relevant
1168        .iter()
1169        .map(|replay| replay.phase1_metrics.blocked_node_rate)
1170        .fold(f64::INFINITY, f64::min);
1171    let blocked_max = relevant
1172        .iter()
1173        .map(|replay| replay.phase1_metrics.blocked_node_rate)
1174        .fold(f64::NEG_INFINITY, f64::max);
1175    if blocked_max - blocked_min > 0.05 {
1176        return Err(
1177            "phase 1 baseline replay drift exceeded 5 percentage points for blocked_node_rate"
1178                .to_string(),
1179        );
1180    }
1181    let mut case_outcomes =
1182        std::collections::BTreeMap::<String, std::collections::BTreeSet<String>>::new();
1183    for replay in relevant {
1184        for (case_id, outcome) in &replay.validator_case_outcomes {
1185            case_outcomes
1186                .entry(case_id.clone())
1187                .or_default()
1188                .insert(outcome.clone());
1189        }
1190    }
1191    let total_cases = case_outcomes.len();
1192    if total_cases > 0 {
1193        let flaky_cases = case_outcomes
1194            .values()
1195            .filter(|outcomes| outcomes.len() > 1)
1196            .count();
1197        if (flaky_cases as f64 / total_cases as f64) > 0.10 {
1198            return Err(format!(
1199                "phase 1 baseline replay flakiness exceeded 10% of validator cases ({flaky_cases}/{total_cases})"
1200            ));
1201        }
1202    }
1203    let count = relevant.len() as f64;
1204    Ok(OptimizationPhase1Metrics {
1205        artifact_validator_pass_rate: relevant
1206            .iter()
1207            .map(|replay| replay.phase1_metrics.artifact_validator_pass_rate)
1208            .sum::<f64>()
1209            / count,
1210        unmet_requirement_count: relevant
1211            .iter()
1212            .map(|replay| replay.phase1_metrics.unmet_requirement_count)
1213            .sum::<f64>()
1214            / count,
1215        blocked_node_rate: relevant
1216            .iter()
1217            .map(|replay| replay.phase1_metrics.blocked_node_rate)
1218            .sum::<f64>()
1219            / count,
1220        budget_within_limits: true,
1221    })
1222}
1223
1224pub fn phase1_baseline_replay_due(
1225    replays: &[OptimizationBaselineReplayRecord],
1226    pending_run_count: usize,
1227    phase1: &OptimizationPhase1Config,
1228    experiment_count: usize,
1229    now_ms: u64,
1230) -> bool {
1231    if pending_run_count > 0 {
1232        return false;
1233    }
1234    let required_runs = phase1.eval.campaign_start_baseline_runs.max(1) as usize;
1235    if replays.len() < required_runs {
1236        return true;
1237    }
1238    let Some(last_replay) = replays.last() else {
1239        return true;
1240    };
1241    let candidate_interval = phase1.eval.baseline_replay_every_candidates.max(1) as usize;
1242    let candidates_since_last =
1243        experiment_count.saturating_sub(last_replay.experiment_count_at_recording as usize);
1244    if candidates_since_last >= candidate_interval {
1245        return true;
1246    }
1247    let replay_interval_ms = (phase1.eval.baseline_replay_every_minutes.max(1) as u64) * 60_000;
1248    now_ms.saturating_sub(last_replay.recorded_at_ms) >= replay_interval_ms
1249}
1250
1251#[cfg(test)]
1252mod tests {
1253    use super::*;
1254    use serde_json::json;
1255
1256    fn sample_phase1() -> OptimizationPhase1Config {
1257        OptimizationPhase1Config {
1258            objective_markdown: "improve output quality".to_string(),
1259            eval: OptimizationEvalSpec {
1260                pack_ref: "eval-pack.jsonl".to_string(),
1261                primary_metric: OptimizationMetricKind::ArtifactValidatorPassRate,
1262                secondary_metric: OptimizationMetricKind::UnmetRequirementCount,
1263                hard_guardrails: vec![
1264                    OptimizationGuardrailKind::BlockedNodeRate,
1265                    OptimizationGuardrailKind::BudgetCeilings,
1266                ],
1267                campaign_start_baseline_runs: 2,
1268                baseline_replay_every_candidates: 5,
1269                baseline_replay_every_minutes: 30,
1270            },
1271            mutation_policy: OptimizationMutationPolicy {
1272                max_nodes_changed_per_candidate: 1,
1273                max_field_families_changed_per_candidate: 1,
1274                allowed_text_fields: vec![
1275                    OptimizationMutableField::Objective,
1276                    OptimizationMutableField::OutputContractSummaryGuidance,
1277                ],
1278                allowed_knob_fields: vec![
1279                    OptimizationMutableField::TimeoutMs,
1280                    OptimizationMutableField::RetryPolicyMaxAttempts,
1281                    OptimizationMutableField::RetryPolicyRetries,
1282                ],
1283                max_text_delta_chars: 300,
1284                max_text_delta_ratio: 0.25,
1285                timeout_delta_percent: 0.15,
1286                timeout_delta_ms: 30_000,
1287                timeout_min_ms: 30_000,
1288                timeout_max_ms: 600_000,
1289                retry_delta: 1,
1290                retry_min: 0,
1291                retry_max: 3,
1292                allow_text_and_knob_bundle: false,
1293            },
1294            scope: OptimizationSafetyScope {
1295                candidate_snapshot_only: true,
1296                allow_live_source_mutation: false,
1297                allow_external_side_effects_in_eval: false,
1298                promotion_requires_operator_approval: true,
1299                forbidden_fields: vec!["agents".to_string()],
1300            },
1301            budget: OptimizationBudgetPolicy {
1302                max_experiments: 10,
1303                max_runtime_minutes: 60,
1304                max_consecutive_failures: 3,
1305                max_total_tokens: Some(50_000),
1306                max_total_cost_usd: Some(10.0),
1307            },
1308        }
1309    }
1310
1311    fn sample_workflow() -> AutomationV2Spec {
1312        AutomationV2Spec {
1313            automation_id: "wf-opt".to_string(),
1314            name: "Optimization Target".to_string(),
1315            description: Some("workflow".to_string()),
1316            status: crate::AutomationV2Status::Draft,
1317            schedule: crate::AutomationV2Schedule {
1318                schedule_type: crate::AutomationV2ScheduleType::Manual,
1319                cron_expression: None,
1320                interval_seconds: None,
1321                timezone: "UTC".to_string(),
1322                misfire_policy: crate::RoutineMisfirePolicy::Skip,
1323            },
1324            agents: vec![crate::AutomationAgentProfile {
1325                agent_id: "agent-1".to_string(),
1326                template_id: None,
1327                display_name: "Worker".to_string(),
1328                avatar_url: None,
1329                model_policy: None,
1330                skills: Vec::new(),
1331                tool_policy: crate::AutomationAgentToolPolicy {
1332                    allowlist: Vec::new(),
1333                    denylist: Vec::new(),
1334                },
1335                mcp_policy: crate::AutomationAgentMcpPolicy {
1336                    allowed_servers: Vec::new(),
1337                    allowed_tools: None,
1338                },
1339                approval_policy: None,
1340            }],
1341            flow: crate::AutomationFlowSpec {
1342                nodes: vec![crate::AutomationFlowNode {
1343                    node_id: "node-1".to_string(),
1344                    agent_id: "agent-1".to_string(),
1345                    objective: "Write a concise report for the user".to_string(),
1346                    depends_on: Vec::new(),
1347                    input_refs: Vec::new(),
1348                    output_contract: Some(crate::AutomationFlowOutputContract {
1349                        kind: "report".to_string(),
1350                        validator: Some(crate::AutomationOutputValidatorKind::ResearchBrief),
1351                        enforcement: None,
1352                        schema: None,
1353                        summary_guidance: Some("Summarize clearly.".to_string()),
1354                    }),
1355                    retry_policy: Some(json!({ "max_attempts": 1, "retries": 0 })),
1356                    timeout_ms: Some(60_000),
1357                    stage_kind: None,
1358                    gate: None,
1359                    metadata: None,
1360                }],
1361            },
1362            execution: crate::AutomationExecutionPolicy {
1363                max_parallel_agents: None,
1364                max_total_runtime_ms: None,
1365                max_total_tool_calls: None,
1366                max_total_tokens: None,
1367                max_total_cost_usd: None,
1368            },
1369            output_targets: Vec::new(),
1370            created_at_ms: 1,
1371            updated_at_ms: 1,
1372            creator_id: "test".to_string(),
1373            workspace_root: Some("/tmp/workflow".to_string()),
1374            metadata: None,
1375            next_fire_at_ms: None,
1376            last_fired_at_ms: None,
1377        }
1378    }
1379
1380    #[test]
1381    fn validate_phase1_candidate_accepts_single_objective_change() {
1382        let phase1 = sample_phase1();
1383        let baseline = sample_workflow();
1384        let mut candidate = baseline.clone();
1385        candidate.flow.nodes[0].objective = "Write a concise report for the team".to_string();
1386        let mutation =
1387            validate_phase1_candidate_mutation(&baseline, &candidate, &phase1).expect("valid");
1388        assert_eq!(mutation.node_id, "node-1");
1389        assert_eq!(mutation.field, OptimizationMutableField::Objective);
1390    }
1391
1392    #[test]
1393    fn validate_phase1_candidate_rejects_mutation_bundle() {
1394        let phase1 = sample_phase1();
1395        let baseline = sample_workflow();
1396        let mut candidate = baseline.clone();
1397        candidate.flow.nodes[0].objective = "Write a concise report for the team".to_string();
1398        candidate.flow.nodes[0].timeout_ms = Some(65_000);
1399        let error =
1400            validate_phase1_candidate_mutation(&baseline, &candidate, &phase1).expect_err("bundle");
1401        assert!(error.contains("one field family"));
1402    }
1403
1404    #[test]
1405    fn validate_phase1_candidate_rejects_oversize_text_delta() {
1406        let phase1 = sample_phase1();
1407        let baseline = sample_workflow();
1408        let mut candidate = baseline.clone();
1409        candidate.flow.nodes[0].objective = "x".repeat(400);
1410        let error = validate_phase1_candidate_mutation(&baseline, &candidate, &phase1)
1411            .expect_err("oversize");
1412        assert!(error.contains("max_text_delta_chars") || error.contains("max_text_delta_ratio"));
1413    }
1414
1415    #[test]
1416    fn evaluate_phase1_promotion_prefers_primary_metric() {
1417        let baseline = OptimizationPhase1Metrics {
1418            artifact_validator_pass_rate: 0.7,
1419            unmet_requirement_count: 2.0,
1420            blocked_node_rate: 0.1,
1421            budget_within_limits: true,
1422        };
1423        let candidate = OptimizationPhase1Metrics {
1424            artifact_validator_pass_rate: 0.8,
1425            unmet_requirement_count: 3.0,
1426            blocked_node_rate: 0.1,
1427            budget_within_limits: true,
1428        };
1429        let decision = evaluate_phase1_promotion(&baseline, &candidate);
1430        assert_eq!(
1431            decision.decision,
1432            OptimizationPromotionDecisionKind::Promote
1433        );
1434    }
1435
1436    #[test]
1437    fn evaluate_phase1_promotion_uses_secondary_metric_on_tie() {
1438        let baseline = OptimizationPhase1Metrics {
1439            artifact_validator_pass_rate: 0.8,
1440            unmet_requirement_count: 2.0,
1441            blocked_node_rate: 0.1,
1442            budget_within_limits: true,
1443        };
1444        let candidate = OptimizationPhase1Metrics {
1445            artifact_validator_pass_rate: 0.8,
1446            unmet_requirement_count: 1.0,
1447            blocked_node_rate: 0.1,
1448            budget_within_limits: true,
1449        };
1450        let decision = evaluate_phase1_promotion(&baseline, &candidate);
1451        assert_eq!(
1452            decision.decision,
1453            OptimizationPromotionDecisionKind::Promote
1454        );
1455    }
1456
1457    #[test]
1458    fn evaluate_phase1_promotion_rejects_guardrail_regression() {
1459        let baseline = OptimizationPhase1Metrics {
1460            artifact_validator_pass_rate: 0.8,
1461            unmet_requirement_count: 2.0,
1462            blocked_node_rate: 0.1,
1463            budget_within_limits: true,
1464        };
1465        let candidate = OptimizationPhase1Metrics {
1466            artifact_validator_pass_rate: 0.9,
1467            unmet_requirement_count: 1.0,
1468            blocked_node_rate: 0.2,
1469            budget_within_limits: true,
1470        };
1471        let decision = evaluate_phase1_promotion(&baseline, &candidate);
1472        assert_eq!(
1473            decision.decision,
1474            OptimizationPromotionDecisionKind::Discard
1475        );
1476    }
1477
1478    #[test]
1479    fn establish_phase1_baseline_averages_stable_replays() {
1480        let phase1 = sample_phase1();
1481        let replays = vec![
1482            OptimizationBaselineReplayRecord {
1483                replay_id: "replay-1".to_string(),
1484                automation_run_id: None,
1485                phase1_metrics: OptimizationPhase1Metrics {
1486                    artifact_validator_pass_rate: 0.8,
1487                    unmet_requirement_count: 1.0,
1488                    blocked_node_rate: 0.0,
1489                    budget_within_limits: true,
1490                },
1491                validator_case_outcomes: std::collections::BTreeMap::from([(
1492                    "node-1".to_string(),
1493                    "passed".to_string(),
1494                )]),
1495                experiment_count_at_recording: 0,
1496                recorded_at_ms: 1,
1497            },
1498            OptimizationBaselineReplayRecord {
1499                replay_id: "replay-2".to_string(),
1500                automation_run_id: None,
1501                phase1_metrics: OptimizationPhase1Metrics {
1502                    artifact_validator_pass_rate: 0.84,
1503                    unmet_requirement_count: 2.0,
1504                    blocked_node_rate: 0.02,
1505                    budget_within_limits: true,
1506                },
1507                validator_case_outcomes: std::collections::BTreeMap::from([(
1508                    "node-1".to_string(),
1509                    "passed".to_string(),
1510                )]),
1511                experiment_count_at_recording: 0,
1512                recorded_at_ms: 2,
1513            },
1514        ];
1515        let baseline = establish_phase1_baseline(&replays, &phase1).expect("stable");
1516        assert!((baseline.artifact_validator_pass_rate - 0.82).abs() < 1e-9);
1517        assert!((baseline.unmet_requirement_count - 1.5).abs() < 1e-9);
1518        assert!((baseline.blocked_node_rate - 0.01).abs() < 1e-9);
1519    }
1520
1521    #[test]
1522    fn establish_phase1_baseline_rejects_validator_drift() {
1523        let phase1 = sample_phase1();
1524        let replays = vec![
1525            OptimizationBaselineReplayRecord {
1526                replay_id: "replay-1".to_string(),
1527                automation_run_id: None,
1528                phase1_metrics: OptimizationPhase1Metrics {
1529                    artifact_validator_pass_rate: 0.8,
1530                    unmet_requirement_count: 1.0,
1531                    blocked_node_rate: 0.0,
1532                    budget_within_limits: true,
1533                },
1534                validator_case_outcomes: std::collections::BTreeMap::from([(
1535                    "node-1".to_string(),
1536                    "passed".to_string(),
1537                )]),
1538                experiment_count_at_recording: 0,
1539                recorded_at_ms: 1,
1540            },
1541            OptimizationBaselineReplayRecord {
1542                replay_id: "replay-2".to_string(),
1543                automation_run_id: None,
1544                phase1_metrics: OptimizationPhase1Metrics {
1545                    artifact_validator_pass_rate: 0.9,
1546                    unmet_requirement_count: 1.0,
1547                    blocked_node_rate: 0.0,
1548                    budget_within_limits: true,
1549                },
1550                validator_case_outcomes: std::collections::BTreeMap::from([(
1551                    "node-1".to_string(),
1552                    "passed".to_string(),
1553                )]),
1554                experiment_count_at_recording: 0,
1555                recorded_at_ms: 2,
1556            },
1557        ];
1558        let error = establish_phase1_baseline(&replays, &phase1).expect_err("drift");
1559        assert!(error.contains("artifact_validator_pass_rate"));
1560    }
1561
1562    #[test]
1563    fn establish_phase1_baseline_rejects_flaky_validator_cases() {
1564        let phase1 = sample_phase1();
1565        let replays = vec![
1566            OptimizationBaselineReplayRecord {
1567                replay_id: "replay-1".to_string(),
1568                automation_run_id: None,
1569                phase1_metrics: OptimizationPhase1Metrics {
1570                    artifact_validator_pass_rate: 1.0,
1571                    unmet_requirement_count: 0.0,
1572                    blocked_node_rate: 0.0,
1573                    budget_within_limits: true,
1574                },
1575                validator_case_outcomes: std::collections::BTreeMap::from([
1576                    ("node-1".to_string(), "passed".to_string()),
1577                    ("node-2".to_string(), "passed".to_string()),
1578                    ("node-3".to_string(), "passed".to_string()),
1579                    ("node-4".to_string(), "passed".to_string()),
1580                    ("node-5".to_string(), "passed".to_string()),
1581                    ("node-6".to_string(), "passed".to_string()),
1582                    ("node-7".to_string(), "passed".to_string()),
1583                    ("node-8".to_string(), "passed".to_string()),
1584                ]),
1585                experiment_count_at_recording: 0,
1586                recorded_at_ms: 1,
1587            },
1588            OptimizationBaselineReplayRecord {
1589                replay_id: "replay-2".to_string(),
1590                automation_run_id: None,
1591                phase1_metrics: OptimizationPhase1Metrics {
1592                    artifact_validator_pass_rate: 1.0,
1593                    unmet_requirement_count: 0.0,
1594                    blocked_node_rate: 0.0,
1595                    budget_within_limits: true,
1596                },
1597                validator_case_outcomes: std::collections::BTreeMap::from([
1598                    ("node-1".to_string(), "blocked".to_string()),
1599                    ("node-2".to_string(), "blocked".to_string()),
1600                    ("node-3".to_string(), "passed".to_string()),
1601                    ("node-4".to_string(), "passed".to_string()),
1602                    ("node-5".to_string(), "passed".to_string()),
1603                    ("node-6".to_string(), "passed".to_string()),
1604                    ("node-7".to_string(), "passed".to_string()),
1605                    ("node-8".to_string(), "passed".to_string()),
1606                ]),
1607                experiment_count_at_recording: 0,
1608                recorded_at_ms: 2,
1609            },
1610        ];
1611        let error = establish_phase1_baseline(&replays, &phase1).expect_err("flaky");
1612        assert!(error.contains("flakiness"));
1613    }
1614
1615    #[test]
1616    fn derive_phase1_metrics_from_run_uses_validator_outputs_and_budget() {
1617        let phase1 = sample_phase1();
1618        let workflow = sample_workflow();
1619        let run = AutomationV2RunRecord {
1620            run_id: "run-1".to_string(),
1621            automation_id: workflow.automation_id.clone(),
1622            trigger_type: "manual".to_string(),
1623            status: crate::AutomationRunStatus::Completed,
1624            created_at_ms: 1,
1625            updated_at_ms: 10_000,
1626            started_at_ms: Some(1_000),
1627            finished_at_ms: Some(9_000),
1628            active_session_ids: Vec::new(),
1629            latest_session_id: None,
1630            active_instance_ids: Vec::new(),
1631            checkpoint: crate::AutomationRunCheckpoint {
1632                completed_nodes: vec!["node-1".to_string()],
1633                pending_nodes: Vec::new(),
1634                node_outputs: std::collections::HashMap::from([
1635                    (
1636                        "node-1".to_string(),
1637                        json!({
1638                            "validator_summary": {
1639                                "outcome": "passed",
1640                                "unmet_requirements": []
1641                            }
1642                        }),
1643                    ),
1644                    (
1645                        "node-2".to_string(),
1646                        json!({
1647                            "validator_summary": {
1648                                "outcome": "blocked",
1649                                "unmet_requirements": ["citation_missing", "web_sources_reviewed_missing"]
1650                            }
1651                        }),
1652                    ),
1653                ]),
1654                node_attempts: std::collections::HashMap::new(),
1655                blocked_nodes: vec!["node-2".to_string()],
1656                awaiting_gate: None,
1657                gate_history: Vec::new(),
1658                lifecycle_history: Vec::new(),
1659                last_failure: None,
1660            },
1661            automation_snapshot: Some(workflow.clone()),
1662            pause_reason: None,
1663            resume_reason: None,
1664            detail: None,
1665            stop_kind: None,
1666            stop_reason: None,
1667            prompt_tokens: 0,
1668            completion_tokens: 0,
1669            total_tokens: 100,
1670            estimated_cost_usd: 0.5,
1671        };
1672        let metrics = derive_phase1_metrics_from_run(&run, &workflow, &phase1).expect("metrics");
1673        assert!((metrics.artifact_validator_pass_rate - 0.5).abs() < 1e-9);
1674        assert!((metrics.unmet_requirement_count - 2.0).abs() < 1e-9);
1675        assert!((metrics.blocked_node_rate - 1.0).abs() < 1e-9);
1676        assert!(metrics.budget_within_limits);
1677        let case_outcomes = derive_phase1_validator_case_outcomes_from_run(&run);
1678        assert_eq!(
1679            case_outcomes.get("node-1").map(String::as_str),
1680            Some("passed")
1681        );
1682        assert_eq!(
1683            case_outcomes.get("node-2").map(String::as_str),
1684            Some("blocked")
1685        );
1686    }
1687
1688    #[test]
1689    fn phase1_baseline_replay_due_requires_initial_replays() {
1690        let phase1 = sample_phase1();
1691        assert!(phase1_baseline_replay_due(&[], 0, &phase1, 0, 0));
1692        assert!(!phase1_baseline_replay_due(&[], 1, &phase1, 0, 0));
1693    }
1694
1695    #[test]
1696    fn phase1_baseline_replay_due_uses_candidate_and_time_intervals() {
1697        let phase1 = sample_phase1();
1698        let replays = vec![
1699            OptimizationBaselineReplayRecord {
1700                replay_id: "replay-1".to_string(),
1701                automation_run_id: None,
1702                phase1_metrics: OptimizationPhase1Metrics {
1703                    artifact_validator_pass_rate: 1.0,
1704                    unmet_requirement_count: 0.0,
1705                    blocked_node_rate: 0.0,
1706                    budget_within_limits: true,
1707                },
1708                validator_case_outcomes: std::collections::BTreeMap::from([(
1709                    "node-1".to_string(),
1710                    "passed".to_string(),
1711                )]),
1712                experiment_count_at_recording: 2,
1713                recorded_at_ms: 1_000,
1714            },
1715            OptimizationBaselineReplayRecord {
1716                replay_id: "replay-2".to_string(),
1717                automation_run_id: None,
1718                phase1_metrics: OptimizationPhase1Metrics {
1719                    artifact_validator_pass_rate: 1.0,
1720                    unmet_requirement_count: 0.0,
1721                    blocked_node_rate: 0.0,
1722                    budget_within_limits: true,
1723                },
1724                validator_case_outcomes: std::collections::BTreeMap::from([(
1725                    "node-1".to_string(),
1726                    "passed".to_string(),
1727                )]),
1728                experiment_count_at_recording: 2,
1729                recorded_at_ms: 1_500,
1730            },
1731        ];
1732        assert!(phase1_baseline_replay_due(&replays, 0, &phase1, 7, 2_000));
1733        assert!(phase1_baseline_replay_due(
1734            &replays, 0, &phase1, 3, 1_801_500
1735        ));
1736        assert!(!phase1_baseline_replay_due(&replays, 0, &phase1, 3, 2_000));
1737    }
1738}