1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use sha2::{Digest, Sha256};
4use std::path::{Path, PathBuf};
5
6use crate::{AutomationV2RunRecord, AutomationV2Spec};
7
/// Kind of target an optimization campaign operates on.
///
/// Serialized with serde's `snake_case` renaming. Only one variant is
/// defined in this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationTargetKind {
    /// Prompt/objective optimization of a workflow v2 automation
    /// (reading of the variant name; see `OptimizationCampaignRecord`).
    WorkflowV2PromptObjectiveOptimization,
}
13
/// Lifecycle status stored on an [`OptimizationCampaignRecord`].
///
/// Serialized with serde's `snake_case` renaming. The transitions between
/// states are not defined in this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationCampaignStatus {
    Draft,
    Running,
    // NOTE(review): presumably paired with
    // `OptimizationCampaignRecord::pending_promotion_experiment_id` — confirm.
    AwaitingPromotionApproval,
    // The three `Paused*` variants distinguish why a campaign is paused
    // (manual, budget, unstable evaluator) going by their names; the pause
    // logic itself lives elsewhere.
    PausedManual,
    PausedBudget,
    PausedEvaluatorUnstable,
    Completed,
    Failed,
}
26
/// Lifecycle status stored on an [`OptimizationExperimentRecord`].
///
/// Serialized with serde's `snake_case` renaming. The transitions between
/// states are not defined in this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationExperimentStatus {
    Draft,
    Completed,
    PromotionRecommended,
    PromotionApproved,
    PromotionRejected,
    Discarded,
    Failed,
}
38
/// Unresolved references to the artifacts that configure a campaign.
///
/// The five required refs line up with the five entries of
/// [`OptimizationFrozenArtifacts`]; the two optional refs are omitted from
/// serialized output when `None` and default to `None` when absent on input.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OptimizationArtifactRefs {
    pub objective_ref: String,
    pub eval_ref: String,
    pub mutation_policy_ref: String,
    pub scope_ref: String,
    pub budget_ref: String,
    /// Optional reference; how it is consumed is not visible in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub research_log_ref: Option<String>,
    /// Optional reference; how it is consumed is not visible in this file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub summary_ref: Option<String>,
}
51
/// Snapshot of a single artifact file, as produced by
/// `freeze_optimization_artifact`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationFrozenArtifact {
    /// The artifact reference as supplied by the caller, whitespace-trimmed.
    pub artifact_ref: String,
    /// Path the reference resolved to (canonicalized when possible); this is
    /// the path later re-read by `read_optimization_artifact_text`.
    pub resolved_path: String,
    /// Lowercase hex SHA-256 digest of the file contents at freeze time.
    pub sha256: String,
    /// Size of the artifact file in bytes at freeze time.
    pub size_bytes: u64,
}
59
/// The five frozen artifacts consumed by `load_optimization_phase1_config`.
///
/// `objective` is read as plain text; the other four are parsed as YAML into
/// their respective spec/policy structs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationFrozenArtifacts {
    pub objective: OptimizationFrozenArtifact,
    pub eval: OptimizationFrozenArtifact,
    pub mutation_policy: OptimizationFrozenArtifact,
    pub scope: OptimizationFrozenArtifact,
    pub budget: OptimizationFrozenArtifact,
}
68
/// Provider/model pair that `apply_optimization_execution_override` writes
/// into every agent's model policy (as `default_model` and over each existing
/// `role_models` entry).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct OptimizationExecutionOverride {
    pub provider_id: String,
    pub model_id: String,
}
74
/// Metric identifiers usable in an [`OptimizationEvalSpec`].
///
/// Serialized with serde's `snake_case` renaming. Phase 1 validation requires
/// `ArtifactValidatorPassRate` as the primary metric and
/// `UnmetRequirementCount` as the secondary metric.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationMetricKind {
    ArtifactValidatorPassRate,
    UnmetRequirementCount,
}
81
/// Hard guardrail identifiers usable in an [`OptimizationEvalSpec`].
///
/// Serialized with serde's `snake_case` renaming. Phase 1 validation requires
/// `hard_guardrails` to contain exactly these two variants.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationGuardrailKind {
    BlockedNodeRate,
    BudgetCeilings,
}
88
/// Workflow node fields that a candidate is allowed to mutate.
///
/// Serialized with serde's `snake_case` renaming. Phase 1 policy validation
/// treats `Objective` and `OutputContractSummaryGuidance` as text fields and
/// the remaining three variants as knob fields.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationMutableField {
    /// The node's `objective` text.
    Objective,
    /// The node's `output_contract.summary_guidance` text.
    OutputContractSummaryGuidance,
    /// The node's `timeout_ms` value.
    TimeoutMs,
    /// The integer `max_attempts` key of the node's `retry_policy` object.
    RetryPolicyMaxAttempts,
    /// The integer `retries` key of the node's `retry_policy` object.
    RetryPolicyRetries,
}
98
/// Evaluation settings parsed from the `eval` YAML artifact.
///
/// `validate_phase1_eval_spec` pins most of these to fixed values for
/// phase 1; the accepted values are noted per field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationEvalSpec {
    /// Must be non-blank.
    pub pack_ref: String,
    /// Phase 1: must be `ArtifactValidatorPassRate`.
    pub primary_metric: OptimizationMetricKind,
    /// Phase 1: must be `UnmetRequirementCount`.
    pub secondary_metric: OptimizationMetricKind,
    /// Phase 1: must be exactly `BlockedNodeRate` and `BudgetCeilings`.
    /// Defaults to empty when absent on input (which then fails validation).
    #[serde(default)]
    pub hard_guardrails: Vec<OptimizationGuardrailKind>,
    /// Phase 1: must be 2.
    pub campaign_start_baseline_runs: u32,
    /// Phase 1: must be 5.
    pub baseline_replay_every_candidates: u32,
    /// Phase 1: must be 30.
    pub baseline_replay_every_minutes: u32,
}
110
/// Limits on what a single candidate may change, parsed from the
/// `mutation_policy` YAML artifact.
///
/// The per-field ranges quoted below are the ones enforced by
/// `validate_phase1_mutation_policy`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationMutationPolicy {
    /// Phase 1: must be 1.
    pub max_nodes_changed_per_candidate: u32,
    /// Phase 1: must be 1.
    pub max_field_families_changed_per_candidate: u32,
    /// Phase 1: only `Objective` / `OutputContractSummaryGuidance`.
    #[serde(default)]
    pub allowed_text_fields: Vec<OptimizationMutableField>,
    /// Phase 1: only `TimeoutMs` / `RetryPolicyMaxAttempts` /
    /// `RetryPolicyRetries`.
    #[serde(default)]
    pub allowed_knob_fields: Vec<OptimizationMutableField>,
    /// Phase 1: 1..=300. Compared against `text_delta_chars`.
    pub max_text_delta_chars: u32,
    /// Phase 1: > 0 and <= 0.25. Ratio of changed chars to baseline length.
    pub max_text_delta_ratio: f64,
    /// Phase 1: > 0 and <= 0.15. Used as a fraction of the baseline timeout.
    pub timeout_delta_percent: f64,
    /// Phase 1: 1..=30000. Absolute cap on a timeout change, in ms.
    pub timeout_delta_ms: u64,
    /// Phase 1: >= 30000 and strictly below `timeout_max_ms`.
    pub timeout_min_ms: u64,
    /// Phase 1: <= 600000.
    pub timeout_max_ms: u64,
    /// Phase 1: must be 1.
    pub retry_delta: i32,
    /// Phase 1: >= 0 and <= `retry_max`.
    pub retry_min: i32,
    /// Phase 1: <= 3.
    pub retry_max: i32,
    /// Phase 1: must be `false`. Defaults to `false` when absent on input.
    #[serde(default)]
    pub allow_text_and_knob_bundle: bool,
}
131
/// Safety switches parsed from the `scope` YAML artifact.
///
/// `validate_phase1_scope` accepts exactly one combination of the four
/// booleans, noted per field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationSafetyScope {
    /// Phase 1: must be `true`.
    pub candidate_snapshot_only: bool,
    /// Phase 1: must be `false`.
    pub allow_live_source_mutation: bool,
    /// Phase 1: must be `false`.
    pub allow_external_side_effects_in_eval: bool,
    /// Phase 1: must be `true`.
    pub promotion_requires_operator_approval: bool,
    /// Not inspected by any validator visible in this file.
    #[serde(default)]
    pub forbidden_fields: Vec<String>,
}
141
/// Budget ceilings parsed from the `budget` YAML artifact.
///
/// Ranges quoted below are the ones enforced by `validate_phase1_budget`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationBudgetPolicy {
    /// Phase 1: 1..=100.
    pub max_experiments: u32,
    /// Phase 1: 1..=1440.
    pub max_runtime_minutes: u32,
    /// Phase 1: 1..=10.
    pub max_consecutive_failures: u32,
    /// Optional; when present it must be non-zero.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_total_tokens: Option<u64>,
    /// Optional; when present it must be finite and strictly positive.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_total_cost_usd: Option<f64>,
}
152
/// Fully loaded and validated phase 1 configuration, as returned by
/// `load_optimization_phase1_config`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationPhase1Config {
    /// Trimmed text of the objective artifact (never blank when produced by
    /// the loader).
    pub objective_markdown: String,
    pub eval: OptimizationEvalSpec,
    pub mutation_policy: OptimizationMutationPolicy,
    pub scope: OptimizationSafetyScope,
    pub budget: OptimizationBudgetPolicy,
}
161
/// Typed phase 1 metric values for a run.
///
/// The first two fields share names with the [`OptimizationMetricKind`]
/// variants and the last two with the [`OptimizationGuardrailKind`] variants.
// NOTE(review): value ranges (e.g. whether rates are 0..=1) are not
// established by the code visible here — confirm against the producer.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct OptimizationPhase1Metrics {
    pub artifact_validator_pass_rate: f64,
    pub unmet_requirement_count: f64,
    pub blocked_node_rate: f64,
    pub budget_within_limits: bool,
}
169
/// One recorded baseline replay, stored in
/// `OptimizationCampaignRecord::baseline_replays`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct OptimizationBaselineReplayRecord {
    pub replay_id: String,
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub automation_run_id: Option<String>,
    pub phase1_metrics: OptimizationPhase1Metrics,
    /// String-to-string map, empty when absent on input.
    // NOTE(review): presumably validator case id -> outcome label; confirm.
    #[serde(default)]
    pub validator_case_outcomes: std::collections::BTreeMap<String, String>,
    /// Defaults to 0 when absent on input.
    #[serde(default)]
    pub experiment_count_at_recording: u64,
    /// Timestamp in milliseconds (epoch not established in this file).
    pub recorded_at_ms: u64,
}
182
/// Outcome carried by an [`OptimizationPromotionDecision`].
///
/// Serialized with serde's `snake_case` renaming.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationPromotionDecisionKind {
    Promote,
    Discard,
    NeedsOperatorReview,
}
190
/// A promotion decision together with its free-form justification, stored on
/// `OptimizationExperimentRecord::promotion_decision`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct OptimizationPromotionDecision {
    pub decision: OptimizationPromotionDecisionKind,
    pub reason: String,
}
196
/// Description of the single change a candidate made, returned by the
/// phase 1 mutation validators once the change has passed all checks.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct OptimizationValidatedMutation {
    /// Id of the workflow node that was changed.
    pub node_id: String,
    /// Which mutable field on that node was changed.
    pub field: OptimizationMutableField,
    /// Human-readable one-line summary built by the validator
    /// (starts with "mutate node").
    pub summary: String,
}
203
/// Serializable record describing one optimization campaign.
///
/// Optional fields are omitted from serialized output when `None`; list
/// fields default to empty when absent on input.
// NOTE(review): the `*_hash` fields presumably hold values produced by
// `optimization_snapshot_hash` for the neighbouring snapshot — confirm at the
// construction site, which is not part of this file section.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationCampaignRecord {
    pub optimization_id: String,
    pub name: String,
    pub target_kind: OptimizationTargetKind,
    pub status: OptimizationCampaignStatus,
    pub source_workflow_id: String,
    pub source_workflow_name: String,
    pub source_workflow_snapshot: AutomationV2Spec,
    pub source_workflow_snapshot_hash: String,
    pub baseline_snapshot: AutomationV2Spec,
    pub baseline_snapshot_hash: String,
    /// Optional fixed provider/model pair; see
    /// `apply_optimization_execution_override`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub execution_override: Option<OptimizationExecutionOverride>,
    /// Artifact references as supplied.
    pub artifacts: OptimizationArtifactRefs,
    /// Resolved and hashed snapshots of the five required artifacts.
    pub frozen_artifacts: OptimizationFrozenArtifacts,
    /// Loaded phase 1 configuration, when available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub phase1: Option<OptimizationPhase1Config>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub baseline_metrics: Option<OptimizationPhase1Metrics>,
    #[serde(default)]
    pub baseline_replays: Vec<OptimizationBaselineReplayRecord>,
    #[serde(default)]
    pub pending_baseline_run_ids: Vec<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pending_promotion_experiment_id: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_pause_reason: Option<String>,
    /// Timestamps in milliseconds (epoch not established in this file).
    pub created_at_ms: u64,
    pub updated_at_ms: u64,
    /// Free-form JSON metadata.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Value>,
}
237
/// Serializable record describing one experiment (candidate) inside a
/// campaign.
///
/// Optional fields are omitted from serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationExperimentRecord {
    pub experiment_id: String,
    /// Id of the owning campaign (matches
    /// `OptimizationCampaignRecord::optimization_id` by name).
    pub optimization_id: String,
    pub status: OptimizationExperimentStatus,
    pub candidate_snapshot: AutomationV2Spec,
    pub candidate_snapshot_hash: String,
    pub baseline_snapshot_hash: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub mutation_summary: Option<String>,
    /// Untyped metrics payload as JSON.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub metrics: Option<Value>,
    /// Typed phase 1 metrics.
    // NOTE(review): presumably derived from `metrics` via
    // `parse_phase1_metrics` — confirm at the call site.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub phase1_metrics: Option<OptimizationPhase1Metrics>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub promotion_recommendation: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub promotion_decision: Option<OptimizationPromotionDecision>,
    /// Timestamps in milliseconds (epoch not established in this file).
    pub created_at_ms: u64,
    pub updated_at_ms: u64,
    /// Free-form JSON metadata.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Value>,
}
261
262pub fn optimization_snapshot_hash(snapshot: &AutomationV2Spec) -> String {
263 let canonical = serde_json::to_vec(snapshot).unwrap_or_default();
264 let mut hasher = Sha256::new();
265 hasher.update(canonical);
266 format!("{:x}", hasher.finalize())
267}
268
269pub fn apply_optimization_execution_override(
270 workflow: &AutomationV2Spec,
271 execution_override: &OptimizationExecutionOverride,
272) -> AutomationV2Spec {
273 let mut snapshot = workflow.clone();
274 for agent in &mut snapshot.agents {
275 let mut policy = agent
276 .model_policy
277 .clone()
278 .and_then(|value| value.as_object().cloned())
279 .unwrap_or_default();
280 let fixed_model = serde_json::json!({
281 "provider_id": execution_override.provider_id,
282 "model_id": execution_override.model_id,
283 });
284 policy.insert("default_model".to_string(), fixed_model.clone());
285 if let Some(role_models) = policy.get_mut("role_models").and_then(Value::as_object_mut) {
286 for role_model in role_models.values_mut() {
287 *role_model = fixed_model.clone();
288 }
289 }
290 agent.model_policy = Some(Value::Object(policy));
291 }
292 snapshot
293}
294
295pub fn freeze_optimization_artifact(
296 workspace_root: &str,
297 artifact_ref: &str,
298) -> Result<OptimizationFrozenArtifact, String> {
299 let trimmed = artifact_ref.trim();
300 if trimmed.is_empty() {
301 return Err("artifact ref is required".to_string());
302 }
303 let workspace = PathBuf::from(workspace_root);
304 let candidate = PathBuf::from(trimmed);
305 let resolved = if candidate.is_absolute() {
306 candidate
307 } else {
308 workspace.join(candidate)
309 };
310 if !tandem_core::is_within_workspace_root(&resolved, &workspace) {
311 return Err(format!(
312 "artifact `{trimmed}` must stay within workspace root `{workspace_root}`"
313 ));
314 }
315 let metadata = std::fs::metadata(&resolved)
316 .map_err(|_| format!("artifact `{trimmed}` does not exist or is unreadable"))?;
317 if !metadata.is_file() {
318 return Err(format!("artifact `{trimmed}` must be a file"));
319 }
320 let bytes =
321 std::fs::read(&resolved).map_err(|_| format!("artifact `{trimmed}` could not be read"))?;
322 let mut hasher = Sha256::new();
323 hasher.update(&bytes);
324 let resolved_path = resolved
325 .canonicalize()
326 .unwrap_or_else(|_| Path::new(&resolved).to_path_buf());
327 Ok(OptimizationFrozenArtifact {
328 artifact_ref: trimmed.to_string(),
329 resolved_path: resolved_path.to_string_lossy().to_string(),
330 sha256: format!("{:x}", hasher.finalize()),
331 size_bytes: metadata.len(),
332 })
333}
334
335fn read_optimization_artifact_text(
336 artifact: &OptimizationFrozenArtifact,
337 label: &str,
338) -> Result<String, String> {
339 std::fs::read_to_string(&artifact.resolved_path).map_err(|_| {
340 format!(
341 "{label} artifact `{}` could not be read as UTF-8 text",
342 artifact.artifact_ref
343 )
344 })
345}
346
347fn parse_yaml_artifact<T: for<'de> Deserialize<'de>>(
348 artifact: &OptimizationFrozenArtifact,
349 label: &str,
350) -> Result<T, String> {
351 let raw = read_optimization_artifact_text(artifact, label)?;
352 serde_yaml::from_str::<T>(&raw).map_err(|error| {
353 format!(
354 "failed to parse {label} artifact `{}`: {error}",
355 artifact.artifact_ref
356 )
357 })
358}
359
360fn validate_phase1_eval_spec(eval: &OptimizationEvalSpec) -> Result<(), String> {
361 if eval.pack_ref.trim().is_empty() {
362 return Err("eval pack_ref is required".to_string());
363 }
364 if eval.primary_metric != OptimizationMetricKind::ArtifactValidatorPassRate {
365 return Err("phase 1 eval.primary_metric must be artifact_validator_pass_rate".to_string());
366 }
367 if eval.secondary_metric != OptimizationMetricKind::UnmetRequirementCount {
368 return Err("phase 1 eval.secondary_metric must be unmet_requirement_count".to_string());
369 }
370 if eval.hard_guardrails.len() != 2
371 || !eval
372 .hard_guardrails
373 .contains(&OptimizationGuardrailKind::BlockedNodeRate)
374 || !eval
375 .hard_guardrails
376 .contains(&OptimizationGuardrailKind::BudgetCeilings)
377 {
378 return Err(
379 "phase 1 eval.hard_guardrails must be exactly blocked_node_rate and budget_ceilings"
380 .to_string(),
381 );
382 }
383 if eval.campaign_start_baseline_runs != 2 {
384 return Err("phase 1 campaign_start_baseline_runs must be 2".to_string());
385 }
386 if eval.baseline_replay_every_candidates != 5 {
387 return Err("phase 1 baseline_replay_every_candidates must be 5".to_string());
388 }
389 if eval.baseline_replay_every_minutes != 30 {
390 return Err("phase 1 baseline_replay_every_minutes must be 30".to_string());
391 }
392 Ok(())
393}
394
395fn validate_phase1_mutation_policy(policy: &OptimizationMutationPolicy) -> Result<(), String> {
396 if policy.max_nodes_changed_per_candidate != 1 {
397 return Err("phase 1 max_nodes_changed_per_candidate must be 1".to_string());
398 }
399 if policy.max_field_families_changed_per_candidate != 1 {
400 return Err("phase 1 max_field_families_changed_per_candidate must be 1".to_string());
401 }
402 if policy.allowed_text_fields.is_empty() && policy.allowed_knob_fields.is_empty() {
403 return Err("phase 1 mutation policy must allow at least one mutable field".to_string());
404 }
405 if policy.allowed_text_fields.iter().any(|field| {
406 !matches!(
407 field,
408 OptimizationMutableField::Objective
409 | OptimizationMutableField::OutputContractSummaryGuidance
410 )
411 }) {
412 return Err(
413 "phase 1 allowed_text_fields may only include objective or output_contract_summary_guidance"
414 .to_string(),
415 );
416 }
417 if policy.allowed_knob_fields.iter().any(|field| {
418 !matches!(
419 field,
420 OptimizationMutableField::TimeoutMs
421 | OptimizationMutableField::RetryPolicyMaxAttempts
422 | OptimizationMutableField::RetryPolicyRetries
423 )
424 }) {
425 return Err(
426 "phase 1 allowed_knob_fields may only include timeout_ms, retry_policy_max_attempts, or retry_policy_retries"
427 .to_string(),
428 );
429 }
430 if policy.max_text_delta_chars == 0 || policy.max_text_delta_chars > 300 {
431 return Err("phase 1 max_text_delta_chars must be between 1 and 300".to_string());
432 }
433 if !(0.0 < policy.max_text_delta_ratio && policy.max_text_delta_ratio <= 0.25) {
434 return Err("phase 1 max_text_delta_ratio must be > 0 and <= 0.25".to_string());
435 }
436 if !(0.0 < policy.timeout_delta_percent && policy.timeout_delta_percent <= 0.15) {
437 return Err("phase 1 timeout_delta_percent must be > 0 and <= 0.15".to_string());
438 }
439 if policy.timeout_delta_ms == 0 || policy.timeout_delta_ms > 30_000 {
440 return Err("phase 1 timeout_delta_ms must be between 1 and 30000".to_string());
441 }
442 if policy.timeout_min_ms < 30_000
443 || policy.timeout_max_ms > 600_000
444 || policy.timeout_min_ms >= policy.timeout_max_ms
445 {
446 return Err(
447 "phase 1 timeout bounds must stay within 30000..600000 ms and min < max".to_string(),
448 );
449 }
450 if policy.retry_delta <= 0 || policy.retry_delta > 1 {
451 return Err("phase 1 retry_delta must be 1".to_string());
452 }
453 if policy.retry_min < 0 || policy.retry_max > 3 || policy.retry_min > policy.retry_max {
454 return Err("phase 1 retry bounds must stay within 0..3".to_string());
455 }
456 if policy.allow_text_and_knob_bundle {
457 return Err("phase 1 may not bundle text and knob mutations".to_string());
458 }
459 Ok(())
460}
461
462fn validate_phase1_scope(scope: &OptimizationSafetyScope) -> Result<(), String> {
463 if !scope.candidate_snapshot_only {
464 return Err("phase 1 scope must require candidate_snapshot_only".to_string());
465 }
466 if scope.allow_live_source_mutation {
467 return Err("phase 1 scope must forbid live source mutation".to_string());
468 }
469 if scope.allow_external_side_effects_in_eval {
470 return Err("phase 1 scope must forbid external side effects in eval".to_string());
471 }
472 if !scope.promotion_requires_operator_approval {
473 return Err("phase 1 scope must require operator approval for promotion".to_string());
474 }
475 Ok(())
476}
477
478fn validate_phase1_budget(budget: &OptimizationBudgetPolicy) -> Result<(), String> {
479 if budget.max_experiments == 0 || budget.max_experiments > 100 {
480 return Err("phase 1 budget.max_experiments must be between 1 and 100".to_string());
481 }
482 if budget.max_runtime_minutes == 0 || budget.max_runtime_minutes > 1_440 {
483 return Err("phase 1 budget.max_runtime_minutes must be between 1 and 1440".to_string());
484 }
485 if budget.max_consecutive_failures == 0 || budget.max_consecutive_failures > 10 {
486 return Err("phase 1 budget.max_consecutive_failures must be between 1 and 10".to_string());
487 }
488 if let Some(max_cost_usd) = budget.max_total_cost_usd {
489 if !max_cost_usd.is_finite() || max_cost_usd <= 0.0 {
490 return Err("phase 1 budget.max_total_cost_usd must be positive".to_string());
491 }
492 }
493 if let Some(max_total_tokens) = budget.max_total_tokens {
494 if max_total_tokens == 0 {
495 return Err("phase 1 budget.max_total_tokens must be positive".to_string());
496 }
497 }
498 Ok(())
499}
500
501pub fn load_optimization_phase1_config(
502 frozen_artifacts: &OptimizationFrozenArtifacts,
503) -> Result<OptimizationPhase1Config, String> {
504 let objective_markdown =
505 read_optimization_artifact_text(&frozen_artifacts.objective, "objective")?;
506 if objective_markdown.trim().is_empty() {
507 return Err("objective artifact must not be empty".to_string());
508 }
509 let eval = parse_yaml_artifact::<OptimizationEvalSpec>(&frozen_artifacts.eval, "eval")?;
510 let mutation_policy = parse_yaml_artifact::<OptimizationMutationPolicy>(
511 &frozen_artifacts.mutation_policy,
512 "mutation policy",
513 )?;
514 let scope = parse_yaml_artifact::<OptimizationSafetyScope>(&frozen_artifacts.scope, "scope")?;
515 let budget =
516 parse_yaml_artifact::<OptimizationBudgetPolicy>(&frozen_artifacts.budget, "budget")?;
517 validate_phase1_eval_spec(&eval)?;
518 validate_phase1_mutation_policy(&mutation_policy)?;
519 validate_phase1_scope(&scope)?;
520 validate_phase1_budget(&budget)?;
521 Ok(OptimizationPhase1Config {
522 objective_markdown: objective_markdown.trim().to_string(),
523 eval,
524 mutation_policy,
525 scope,
526 budget,
527 })
528}
529
530fn workflow_has_simple_retry_field(
531 workflow: &AutomationV2Spec,
532 field: OptimizationMutableField,
533) -> bool {
534 workflow.flow.nodes.iter().any(|node| {
535 node.retry_policy
536 .as_ref()
537 .and_then(Value::as_object)
538 .and_then(|obj| match field {
539 OptimizationMutableField::RetryPolicyMaxAttempts => obj.get("max_attempts"),
540 OptimizationMutableField::RetryPolicyRetries => obj.get("retries"),
541 _ => None,
542 })
543 .and_then(Value::as_i64)
544 .is_some()
545 })
546}
547
548pub fn validate_phase1_workflow_target(
549 workflow: &AutomationV2Spec,
550 phase1: &OptimizationPhase1Config,
551) -> Result<(), String> {
552 if workflow.flow.nodes.is_empty() {
553 return Err("phase 1 workflow target must contain at least one node".to_string());
554 }
555 if !workflow.flow.nodes.iter().any(|node| {
556 node.output_contract
557 .as_ref()
558 .and_then(|contract| contract.validator)
559 .is_some()
560 }) {
561 return Err(
562 "phase 1 workflow target must contain at least one validator-backed output contract"
563 .to_string(),
564 );
565 }
566 let has_objective_field = phase1
567 .mutation_policy
568 .allowed_text_fields
569 .contains(&OptimizationMutableField::Objective);
570 let has_summary_guidance_field = phase1
571 .mutation_policy
572 .allowed_text_fields
573 .contains(&OptimizationMutableField::OutputContractSummaryGuidance)
574 && workflow.flow.nodes.iter().any(|node| {
575 node.output_contract
576 .as_ref()
577 .and_then(|contract| contract.summary_guidance.as_ref())
578 .is_some()
579 });
580 let has_timeout_field = phase1
581 .mutation_policy
582 .allowed_knob_fields
583 .contains(&OptimizationMutableField::TimeoutMs)
584 && workflow
585 .flow
586 .nodes
587 .iter()
588 .any(|node| node.timeout_ms.is_some());
589 let has_retry_field = phase1
590 .mutation_policy
591 .allowed_knob_fields
592 .iter()
593 .copied()
594 .filter(|field| {
595 matches!(
596 field,
597 OptimizationMutableField::RetryPolicyMaxAttempts
598 | OptimizationMutableField::RetryPolicyRetries
599 )
600 })
601 .any(|field| workflow_has_simple_retry_field(workflow, field));
602 if !(has_objective_field || has_summary_guidance_field || has_timeout_field || has_retry_field)
603 {
604 return Err(
605 "phase 1 workflow target does not expose any mutable fields allowed by the mutation policy"
606 .to_string(),
607 );
608 }
609 Ok(())
610}
611
612fn mutable_field_label(field: OptimizationMutableField) -> &'static str {
613 match field {
614 OptimizationMutableField::Objective => "objective",
615 OptimizationMutableField::OutputContractSummaryGuidance => {
616 "output_contract.summary_guidance"
617 }
618 OptimizationMutableField::TimeoutMs => "timeout_ms",
619 OptimizationMutableField::RetryPolicyMaxAttempts => "retry_policy.max_attempts",
620 OptimizationMutableField::RetryPolicyRetries => "retry_policy.retries",
621 }
622}
623
624fn json_value<T: Serialize>(value: &T) -> Value {
625 serde_json::to_value(value).unwrap_or(Value::Null)
626}
627
628fn normalized_workflow_without_flow(snapshot: &AutomationV2Spec) -> Value {
629 let mut value = json_value(snapshot);
630 if let Some(obj) = value.as_object_mut() {
631 obj.remove("flow");
632 }
633 value
634}
635
636fn normalized_node_static_fields(node: &crate::AutomationFlowNode) -> Value {
637 let mut value = json_value(node);
638 if let Some(obj) = value.as_object_mut() {
639 obj.remove("objective");
640 obj.remove("output_contract");
641 obj.remove("retry_policy");
642 obj.remove("timeout_ms");
643 }
644 value
645}
646
647fn normalized_output_contract(contract: &Option<crate::AutomationFlowOutputContract>) -> Value {
648 let mut value = json_value(contract);
649 if let Some(obj) = value.as_object_mut() {
650 obj.remove("summary_guidance");
651 }
652 value
653}
654
655fn normalized_retry_policy(policy: &Option<Value>) -> Value {
656 let mut value = policy.clone().unwrap_or(Value::Null);
657 if let Some(obj) = value.as_object_mut() {
658 obj.remove("max_attempts");
659 obj.remove("retries");
660 }
661 value
662}
663
664fn retry_field_value(policy: &Option<Value>, field: OptimizationMutableField) -> Option<i64> {
665 policy
666 .as_ref()
667 .and_then(Value::as_object)
668 .and_then(|obj| match field {
669 OptimizationMutableField::RetryPolicyMaxAttempts => obj.get("max_attempts"),
670 OptimizationMutableField::RetryPolicyRetries => obj.get("retries"),
671 _ => None,
672 })
673 .and_then(Value::as_i64)
674}
675
/// Size of the edit between two strings, in Unicode scalar values.
///
/// The longest common prefix and then the longest common suffix of what
/// remains are discarded; the result is the number of characters left over
/// in `before` plus the number left over in `after`. A replaced character
/// therefore counts twice (once removed, once added).
fn text_delta_chars(before: &str, after: &str) -> usize {
    let old: Vec<char> = before.chars().collect();
    let new: Vec<char> = after.chars().collect();

    let shared_prefix = old
        .iter()
        .zip(new.iter())
        .take_while(|(left, right)| left == right)
        .count();

    // The suffix is measured on the remainders so it never overlaps the prefix.
    let old_rest = &old[shared_prefix..];
    let new_rest = &new[shared_prefix..];
    let shared_suffix = old_rest
        .iter()
        .rev()
        .zip(new_rest.iter().rev())
        .take_while(|(left, right)| left == right)
        .count();

    (old_rest.len() - shared_suffix) + (new_rest.len() - shared_suffix)
}
697
698fn validate_text_mutation(
699 node_id: &str,
700 field: OptimizationMutableField,
701 before: &str,
702 after: &str,
703 policy: &OptimizationMutationPolicy,
704) -> Result<OptimizationValidatedMutation, String> {
705 if after.trim().is_empty() {
706 return Err(format!(
707 "node `{node_id}` {} must not become empty",
708 mutable_field_label(field)
709 ));
710 }
711 let delta_chars = text_delta_chars(before, after);
712 let baseline_len = before.chars().count().max(1);
713 let delta_ratio = delta_chars as f64 / baseline_len as f64;
714 if delta_chars == 0 {
715 return Err(format!(
716 "node `{node_id}` {} must change",
717 mutable_field_label(field)
718 ));
719 }
720 if delta_chars > policy.max_text_delta_chars as usize {
721 return Err(format!(
722 "node `{node_id}` {} exceeds phase 1 max_text_delta_chars",
723 mutable_field_label(field)
724 ));
725 }
726 if delta_ratio > policy.max_text_delta_ratio {
727 return Err(format!(
728 "node `{node_id}` {} exceeds phase 1 max_text_delta_ratio",
729 mutable_field_label(field)
730 ));
731 }
732 Ok(OptimizationValidatedMutation {
733 node_id: node_id.to_string(),
734 field,
735 summary: format!(
736 "mutate node `{node_id}` {} delta_chars={} delta_ratio={delta_ratio:.3}",
737 mutable_field_label(field),
738 delta_chars
739 ),
740 })
741}
742
743fn validate_timeout_mutation(
744 node_id: &str,
745 before: Option<u64>,
746 after: Option<u64>,
747 policy: &OptimizationMutationPolicy,
748) -> Result<OptimizationValidatedMutation, String> {
749 let before = before.ok_or_else(|| {
750 format!("node `{node_id}` timeout_ms is not mutable in phase 1 because it is absent")
751 })?;
752 let after = after
753 .ok_or_else(|| format!("node `{node_id}` timeout_ms may not be removed in phase 1"))?;
754 if after < policy.timeout_min_ms || after > policy.timeout_max_ms {
755 return Err(format!(
756 "node `{node_id}` timeout_ms must stay within {}..={} ms",
757 policy.timeout_min_ms, policy.timeout_max_ms
758 ));
759 }
760 let delta = after.abs_diff(before);
761 let allowed_percent_delta = ((before as f64) * policy.timeout_delta_percent).ceil() as u64;
762 if delta == 0 {
763 return Err(format!("node `{node_id}` timeout_ms must change"));
764 }
765 if delta > policy.timeout_delta_ms || delta > allowed_percent_delta {
766 return Err(format!(
767 "node `{node_id}` timeout_ms exceeds phase 1 timeout delta limits"
768 ));
769 }
770 Ok(OptimizationValidatedMutation {
771 node_id: node_id.to_string(),
772 field: OptimizationMutableField::TimeoutMs,
773 summary: format!(
774 "mutate node `{node_id}` timeout_ms from {before} to {after} (delta={delta})"
775 ),
776 })
777}
778
779fn validate_retry_mutation(
780 node_id: &str,
781 field: OptimizationMutableField,
782 before: Option<i64>,
783 after: Option<i64>,
784 policy: &OptimizationMutationPolicy,
785) -> Result<OptimizationValidatedMutation, String> {
786 let before = before.ok_or_else(|| {
787 format!(
788 "node `{node_id}` {} is not mutable in phase 1 because it is absent or non-integer",
789 mutable_field_label(field)
790 )
791 })?;
792 let after = after.ok_or_else(|| {
793 format!(
794 "node `{node_id}` {} may not be removed in phase 1",
795 mutable_field_label(field)
796 )
797 })?;
798 if after < policy.retry_min as i64 || after > policy.retry_max as i64 {
799 return Err(format!(
800 "node `{node_id}` {} must stay within {}..={}",
801 mutable_field_label(field),
802 policy.retry_min,
803 policy.retry_max
804 ));
805 }
806 let delta = (after - before).abs();
807 if delta == 0 {
808 return Err(format!(
809 "node `{node_id}` {} must change",
810 mutable_field_label(field)
811 ));
812 }
813 if delta > policy.retry_delta.abs() as i64 {
814 return Err(format!(
815 "node `{node_id}` {} exceeds phase 1 retry delta limit",
816 mutable_field_label(field)
817 ));
818 }
819 Ok(OptimizationValidatedMutation {
820 node_id: node_id.to_string(),
821 field,
822 summary: format!(
823 "mutate node `{node_id}` {} from {before} to {after}",
824 mutable_field_label(field)
825 ),
826 })
827}
828
/// Validates that `candidate` differs from `baseline` by exactly one allowed
/// phase 1 mutation and returns a description of that mutation.
///
/// The comparison proceeds in this order, and the first failure is returned:
/// 1. Everything outside `flow` must be identical (compared as JSON).
/// 2. The node lists must have the same length and the same `node_id` at
///    each position.
/// 3. Per node, every field other than `objective`, `output_contract`,
///    `retry_policy` and `timeout_ms` must be identical; inside
///    `output_contract` only `summary_guidance` may differ; inside
///    `retry_policy` only `max_attempts` / `retries` may differ.
/// 4. Each differing mutable field must be allowed by the mutation policy and
///    pass its field-specific validator.
/// 5. Across all nodes there must be at least one change, no more distinct
///    changed nodes than `max_nodes_changed_per_candidate`, and no more
///    recorded changes than `max_field_families_changed_per_candidate`.
///
/// Each changed field is recorded as one entry, so changing both retry keys
/// on one node counts as two changes for the purposes of step 5.
///
/// # Errors
///
/// Returns a message describing the first rule the candidate violates.
pub fn validate_phase1_candidate_mutation(
    baseline: &AutomationV2Spec,
    candidate: &AutomationV2Spec,
    phase1: &OptimizationPhase1Config,
) -> Result<OptimizationValidatedMutation, String> {
    // Workflow-level fields (everything except `flow`) are frozen.
    if normalized_workflow_without_flow(baseline) != normalized_workflow_without_flow(candidate) {
        return Err(
            "phase 1 candidate may not mutate workflow fields outside flow.nodes".to_string(),
        );
    }
    if baseline.flow.nodes.len() != candidate.flow.nodes.len() {
        return Err("phase 1 candidate may not add or remove workflow nodes".to_string());
    }
    // Every validated per-field change found while walking the node pairs.
    let mut changes = Vec::new();
    for (baseline_node, candidate_node) in
        baseline.flow.nodes.iter().zip(candidate.flow.nodes.iter())
    {
        // Nodes are matched by position, so ids must line up pairwise.
        if baseline_node.node_id != candidate_node.node_id {
            return Err("phase 1 candidate may not reorder or replace workflow nodes".to_string());
        }
        // Structural checks: anything outside the mutable keys must match.
        if normalized_node_static_fields(baseline_node)
            != normalized_node_static_fields(candidate_node)
        {
            return Err(format!(
                "phase 1 candidate may not mutate node `{}` outside the allowed field families",
                baseline_node.node_id
            ));
        }
        if normalized_output_contract(&baseline_node.output_contract)
            != normalized_output_contract(&candidate_node.output_contract)
        {
            return Err(format!(
                "phase 1 candidate may not mutate node `{}` output_contract outside summary_guidance",
                baseline_node.node_id
            ));
        }
        if normalized_retry_policy(&baseline_node.retry_policy)
            != normalized_retry_policy(&candidate_node.retry_policy)
        {
            return Err(format!(
                "phase 1 candidate may not mutate node `{}` retry_policy outside max_attempts/retries",
                baseline_node.node_id
            ));
        }
        // Text field: objective.
        if baseline_node.objective != candidate_node.objective {
            if !phase1
                .mutation_policy
                .allowed_text_fields
                .contains(&OptimizationMutableField::Objective)
            {
                return Err(format!(
                    "node `{}` objective is not allowed by the phase 1 mutation policy",
                    baseline_node.node_id
                ));
            }
            changes.push(validate_text_mutation(
                &baseline_node.node_id,
                OptimizationMutableField::Objective,
                &baseline_node.objective,
                &candidate_node.objective,
                &phase1.mutation_policy,
            )?);
        }
        // Text field: output_contract.summary_guidance. A missing contract
        // and a contract without guidance both read as `None` here.
        let baseline_summary = baseline_node
            .output_contract
            .as_ref()
            .and_then(|contract| contract.summary_guidance.as_deref());
        let candidate_summary = candidate_node
            .output_contract
            .as_ref()
            .and_then(|contract| contract.summary_guidance.as_deref());
        if baseline_summary != candidate_summary {
            if !phase1
                .mutation_policy
                .allowed_text_fields
                .contains(&OptimizationMutableField::OutputContractSummaryGuidance)
            {
                return Err(format!(
                    "node `{}` output_contract.summary_guidance is not allowed by the phase 1 mutation policy",
                    baseline_node.node_id
                ));
            }
            // Guidance may only be edited, never introduced or dropped.
            let before = baseline_summary.ok_or_else(|| {
                format!(
                    "node `{}` output_contract.summary_guidance may not be created in phase 1",
                    baseline_node.node_id
                )
            })?;
            let after = candidate_summary.ok_or_else(|| {
                format!(
                    "node `{}` output_contract.summary_guidance may not be removed in phase 1",
                    baseline_node.node_id
                )
            })?;
            changes.push(validate_text_mutation(
                &baseline_node.node_id,
                OptimizationMutableField::OutputContractSummaryGuidance,
                before,
                after,
                &phase1.mutation_policy,
            )?);
        }
        // Knob field: timeout_ms.
        if baseline_node.timeout_ms != candidate_node.timeout_ms {
            if !phase1
                .mutation_policy
                .allowed_knob_fields
                .contains(&OptimizationMutableField::TimeoutMs)
            {
                return Err(format!(
                    "node `{}` timeout_ms is not allowed by the phase 1 mutation policy",
                    baseline_node.node_id
                ));
            }
            changes.push(validate_timeout_mutation(
                &baseline_node.node_id,
                baseline_node.timeout_ms,
                candidate_node.timeout_ms,
                &phase1.mutation_policy,
            )?);
        }
        // Knob fields: the two integer retry keys, each checked separately.
        for field in [
            OptimizationMutableField::RetryPolicyMaxAttempts,
            OptimizationMutableField::RetryPolicyRetries,
        ] {
            let before = retry_field_value(&baseline_node.retry_policy, field);
            let after = retry_field_value(&candidate_node.retry_policy, field);
            if before != after {
                if !phase1.mutation_policy.allowed_knob_fields.contains(&field) {
                    return Err(format!(
                        "node `{}` {} is not allowed by the phase 1 mutation policy",
                        baseline_node.node_id,
                        mutable_field_label(field)
                    ));
                }
                changes.push(validate_retry_mutation(
                    &baseline_node.node_id,
                    field,
                    before,
                    after,
                    &phase1.mutation_policy,
                )?);
            }
        }
    }
    // Aggregate limits over everything that changed.
    if changes.is_empty() {
        return Err("phase 1 candidate must change exactly one allowed field family".to_string());
    }
    let changed_nodes = changes
        .iter()
        .map(|change| change.node_id.as_str())
        .collect::<std::collections::BTreeSet<_>>();
    if changed_nodes.len() > phase1.mutation_policy.max_nodes_changed_per_candidate as usize {
        return Err("phase 1 candidate may only change one node per experiment".to_string());
    }
    if changes.len()
        > phase1
            .mutation_policy
            .max_field_families_changed_per_candidate as usize
    {
        return Err(
            "phase 1 candidate may only change one field family per experiment".to_string(),
        );
    }
    // Cannot panic: the empty case returned above.
    Ok(changes.into_iter().next().expect("non-empty change set"))
}
994
995fn metric_f64(metrics: &Value, key: &str) -> Option<f64> {
996 metrics.get(key).and_then(Value::as_f64)
997}
998
999pub fn parse_phase1_metrics(metrics: &Value) -> Result<OptimizationPhase1Metrics, String> {
1000 let artifact_validator_pass_rate = metric_f64(metrics, "artifact_validator_pass_rate")
1001 .or_else(|| metric_f64(metrics, "validator_pass_rate"))
1002 .ok_or_else(|| "phase 1 metrics require artifact_validator_pass_rate".to_string())?;
1003 let unmet_requirement_count = metric_f64(metrics, "unmet_requirement_count")
1004 .ok_or_else(|| "phase 1 metrics require unmet_requirement_count".to_string())?;
1005 let blocked_node_rate = metric_f64(metrics, "blocked_node_rate")
1006 .ok_or_else(|| "phase 1 metrics require blocked_node_rate".to_string())?;
1007 let budget_within_limits = metrics
1008 .get("budget_within_limits")
1009 .and_then(Value::as_bool)
1010 .ok_or_else(|| "phase 1 metrics require budget_within_limits".to_string())?;
1011 Ok(OptimizationPhase1Metrics {
1012 artifact_validator_pass_rate,
1013 unmet_requirement_count,
1014 blocked_node_rate,
1015 budget_within_limits,
1016 })
1017}
1018
1019pub fn derive_phase1_metrics_from_run(
1020 run: &AutomationV2RunRecord,
1021 baseline_snapshot: &AutomationV2Spec,
1022 phase1: &OptimizationPhase1Config,
1023) -> Result<OptimizationPhase1Metrics, String> {
1024 let total_nodes = baseline_snapshot.flow.nodes.len().max(1) as f64;
1025 let validator_outputs = run
1026 .checkpoint
1027 .node_outputs
1028 .values()
1029 .filter_map(Value::as_object)
1030 .filter_map(|row| row.get("validator_summary"))
1031 .filter_map(Value::as_object)
1032 .collect::<Vec<_>>();
1033 if validator_outputs.is_empty() {
1034 return Err("automation run does not contain validator-backed outputs".to_string());
1035 }
1036 let passed_count = validator_outputs
1037 .iter()
1038 .filter(|summary| {
1039 summary
1040 .get("outcome")
1041 .and_then(Value::as_str)
1042 .is_some_and(|value| value.eq_ignore_ascii_case("passed"))
1043 })
1044 .count() as f64;
1045 let unmet_requirement_count = validator_outputs
1046 .iter()
1047 .map(|summary| {
1048 summary
1049 .get("unmet_requirements")
1050 .and_then(Value::as_array)
1051 .map(|rows| rows.len() as f64)
1052 .unwrap_or(0.0)
1053 })
1054 .sum::<f64>();
1055 let runtime_ms = run
1056 .finished_at_ms
1057 .or(Some(run.updated_at_ms))
1058 .unwrap_or(run.updated_at_ms)
1059 .saturating_sub(run.started_at_ms.unwrap_or(run.created_at_ms));
1060 let within_tokens = phase1
1061 .budget
1062 .max_total_tokens
1063 .is_none_or(|limit| run.total_tokens <= limit);
1064 let within_cost = phase1
1065 .budget
1066 .max_total_cost_usd
1067 .is_none_or(|limit| run.estimated_cost_usd <= limit);
1068 let within_runtime =
1069 runtime_ms <= (phase1.budget.max_runtime_minutes as u64).saturating_mul(60_000);
1070 Ok(OptimizationPhase1Metrics {
1071 artifact_validator_pass_rate: passed_count / validator_outputs.len() as f64,
1072 unmet_requirement_count,
1073 blocked_node_rate: run.checkpoint.blocked_nodes.len() as f64 / total_nodes,
1074 budget_within_limits: within_tokens && within_cost && within_runtime,
1075 })
1076}
1077
1078pub fn derive_phase1_validator_case_outcomes_from_run(
1079 run: &AutomationV2RunRecord,
1080) -> std::collections::BTreeMap<String, String> {
1081 run.checkpoint
1082 .node_outputs
1083 .iter()
1084 .filter_map(|(node_id, output)| {
1085 let outcome = output
1086 .as_object()
1087 .and_then(|row| row.get("validator_summary"))
1088 .and_then(Value::as_object)
1089 .and_then(|summary| summary.get("outcome"))
1090 .and_then(Value::as_str)
1091 .map(str::trim)
1092 .filter(|value| !value.is_empty())?;
1093 Some((node_id.clone(), outcome.to_ascii_lowercase()))
1094 })
1095 .collect()
1096}
1097
1098pub fn evaluate_phase1_promotion(
1099 baseline: &OptimizationPhase1Metrics,
1100 candidate: &OptimizationPhase1Metrics,
1101) -> OptimizationPromotionDecision {
1102 if !candidate.budget_within_limits {
1103 return OptimizationPromotionDecision {
1104 decision: OptimizationPromotionDecisionKind::Discard,
1105 reason: "candidate exceeded phase 1 budget ceilings".to_string(),
1106 };
1107 }
1108 if candidate.blocked_node_rate > baseline.blocked_node_rate {
1109 return OptimizationPromotionDecision {
1110 decision: OptimizationPromotionDecisionKind::Discard,
1111 reason: "candidate increased blocked_node_rate".to_string(),
1112 };
1113 }
1114 if candidate.artifact_validator_pass_rate > baseline.artifact_validator_pass_rate {
1115 return OptimizationPromotionDecision {
1116 decision: OptimizationPromotionDecisionKind::Promote,
1117 reason: "candidate improved artifact_validator_pass_rate".to_string(),
1118 };
1119 }
1120 if (candidate.artifact_validator_pass_rate - baseline.artifact_validator_pass_rate).abs()
1121 <= f64::EPSILON
1122 && candidate.unmet_requirement_count < baseline.unmet_requirement_count
1123 {
1124 return OptimizationPromotionDecision {
1125 decision: OptimizationPromotionDecisionKind::Promote,
1126 reason: "candidate improved unmet_requirement_count on a primary-metric tie"
1127 .to_string(),
1128 };
1129 }
1130 OptimizationPromotionDecision {
1131 decision: OptimizationPromotionDecisionKind::Discard,
1132 reason: "candidate did not beat the current phase 1 baseline".to_string(),
1133 }
1134}
1135
1136pub fn establish_phase1_baseline(
1137 replays: &[OptimizationBaselineReplayRecord],
1138 phase1: &OptimizationPhase1Config,
1139) -> Result<OptimizationPhase1Metrics, String> {
1140 let required_runs = phase1.eval.campaign_start_baseline_runs.max(1) as usize;
1141 if replays.len() < required_runs {
1142 return Err(format!(
1143 "phase 1 baseline establishment requires at least {required_runs} replay runs"
1144 ));
1145 }
1146 let relevant = &replays[replays.len() - required_runs..];
1147 if relevant
1148 .iter()
1149 .any(|replay| !replay.phase1_metrics.budget_within_limits)
1150 {
1151 return Err("phase 1 baseline replay exceeded budget ceilings".to_string());
1152 }
1153 let validator_min = relevant
1154 .iter()
1155 .map(|replay| replay.phase1_metrics.artifact_validator_pass_rate)
1156 .fold(f64::INFINITY, f64::min);
1157 let validator_max = relevant
1158 .iter()
1159 .map(|replay| replay.phase1_metrics.artifact_validator_pass_rate)
1160 .fold(f64::NEG_INFINITY, f64::max);
1161 if validator_max - validator_min > 0.05 {
1162 return Err(
1163 "phase 1 baseline replay drift exceeded 5 percentage points for artifact_validator_pass_rate"
1164 .to_string(),
1165 );
1166 }
1167 let blocked_min = relevant
1168 .iter()
1169 .map(|replay| replay.phase1_metrics.blocked_node_rate)
1170 .fold(f64::INFINITY, f64::min);
1171 let blocked_max = relevant
1172 .iter()
1173 .map(|replay| replay.phase1_metrics.blocked_node_rate)
1174 .fold(f64::NEG_INFINITY, f64::max);
1175 if blocked_max - blocked_min > 0.05 {
1176 return Err(
1177 "phase 1 baseline replay drift exceeded 5 percentage points for blocked_node_rate"
1178 .to_string(),
1179 );
1180 }
1181 let mut case_outcomes =
1182 std::collections::BTreeMap::<String, std::collections::BTreeSet<String>>::new();
1183 for replay in relevant {
1184 for (case_id, outcome) in &replay.validator_case_outcomes {
1185 case_outcomes
1186 .entry(case_id.clone())
1187 .or_default()
1188 .insert(outcome.clone());
1189 }
1190 }
1191 let total_cases = case_outcomes.len();
1192 if total_cases > 0 {
1193 let flaky_cases = case_outcomes
1194 .values()
1195 .filter(|outcomes| outcomes.len() > 1)
1196 .count();
1197 if (flaky_cases as f64 / total_cases as f64) > 0.10 {
1198 return Err(format!(
1199 "phase 1 baseline replay flakiness exceeded 10% of validator cases ({flaky_cases}/{total_cases})"
1200 ));
1201 }
1202 }
1203 let count = relevant.len() as f64;
1204 Ok(OptimizationPhase1Metrics {
1205 artifact_validator_pass_rate: relevant
1206 .iter()
1207 .map(|replay| replay.phase1_metrics.artifact_validator_pass_rate)
1208 .sum::<f64>()
1209 / count,
1210 unmet_requirement_count: relevant
1211 .iter()
1212 .map(|replay| replay.phase1_metrics.unmet_requirement_count)
1213 .sum::<f64>()
1214 / count,
1215 blocked_node_rate: relevant
1216 .iter()
1217 .map(|replay| replay.phase1_metrics.blocked_node_rate)
1218 .sum::<f64>()
1219 / count,
1220 budget_within_limits: true,
1221 })
1222}
1223
1224pub fn phase1_baseline_replay_due(
1225 replays: &[OptimizationBaselineReplayRecord],
1226 pending_run_count: usize,
1227 phase1: &OptimizationPhase1Config,
1228 experiment_count: usize,
1229 now_ms: u64,
1230) -> bool {
1231 if pending_run_count > 0 {
1232 return false;
1233 }
1234 let required_runs = phase1.eval.campaign_start_baseline_runs.max(1) as usize;
1235 if replays.len() < required_runs {
1236 return true;
1237 }
1238 let Some(last_replay) = replays.last() else {
1239 return true;
1240 };
1241 let candidate_interval = phase1.eval.baseline_replay_every_candidates.max(1) as usize;
1242 let candidates_since_last =
1243 experiment_count.saturating_sub(last_replay.experiment_count_at_recording as usize);
1244 if candidates_since_last >= candidate_interval {
1245 return true;
1246 }
1247 let replay_interval_ms = (phase1.eval.baseline_replay_every_minutes.max(1) as u64) * 60_000;
1248 now_ms.saturating_sub(last_replay.recorded_at_ms) >= replay_interval_ms
1249}
1250
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;
    use std::collections::{BTreeMap, HashMap};

    /// Phase 1 configuration shared by the tests: two start-up baseline runs,
    /// a replay every 5 candidates or 30 minutes, and single-node /
    /// single-field-family mutations.
    fn sample_phase1() -> OptimizationPhase1Config {
        let eval = OptimizationEvalSpec {
            pack_ref: "eval-pack.jsonl".to_string(),
            primary_metric: OptimizationMetricKind::ArtifactValidatorPassRate,
            secondary_metric: OptimizationMetricKind::UnmetRequirementCount,
            hard_guardrails: vec![
                OptimizationGuardrailKind::BlockedNodeRate,
                OptimizationGuardrailKind::BudgetCeilings,
            ],
            campaign_start_baseline_runs: 2,
            baseline_replay_every_candidates: 5,
            baseline_replay_every_minutes: 30,
        };
        let mutation_policy = OptimizationMutationPolicy {
            max_nodes_changed_per_candidate: 1,
            max_field_families_changed_per_candidate: 1,
            allowed_text_fields: vec![
                OptimizationMutableField::Objective,
                OptimizationMutableField::OutputContractSummaryGuidance,
            ],
            allowed_knob_fields: vec![
                OptimizationMutableField::TimeoutMs,
                OptimizationMutableField::RetryPolicyMaxAttempts,
                OptimizationMutableField::RetryPolicyRetries,
            ],
            max_text_delta_chars: 300,
            max_text_delta_ratio: 0.25,
            timeout_delta_percent: 0.15,
            timeout_delta_ms: 30_000,
            timeout_min_ms: 30_000,
            timeout_max_ms: 600_000,
            retry_delta: 1,
            retry_min: 0,
            retry_max: 3,
            allow_text_and_knob_bundle: false,
        };
        let scope = OptimizationSafetyScope {
            candidate_snapshot_only: true,
            allow_live_source_mutation: false,
            allow_external_side_effects_in_eval: false,
            promotion_requires_operator_approval: true,
            forbidden_fields: vec!["agents".to_string()],
        };
        let budget = OptimizationBudgetPolicy {
            max_experiments: 10,
            max_runtime_minutes: 60,
            max_consecutive_failures: 3,
            max_total_tokens: Some(50_000),
            max_total_cost_usd: Some(10.0),
        };
        OptimizationPhase1Config {
            objective_markdown: "improve output quality".to_string(),
            eval,
            mutation_policy,
            scope,
            budget,
        }
    }

    /// Workflow fixture with one agent and one node (`node-1`).
    fn sample_workflow() -> AutomationV2Spec {
        let agent = crate::AutomationAgentProfile {
            agent_id: "agent-1".to_string(),
            template_id: None,
            display_name: "Worker".to_string(),
            avatar_url: None,
            model_policy: None,
            skills: Vec::new(),
            tool_policy: crate::AutomationAgentToolPolicy {
                allowlist: Vec::new(),
                denylist: Vec::new(),
            },
            mcp_policy: crate::AutomationAgentMcpPolicy {
                allowed_servers: Vec::new(),
                allowed_tools: None,
            },
            approval_policy: None,
        };
        let node = crate::AutomationFlowNode {
            node_id: "node-1".to_string(),
            agent_id: "agent-1".to_string(),
            objective: "Write a concise report for the user".to_string(),
            depends_on: Vec::new(),
            input_refs: Vec::new(),
            output_contract: Some(crate::AutomationFlowOutputContract {
                kind: "report".to_string(),
                validator: Some(crate::AutomationOutputValidatorKind::ResearchBrief),
                enforcement: None,
                schema: None,
                summary_guidance: Some("Summarize clearly.".to_string()),
            }),
            retry_policy: Some(json!({ "max_attempts": 1, "retries": 0 })),
            timeout_ms: Some(60_000),
            stage_kind: None,
            gate: None,
            metadata: None,
        };
        AutomationV2Spec {
            automation_id: "wf-opt".to_string(),
            name: "Optimization Target".to_string(),
            description: Some("workflow".to_string()),
            status: crate::AutomationV2Status::Draft,
            schedule: crate::AutomationV2Schedule {
                schedule_type: crate::AutomationV2ScheduleType::Manual,
                cron_expression: None,
                interval_seconds: None,
                timezone: "UTC".to_string(),
                misfire_policy: crate::RoutineMisfirePolicy::Skip,
            },
            agents: vec![agent],
            flow: crate::AutomationFlowSpec { nodes: vec![node] },
            execution: crate::AutomationExecutionPolicy {
                max_parallel_agents: None,
                max_total_runtime_ms: None,
                max_total_tool_calls: None,
                max_total_tokens: None,
                max_total_cost_usd: None,
            },
            output_targets: Vec::new(),
            created_at_ms: 1,
            updated_at_ms: 1,
            creator_id: "test".to_string(),
            workspace_root: Some("/tmp/workflow".to_string()),
            metadata: None,
            next_fire_at_ms: None,
            last_fired_at_ms: None,
        }
    }

    /// Metrics fixture that is always within budget.
    fn within_budget_metrics(
        pass_rate: f64,
        unmet_requirements: f64,
        blocked_rate: f64,
    ) -> OptimizationPhase1Metrics {
        OptimizationPhase1Metrics {
            artifact_validator_pass_rate: pass_rate,
            unmet_requirement_count: unmet_requirements,
            blocked_node_rate: blocked_rate,
            budget_within_limits: true,
        }
    }

    /// Baseline replay fixture recorded at experiment count 0 with no run id.
    fn replay_record(
        replay_id: &str,
        phase1_metrics: OptimizationPhase1Metrics,
        case_outcomes: &[(&str, &str)],
        recorded_at_ms: u64,
    ) -> OptimizationBaselineReplayRecord {
        let validator_case_outcomes: BTreeMap<String, String> = case_outcomes
            .iter()
            .map(|(case_id, outcome)| ((*case_id).to_string(), (*outcome).to_string()))
            .collect();
        OptimizationBaselineReplayRecord {
            replay_id: replay_id.to_string(),
            automation_run_id: None,
            phase1_metrics,
            validator_case_outcomes,
            experiment_count_at_recording: 0,
            recorded_at_ms,
        }
    }

    #[test]
    fn validate_phase1_candidate_accepts_single_objective_change() {
        let phase1 = sample_phase1();
        let baseline = sample_workflow();
        let mut candidate = baseline.clone();
        candidate.flow.nodes[0].objective = "Write a concise report for the team".to_string();

        let mutation =
            validate_phase1_candidate_mutation(&baseline, &candidate, &phase1).expect("valid");

        assert_eq!(mutation.node_id, "node-1");
        assert_eq!(mutation.field, OptimizationMutableField::Objective);
    }

    #[test]
    fn validate_phase1_candidate_rejects_mutation_bundle() {
        let phase1 = sample_phase1();
        let baseline = sample_workflow();
        let mut candidate = baseline.clone();
        // Two field families change at once: objective text and the timeout knob.
        candidate.flow.nodes[0].objective = "Write a concise report for the team".to_string();
        candidate.flow.nodes[0].timeout_ms = Some(65_000);

        let error =
            validate_phase1_candidate_mutation(&baseline, &candidate, &phase1).expect_err("bundle");

        assert!(error.contains("one field family"));
    }

    #[test]
    fn validate_phase1_candidate_rejects_oversize_text_delta() {
        let phase1 = sample_phase1();
        let baseline = sample_workflow();
        let mut candidate = baseline.clone();
        candidate.flow.nodes[0].objective = "x".repeat(400);

        let error = validate_phase1_candidate_mutation(&baseline, &candidate, &phase1)
            .expect_err("oversize");

        assert!(error.contains("max_text_delta_chars") || error.contains("max_text_delta_ratio"));
    }

    #[test]
    fn evaluate_phase1_promotion_prefers_primary_metric() {
        let baseline = within_budget_metrics(0.7, 2.0, 0.1);
        let candidate = within_budget_metrics(0.8, 3.0, 0.1);

        let decision = evaluate_phase1_promotion(&baseline, &candidate);

        assert_eq!(
            decision.decision,
            OptimizationPromotionDecisionKind::Promote
        );
    }

    #[test]
    fn evaluate_phase1_promotion_uses_secondary_metric_on_tie() {
        let baseline = within_budget_metrics(0.8, 2.0, 0.1);
        let candidate = within_budget_metrics(0.8, 1.0, 0.1);

        let decision = evaluate_phase1_promotion(&baseline, &candidate);

        assert_eq!(
            decision.decision,
            OptimizationPromotionDecisionKind::Promote
        );
    }

    #[test]
    fn evaluate_phase1_promotion_rejects_guardrail_regression() {
        let baseline = within_budget_metrics(0.8, 2.0, 0.1);
        let candidate = within_budget_metrics(0.9, 1.0, 0.2);

        let decision = evaluate_phase1_promotion(&baseline, &candidate);

        assert_eq!(
            decision.decision,
            OptimizationPromotionDecisionKind::Discard
        );
    }

    #[test]
    fn establish_phase1_baseline_averages_stable_replays() {
        let phase1 = sample_phase1();
        let passing = [("node-1", "passed")];
        let replays = vec![
            replay_record(
                "replay-1",
                within_budget_metrics(0.8, 1.0, 0.0),
                &passing,
                1,
            ),
            replay_record(
                "replay-2",
                within_budget_metrics(0.84, 2.0, 0.02),
                &passing,
                2,
            ),
        ];

        let baseline = establish_phase1_baseline(&replays, &phase1).expect("stable");

        assert!((baseline.artifact_validator_pass_rate - 0.82).abs() < 1e-9);
        assert!((baseline.unmet_requirement_count - 1.5).abs() < 1e-9);
        assert!((baseline.blocked_node_rate - 0.01).abs() < 1e-9);
    }

    #[test]
    fn establish_phase1_baseline_rejects_validator_drift() {
        let phase1 = sample_phase1();
        let passing = [("node-1", "passed")];
        let replays = vec![
            replay_record(
                "replay-1",
                within_budget_metrics(0.8, 1.0, 0.0),
                &passing,
                1,
            ),
            replay_record(
                "replay-2",
                within_budget_metrics(0.9, 1.0, 0.0),
                &passing,
                2,
            ),
        ];

        let error = establish_phase1_baseline(&replays, &phase1).expect_err("drift");

        assert!(error.contains("artifact_validator_pass_rate"));
    }

    #[test]
    fn establish_phase1_baseline_rejects_flaky_validator_cases() {
        let phase1 = sample_phase1();
        let first_outcomes = [
            ("node-1", "passed"),
            ("node-2", "passed"),
            ("node-3", "passed"),
            ("node-4", "passed"),
            ("node-5", "passed"),
            ("node-6", "passed"),
            ("node-7", "passed"),
            ("node-8", "passed"),
        ];
        // Two of the eight cases flip between the replays.
        let second_outcomes = [
            ("node-1", "blocked"),
            ("node-2", "blocked"),
            ("node-3", "passed"),
            ("node-4", "passed"),
            ("node-5", "passed"),
            ("node-6", "passed"),
            ("node-7", "passed"),
            ("node-8", "passed"),
        ];
        let replays = vec![
            replay_record(
                "replay-1",
                within_budget_metrics(1.0, 0.0, 0.0),
                &first_outcomes,
                1,
            ),
            replay_record(
                "replay-2",
                within_budget_metrics(1.0, 0.0, 0.0),
                &second_outcomes,
                2,
            ),
        ];

        let error = establish_phase1_baseline(&replays, &phase1).expect_err("flaky");

        assert!(error.contains("flakiness"));
    }

    #[test]
    fn derive_phase1_metrics_from_run_uses_validator_outputs_and_budget() {
        let phase1 = sample_phase1();
        let workflow = sample_workflow();
        let node_outputs = HashMap::from([
            (
                "node-1".to_string(),
                json!({
                    "validator_summary": {
                        "outcome": "passed",
                        "unmet_requirements": []
                    }
                }),
            ),
            (
                "node-2".to_string(),
                json!({
                    "validator_summary": {
                        "outcome": "blocked",
                        "unmet_requirements": ["citation_missing", "web_sources_reviewed_missing"]
                    }
                }),
            ),
        ]);
        let run = AutomationV2RunRecord {
            run_id: "run-1".to_string(),
            automation_id: workflow.automation_id.clone(),
            trigger_type: "manual".to_string(),
            status: crate::AutomationRunStatus::Completed,
            created_at_ms: 1,
            updated_at_ms: 10_000,
            started_at_ms: Some(1_000),
            finished_at_ms: Some(9_000),
            active_session_ids: Vec::new(),
            latest_session_id: None,
            active_instance_ids: Vec::new(),
            checkpoint: crate::AutomationRunCheckpoint {
                completed_nodes: vec!["node-1".to_string()],
                pending_nodes: Vec::new(),
                node_outputs,
                node_attempts: HashMap::new(),
                blocked_nodes: vec!["node-2".to_string()],
                awaiting_gate: None,
                gate_history: Vec::new(),
                lifecycle_history: Vec::new(),
                last_failure: None,
            },
            automation_snapshot: Some(workflow.clone()),
            pause_reason: None,
            resume_reason: None,
            detail: None,
            stop_kind: None,
            stop_reason: None,
            prompt_tokens: 0,
            completion_tokens: 0,
            total_tokens: 100,
            estimated_cost_usd: 0.5,
        };

        let metrics = derive_phase1_metrics_from_run(&run, &workflow, &phase1).expect("metrics");

        assert!((metrics.artifact_validator_pass_rate - 0.5).abs() < 1e-9);
        assert!((metrics.unmet_requirement_count - 2.0).abs() < 1e-9);
        // One blocked node over the single node of the workflow fixture.
        assert!((metrics.blocked_node_rate - 1.0).abs() < 1e-9);
        assert!(metrics.budget_within_limits);

        let case_outcomes = derive_phase1_validator_case_outcomes_from_run(&run);
        assert_eq!(
            case_outcomes.get("node-1").map(String::as_str),
            Some("passed")
        );
        assert_eq!(
            case_outcomes.get("node-2").map(String::as_str),
            Some("blocked")
        );
    }

    #[test]
    fn phase1_baseline_replay_due_requires_initial_replays() {
        let phase1 = sample_phase1();

        assert!(phase1_baseline_replay_due(&[], 0, &phase1, 0, 0));
        // A pending run suppresses scheduling even with no replays recorded.
        assert!(!phase1_baseline_replay_due(&[], 1, &phase1, 0, 0));
    }

    #[test]
    fn phase1_baseline_replay_due_uses_candidate_and_time_intervals() {
        let phase1 = sample_phase1();
        let passing = [("node-1", "passed")];
        let replays = vec![
            OptimizationBaselineReplayRecord {
                experiment_count_at_recording: 2,
                ..replay_record(
                    "replay-1",
                    within_budget_metrics(1.0, 0.0, 0.0),
                    &passing,
                    1_000,
                )
            },
            OptimizationBaselineReplayRecord {
                experiment_count_at_recording: 2,
                ..replay_record(
                    "replay-2",
                    within_budget_metrics(1.0, 0.0, 0.0),
                    &passing,
                    1_500,
                )
            },
        ];

        // Five candidates since the last replay reaches the candidate interval.
        assert!(phase1_baseline_replay_due(&replays, 0, &phase1, 7, 2_000));
        // Exactly 30 minutes after the last replay reaches the time interval.
        assert!(phase1_baseline_replay_due(
            &replays, 0, &phase1, 3, 1_801_500
        ));
        // Neither interval reached.
        assert!(!phase1_baseline_replay_due(&replays, 0, &phase1, 3, 2_000));
    }
}