1use serde::{Deserialize, Serialize};
2use serde_json::Value;
3use sha2::{Digest, Sha256};
4use std::path::{Path, PathBuf};
5
6use crate::{AutomationV2RunRecord, AutomationV2Spec};
7
/// Kind of target an optimization campaign operates on.
///
/// Variants are (de)serialized using their `snake_case` names. Only a single
/// target kind is defined in this file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationTargetKind {
    /// Prompt/objective optimization of a V2 workflow (meaning inferred from
    /// the name — confirm against callers).
    WorkflowV2PromptObjectiveOptimization,
}
13
/// Status values stored on an [`OptimizationCampaignRecord`].
///
/// Variants are (de)serialized using their `snake_case` names. The state
/// transitions are not defined in this part of the file; the per-variant
/// meanings below are read off the names and should be confirmed against the
/// code that drives campaigns.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationCampaignStatus {
    Draft,
    Running,
    /// Presumably waiting on an operator decision for a pending promotion.
    AwaitingPromotionApproval,
    /// Presumably paused by an operator.
    PausedManual,
    /// Presumably paused because a budget limit was reached.
    PausedBudget,
    /// Presumably paused because baseline replays were inconsistent.
    PausedEvaluatorUnstable,
    Completed,
    Failed,
}
26
/// Status values stored on an [`OptimizationExperimentRecord`].
///
/// Variants are (de)serialized using their `snake_case` names. The transitions
/// between them are not defined in this part of the file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationExperimentStatus {
    Draft,
    Completed,
    PromotionRecommended,
    PromotionApproved,
    PromotionRejected,
    Discarded,
    Failed,
}
38
/// References to the artifacts that configure an optimization campaign.
///
/// Each value is a free-form string reference. NOTE(review): these look like
/// the inputs to `freeze_optimization_artifact`, which resolves a ref against
/// the workspace root — confirm at the call sites.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OptimizationArtifactRefs {
    /// Reference to the objective artifact.
    pub objective_ref: String,
    /// Reference to the eval spec artifact.
    pub eval_ref: String,
    /// Reference to the mutation policy artifact.
    pub mutation_policy_ref: String,
    /// Reference to the safety scope artifact.
    pub scope_ref: String,
    /// Reference to the budget artifact.
    pub budget_ref: String,
    /// Optional research log reference; omitted from output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub research_log_ref: Option<String>,
    /// Optional summary reference; omitted from output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub summary_ref: Option<String>,
}
51
/// Snapshot of one artifact file as captured by `freeze_optimization_artifact`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationFrozenArtifact {
    /// The artifact reference as supplied, with surrounding whitespace trimmed.
    pub artifact_ref: String,
    /// Path the reference resolved to (canonicalized when that succeeds).
    /// Later reads of the artifact text go through this path.
    pub resolved_path: String,
    /// Lowercase hex SHA-256 digest of the file's bytes.
    pub sha256: String,
    /// Size of the file in bytes.
    pub size_bytes: u64,
}
59
/// The five frozen artifacts consumed by `load_optimization_phase1_config`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationFrozenArtifacts {
    /// Objective text; read as UTF-8 and must not be blank.
    pub objective: OptimizationFrozenArtifact,
    /// YAML parsed into [`OptimizationEvalSpec`].
    pub eval: OptimizationFrozenArtifact,
    /// YAML parsed into [`OptimizationMutationPolicy`].
    pub mutation_policy: OptimizationFrozenArtifact,
    /// YAML parsed into [`OptimizationSafetyScope`].
    pub scope: OptimizationFrozenArtifact,
    /// YAML parsed into [`OptimizationBudgetPolicy`].
    pub budget: OptimizationFrozenArtifact,
}
68
/// Provider/model pair that `apply_optimization_execution_override` writes into
/// every agent's model policy (as `default_model` and over each existing
/// `role_models` entry).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct OptimizationExecutionOverride {
    /// Value written under the `provider_id` key of the pinned model.
    pub provider_id: String,
    /// Value written under the `model_id` key of the pinned model.
    pub model_id: String,
}
74
/// Metrics an eval spec can name as primary or secondary.
///
/// Variants are (de)serialized using their `snake_case` names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationMetricKind {
    /// Required as `primary_metric` by `validate_phase1_eval_spec`.
    ArtifactValidatorPassRate,
    /// Required as `secondary_metric` by `validate_phase1_eval_spec`.
    UnmetRequirementCount,
}
81
/// Hard guardrails an eval spec can list.
///
/// Variants are (de)serialized using their `snake_case` names. Phase 1
/// validation requires exactly these two to be present.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationGuardrailKind {
    BlockedNodeRate,
    BudgetCeilings,
}
88
/// Workflow node fields a candidate is allowed to mutate.
///
/// Variants are (de)serialized using their `snake_case` names. The phase 1
/// mutation policy validation treats the first two as text fields and the
/// remaining three as knob fields.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationMutableField {
    /// The node's `objective` text.
    Objective,
    /// The node's `output_contract.summary_guidance` text.
    OutputContractSummaryGuidance,
    /// The node's `timeout_ms` value.
    TimeoutMs,
    /// The `max_attempts` key of the node's `retry_policy` object.
    RetryPolicyMaxAttempts,
    /// The `retries` key of the node's `retry_policy` object.
    RetryPolicyRetries,
}
98
/// Evaluation settings parsed from the eval artifact (YAML).
///
/// The "phase 1" constraints noted on each field are the ones enforced by
/// `validate_phase1_eval_spec`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationEvalSpec {
    /// Reference to the eval pack; phase 1 requires it to be non-blank.
    pub pack_ref: String,
    /// Phase 1 requires `artifact_validator_pass_rate`.
    pub primary_metric: OptimizationMetricKind,
    /// Phase 1 requires `unmet_requirement_count`.
    pub secondary_metric: OptimizationMetricKind,
    /// Defaults to empty when missing; phase 1 requires exactly
    /// `blocked_node_rate` and `budget_ceilings`.
    #[serde(default)]
    pub hard_guardrails: Vec<OptimizationGuardrailKind>,
    /// Phase 1 requires the value 2.
    pub campaign_start_baseline_runs: u32,
    /// Phase 1 requires the value 5.
    pub baseline_replay_every_candidates: u32,
    /// Phase 1 requires the value 30.
    pub baseline_replay_every_minutes: u32,
}
110
/// Limits on what a candidate may change, parsed from the mutation policy
/// artifact (YAML).
///
/// The "phase 1" constraints noted on each field are the ones enforced by
/// `validate_phase1_mutation_policy`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationMutationPolicy {
    /// Maximum distinct nodes a candidate may change; phase 1 requires 1.
    pub max_nodes_changed_per_candidate: u32,
    /// Maximum validated changes per candidate; phase 1 requires 1.
    pub max_field_families_changed_per_candidate: u32,
    /// Text fields that may be mutated. Defaults to empty when missing; phase 1
    /// allows only `objective` and `output_contract_summary_guidance` here.
    #[serde(default)]
    pub allowed_text_fields: Vec<OptimizationMutableField>,
    /// Knob fields that may be mutated. Defaults to empty when missing; phase 1
    /// allows only `timeout_ms`, `retry_policy_max_attempts` and
    /// `retry_policy_retries` here.
    #[serde(default)]
    pub allowed_knob_fields: Vec<OptimizationMutableField>,
    /// Upper bound on the character delta of a text mutation; phase 1: 1..=300.
    pub max_text_delta_chars: u32,
    /// Upper bound on character delta divided by the baseline text length;
    /// phase 1: greater than 0 and at most 0.25.
    pub max_text_delta_ratio: f64,
    /// Fraction of the baseline timeout a mutation may move it by (the allowed
    /// delta is rounded up); phase 1: greater than 0 and at most 0.15.
    pub timeout_delta_percent: f64,
    /// Absolute cap on a timeout change in milliseconds; phase 1: 1..=30000.
    pub timeout_delta_ms: u64,
    /// Lowest timeout a candidate may set; phase 1: at least 30000 and below
    /// `timeout_max_ms`.
    pub timeout_min_ms: u64,
    /// Highest timeout a candidate may set; phase 1: at most 600000.
    pub timeout_max_ms: u64,
    /// Largest allowed change to a retry field; phase 1 requires 1.
    pub retry_delta: i32,
    /// Lowest retry value a candidate may set; phase 1: at least 0.
    pub retry_min: i32,
    /// Highest retry value a candidate may set; phase 1: at most 3.
    pub retry_max: i32,
    /// Defaults to `false` when missing; phase 1 rejects `true`.
    #[serde(default)]
    pub allow_text_and_knob_bundle: bool,
}
131
/// Safety switches parsed from the scope artifact (YAML).
///
/// The required values noted on each field are the ones enforced by
/// `validate_phase1_scope`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationSafetyScope {
    /// Phase 1 requires `true`.
    pub candidate_snapshot_only: bool,
    /// Phase 1 requires `false`.
    pub allow_live_source_mutation: bool,
    /// Phase 1 requires `false`.
    pub allow_external_side_effects_in_eval: bool,
    /// Phase 1 requires `true`.
    pub promotion_requires_operator_approval: bool,
    /// Defaults to empty when missing. Not inspected by the phase 1 scope
    /// validation in this file.
    #[serde(default)]
    pub forbidden_fields: Vec<String>,
}
141
/// Budget limits parsed from the budget artifact (YAML).
///
/// The ranges noted on each field are the ones enforced by
/// `validate_phase1_budget`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationBudgetPolicy {
    /// Phase 1: 1..=100.
    pub max_experiments: u32,
    /// Phase 1: 1..=1440.
    pub max_runtime_minutes: u32,
    /// Phase 1: 1..=10.
    pub max_consecutive_failures: u32,
    /// Optional token cap; when present it must be non-zero. Omitted from
    /// output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_total_tokens: Option<u64>,
    /// Optional cost cap; when present it must be finite and positive. Omitted
    /// from output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_total_cost_usd: Option<f64>,
}
152
/// Fully loaded and validated phase 1 configuration, as produced by
/// `load_optimization_phase1_config`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationPhase1Config {
    /// Objective text with surrounding whitespace trimmed; never blank when
    /// produced by the loader.
    pub objective_markdown: String,
    /// Parsed eval spec.
    pub eval: OptimizationEvalSpec,
    /// Parsed mutation policy.
    pub mutation_policy: OptimizationMutationPolicy,
    /// Parsed safety scope.
    pub scope: OptimizationSafetyScope,
    /// Parsed budget policy.
    pub budget: OptimizationBudgetPolicy,
}
161
/// Typed phase 1 metrics for a baseline replay or an experiment.
///
/// NOTE(review): units and valid ranges of the numeric fields are not
/// established in this part of the file — confirm against the metric producer.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct OptimizationPhase1Metrics {
    /// Read by `parse_phase1_metrics` from `artifact_validator_pass_rate`,
    /// falling back to `validator_pass_rate`.
    pub artifact_validator_pass_rate: f64,
    pub unmet_requirement_count: f64,
    pub blocked_node_rate: f64,
    pub budget_within_limits: bool,
}
169
/// One recorded baseline replay, stored in
/// `OptimizationCampaignRecord::baseline_replays`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct OptimizationBaselineReplayRecord {
    /// Identifier of this replay.
    pub replay_id: String,
    /// Optional id of the associated automation run; omitted from output when
    /// `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub automation_run_id: Option<String>,
    /// Metrics measured for this replay.
    pub phase1_metrics: OptimizationPhase1Metrics,
    /// String-to-string map, defaults to empty when missing. Presumably
    /// validator case id to outcome — confirm against the producer.
    #[serde(default)]
    pub validator_case_outcomes: std::collections::BTreeMap<String, String>,
    /// Defaults to 0 when missing.
    #[serde(default)]
    pub experiment_count_at_recording: u64,
    /// Recording timestamp; the `_ms` suffix suggests milliseconds (epoch
    /// presumably Unix — confirm).
    pub recorded_at_ms: u64,
}
182
/// Possible outcomes of a promotion decision.
///
/// Variants are (de)serialized using their `snake_case` names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationPromotionDecisionKind {
    Promote,
    Discard,
    NeedsOperatorReview,
}
190
/// A promotion decision together with its free-form explanation.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct OptimizationPromotionDecision {
    /// The outcome.
    pub decision: OptimizationPromotionDecisionKind,
    /// Explanation text for the outcome.
    pub reason: String,
}
196
/// Description of the single change that a candidate snapshot makes, as
/// returned by `validate_phase1_candidate_mutation`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct OptimizationValidatedMutation {
    /// Id of the node that was changed.
    pub node_id: String,
    /// Which field of the node was changed.
    pub field: OptimizationMutableField,
    /// Generated one-line description of the change (starts with "mutate node").
    pub summary: String,
}
203
/// Persisted record of an optimization campaign.
///
/// Fields marked `skip_serializing_if` are omitted from output when `None`;
/// fields marked `default` may be missing on input. How the record is mutated
/// over a campaign's life is not defined in this part of the file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationCampaignRecord {
    /// Identifier of this campaign.
    pub optimization_id: String,
    pub name: String,
    pub target_kind: OptimizationTargetKind,
    pub status: OptimizationCampaignStatus,
    pub source_workflow_id: String,
    pub source_workflow_name: String,
    /// Snapshot of the source workflow.
    pub source_workflow_snapshot: AutomationV2Spec,
    /// Hash of `source_workflow_snapshot` (presumably from
    /// `optimization_snapshot_hash` — confirm at the construction site).
    pub source_workflow_snapshot_hash: String,
    /// Snapshot candidates are compared against.
    pub baseline_snapshot: AutomationV2Spec,
    /// Hash of `baseline_snapshot` (presumably from
    /// `optimization_snapshot_hash` — confirm at the construction site).
    pub baseline_snapshot_hash: String,
    /// Optional provider/model override for execution.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub execution_override: Option<OptimizationExecutionOverride>,
    /// Artifact references as supplied.
    pub artifacts: OptimizationArtifactRefs,
    /// Frozen (resolved and hashed) versions of the required artifacts.
    pub frozen_artifacts: OptimizationFrozenArtifacts,
    /// Loaded phase 1 configuration, when available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub phase1: Option<OptimizationPhase1Config>,
    /// Baseline metrics, when available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub baseline_metrics: Option<OptimizationPhase1Metrics>,
    /// Recorded baseline replays; defaults to empty when missing.
    #[serde(default)]
    pub baseline_replays: Vec<OptimizationBaselineReplayRecord>,
    /// Ids of baseline runs not yet recorded (inferred from the name);
    /// defaults to empty when missing.
    #[serde(default)]
    pub pending_baseline_run_ids: Vec<String>,
    /// Id of the experiment awaiting a promotion decision, if any (inferred
    /// from the name).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pending_promotion_experiment_id: Option<String>,
    /// Most recent pause reason, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_pause_reason: Option<String>,
    /// Creation timestamp; the `_ms` suffix suggests milliseconds.
    pub created_at_ms: u64,
    /// Last-update timestamp; the `_ms` suffix suggests milliseconds.
    pub updated_at_ms: u64,
    /// Optional free-form JSON metadata.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Value>,
}
237
/// Persisted record of one experiment (candidate) within a campaign.
///
/// Fields marked `skip_serializing_if` are omitted from output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationExperimentRecord {
    /// Identifier of this experiment.
    pub experiment_id: String,
    /// Identifier of the owning campaign.
    pub optimization_id: String,
    pub status: OptimizationExperimentStatus,
    /// The candidate workflow snapshot under evaluation.
    pub candidate_snapshot: AutomationV2Spec,
    /// Hash of `candidate_snapshot` (presumably from
    /// `optimization_snapshot_hash` — confirm at the construction site).
    pub candidate_snapshot_hash: String,
    /// Hash of the baseline this candidate was derived from (inferred from the
    /// name — confirm).
    pub baseline_snapshot_hash: String,
    /// Optional description of the mutation.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub mutation_summary: Option<String>,
    /// Optional raw metrics as free-form JSON.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub metrics: Option<Value>,
    /// Optional typed phase 1 metrics.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub phase1_metrics: Option<OptimizationPhase1Metrics>,
    /// Optional free-form promotion recommendation text.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub promotion_recommendation: Option<String>,
    /// Optional structured promotion decision.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub promotion_decision: Option<OptimizationPromotionDecision>,
    /// Creation timestamp; the `_ms` suffix suggests milliseconds.
    pub created_at_ms: u64,
    /// Last-update timestamp; the `_ms` suffix suggests milliseconds.
    pub updated_at_ms: u64,
    /// Optional free-form JSON metadata.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Value>,
}
261
262pub fn optimization_snapshot_hash(snapshot: &AutomationV2Spec) -> String {
263 let canonical = serde_json::to_vec(snapshot).unwrap_or_default();
264 let mut hasher = Sha256::new();
265 hasher.update(canonical);
266 format!("{:x}", hasher.finalize())
267}
268
269pub fn apply_optimization_execution_override(
270 workflow: &AutomationV2Spec,
271 execution_override: &OptimizationExecutionOverride,
272) -> AutomationV2Spec {
273 let mut snapshot = workflow.clone();
274 for agent in &mut snapshot.agents {
275 let mut policy = agent
276 .model_policy
277 .clone()
278 .and_then(|value| value.as_object().cloned())
279 .unwrap_or_default();
280 let fixed_model = serde_json::json!({
281 "provider_id": execution_override.provider_id,
282 "model_id": execution_override.model_id,
283 });
284 policy.insert("default_model".to_string(), fixed_model.clone());
285 if let Some(role_models) = policy.get_mut("role_models").and_then(Value::as_object_mut) {
286 for role_model in role_models.values_mut() {
287 *role_model = fixed_model.clone();
288 }
289 }
290 agent.model_policy = Some(Value::Object(policy));
291 }
292 snapshot
293}
294
295pub fn freeze_optimization_artifact(
296 workspace_root: &str,
297 artifact_ref: &str,
298) -> Result<OptimizationFrozenArtifact, String> {
299 let trimmed = artifact_ref.trim();
300 if trimmed.is_empty() {
301 return Err("artifact ref is required".to_string());
302 }
303 let workspace = PathBuf::from(workspace_root);
304 let candidate = PathBuf::from(trimmed);
305 let resolved = if candidate.is_absolute() {
306 candidate
307 } else {
308 workspace.join(candidate)
309 };
310 if !tandem_core::is_within_workspace_root(&resolved, &workspace) {
311 return Err(format!(
312 "artifact `{trimmed}` must stay within workspace root `{workspace_root}`"
313 ));
314 }
315 let metadata = std::fs::metadata(&resolved)
316 .map_err(|_| format!("artifact `{trimmed}` does not exist or is unreadable"))?;
317 if !metadata.is_file() {
318 return Err(format!("artifact `{trimmed}` must be a file"));
319 }
320 let bytes =
321 std::fs::read(&resolved).map_err(|_| format!("artifact `{trimmed}` could not be read"))?;
322 let mut hasher = Sha256::new();
323 hasher.update(&bytes);
324 let resolved_path = resolved
325 .canonicalize()
326 .unwrap_or_else(|_| Path::new(&resolved).to_path_buf());
327 Ok(OptimizationFrozenArtifact {
328 artifact_ref: trimmed.to_string(),
329 resolved_path: resolved_path.to_string_lossy().to_string(),
330 sha256: format!("{:x}", hasher.finalize()),
331 size_bytes: metadata.len(),
332 })
333}
334
335fn read_optimization_artifact_text(
336 artifact: &OptimizationFrozenArtifact,
337 label: &str,
338) -> Result<String, String> {
339 std::fs::read_to_string(&artifact.resolved_path).map_err(|_| {
340 format!(
341 "{label} artifact `{}` could not be read as UTF-8 text",
342 artifact.artifact_ref
343 )
344 })
345}
346
347fn parse_yaml_artifact<T: for<'de> Deserialize<'de>>(
348 artifact: &OptimizationFrozenArtifact,
349 label: &str,
350) -> Result<T, String> {
351 let raw = read_optimization_artifact_text(artifact, label)?;
352 serde_yaml::from_str::<T>(&raw).map_err(|error| {
353 format!(
354 "failed to parse {label} artifact `{}`: {error}",
355 artifact.artifact_ref
356 )
357 })
358}
359
360fn validate_phase1_eval_spec(eval: &OptimizationEvalSpec) -> Result<(), String> {
361 if eval.pack_ref.trim().is_empty() {
362 return Err("eval pack_ref is required".to_string());
363 }
364 if eval.primary_metric != OptimizationMetricKind::ArtifactValidatorPassRate {
365 return Err("phase 1 eval.primary_metric must be artifact_validator_pass_rate".to_string());
366 }
367 if eval.secondary_metric != OptimizationMetricKind::UnmetRequirementCount {
368 return Err("phase 1 eval.secondary_metric must be unmet_requirement_count".to_string());
369 }
370 if eval.hard_guardrails.len() != 2
371 || !eval
372 .hard_guardrails
373 .contains(&OptimizationGuardrailKind::BlockedNodeRate)
374 || !eval
375 .hard_guardrails
376 .contains(&OptimizationGuardrailKind::BudgetCeilings)
377 {
378 return Err(
379 "phase 1 eval.hard_guardrails must be exactly blocked_node_rate and budget_ceilings"
380 .to_string(),
381 );
382 }
383 if eval.campaign_start_baseline_runs != 2 {
384 return Err("phase 1 campaign_start_baseline_runs must be 2".to_string());
385 }
386 if eval.baseline_replay_every_candidates != 5 {
387 return Err("phase 1 baseline_replay_every_candidates must be 5".to_string());
388 }
389 if eval.baseline_replay_every_minutes != 30 {
390 return Err("phase 1 baseline_replay_every_minutes must be 30".to_string());
391 }
392 Ok(())
393}
394
395fn validate_phase1_mutation_policy(policy: &OptimizationMutationPolicy) -> Result<(), String> {
396 if policy.max_nodes_changed_per_candidate != 1 {
397 return Err("phase 1 max_nodes_changed_per_candidate must be 1".to_string());
398 }
399 if policy.max_field_families_changed_per_candidate != 1 {
400 return Err("phase 1 max_field_families_changed_per_candidate must be 1".to_string());
401 }
402 if policy.allowed_text_fields.is_empty() && policy.allowed_knob_fields.is_empty() {
403 return Err("phase 1 mutation policy must allow at least one mutable field".to_string());
404 }
405 if policy.allowed_text_fields.iter().any(|field| {
406 !matches!(
407 field,
408 OptimizationMutableField::Objective
409 | OptimizationMutableField::OutputContractSummaryGuidance
410 )
411 }) {
412 return Err(
413 "phase 1 allowed_text_fields may only include objective or output_contract_summary_guidance"
414 .to_string(),
415 );
416 }
417 if policy.allowed_knob_fields.iter().any(|field| {
418 !matches!(
419 field,
420 OptimizationMutableField::TimeoutMs
421 | OptimizationMutableField::RetryPolicyMaxAttempts
422 | OptimizationMutableField::RetryPolicyRetries
423 )
424 }) {
425 return Err(
426 "phase 1 allowed_knob_fields may only include timeout_ms, retry_policy_max_attempts, or retry_policy_retries"
427 .to_string(),
428 );
429 }
430 if policy.max_text_delta_chars == 0 || policy.max_text_delta_chars > 300 {
431 return Err("phase 1 max_text_delta_chars must be between 1 and 300".to_string());
432 }
433 if !(0.0 < policy.max_text_delta_ratio && policy.max_text_delta_ratio <= 0.25) {
434 return Err("phase 1 max_text_delta_ratio must be > 0 and <= 0.25".to_string());
435 }
436 if !(0.0 < policy.timeout_delta_percent && policy.timeout_delta_percent <= 0.15) {
437 return Err("phase 1 timeout_delta_percent must be > 0 and <= 0.15".to_string());
438 }
439 if policy.timeout_delta_ms == 0 || policy.timeout_delta_ms > 30_000 {
440 return Err("phase 1 timeout_delta_ms must be between 1 and 30000".to_string());
441 }
442 if policy.timeout_min_ms < 30_000
443 || policy.timeout_max_ms > 600_000
444 || policy.timeout_min_ms >= policy.timeout_max_ms
445 {
446 return Err(
447 "phase 1 timeout bounds must stay within 30000..600000 ms and min < max".to_string(),
448 );
449 }
450 if policy.retry_delta <= 0 || policy.retry_delta > 1 {
451 return Err("phase 1 retry_delta must be 1".to_string());
452 }
453 if policy.retry_min < 0 || policy.retry_max > 3 || policy.retry_min > policy.retry_max {
454 return Err("phase 1 retry bounds must stay within 0..3".to_string());
455 }
456 if policy.allow_text_and_knob_bundle {
457 return Err("phase 1 may not bundle text and knob mutations".to_string());
458 }
459 Ok(())
460}
461
462fn validate_phase1_scope(scope: &OptimizationSafetyScope) -> Result<(), String> {
463 if !scope.candidate_snapshot_only {
464 return Err("phase 1 scope must require candidate_snapshot_only".to_string());
465 }
466 if scope.allow_live_source_mutation {
467 return Err("phase 1 scope must forbid live source mutation".to_string());
468 }
469 if scope.allow_external_side_effects_in_eval {
470 return Err("phase 1 scope must forbid external side effects in eval".to_string());
471 }
472 if !scope.promotion_requires_operator_approval {
473 return Err("phase 1 scope must require operator approval for promotion".to_string());
474 }
475 Ok(())
476}
477
478fn validate_phase1_budget(budget: &OptimizationBudgetPolicy) -> Result<(), String> {
479 if budget.max_experiments == 0 || budget.max_experiments > 100 {
480 return Err("phase 1 budget.max_experiments must be between 1 and 100".to_string());
481 }
482 if budget.max_runtime_minutes == 0 || budget.max_runtime_minutes > 1_440 {
483 return Err("phase 1 budget.max_runtime_minutes must be between 1 and 1440".to_string());
484 }
485 if budget.max_consecutive_failures == 0 || budget.max_consecutive_failures > 10 {
486 return Err("phase 1 budget.max_consecutive_failures must be between 1 and 10".to_string());
487 }
488 if let Some(max_cost_usd) = budget.max_total_cost_usd {
489 if !max_cost_usd.is_finite() || max_cost_usd <= 0.0 {
490 return Err("phase 1 budget.max_total_cost_usd must be positive".to_string());
491 }
492 }
493 if let Some(max_total_tokens) = budget.max_total_tokens {
494 if max_total_tokens == 0 {
495 return Err("phase 1 budget.max_total_tokens must be positive".to_string());
496 }
497 }
498 Ok(())
499}
500
501pub fn load_optimization_phase1_config(
502 frozen_artifacts: &OptimizationFrozenArtifacts,
503) -> Result<OptimizationPhase1Config, String> {
504 let objective_markdown =
505 read_optimization_artifact_text(&frozen_artifacts.objective, "objective")?;
506 if objective_markdown.trim().is_empty() {
507 return Err("objective artifact must not be empty".to_string());
508 }
509 let eval = parse_yaml_artifact::<OptimizationEvalSpec>(&frozen_artifacts.eval, "eval")?;
510 let mutation_policy = parse_yaml_artifact::<OptimizationMutationPolicy>(
511 &frozen_artifacts.mutation_policy,
512 "mutation policy",
513 )?;
514 let scope = parse_yaml_artifact::<OptimizationSafetyScope>(&frozen_artifacts.scope, "scope")?;
515 let budget =
516 parse_yaml_artifact::<OptimizationBudgetPolicy>(&frozen_artifacts.budget, "budget")?;
517 validate_phase1_eval_spec(&eval)?;
518 validate_phase1_mutation_policy(&mutation_policy)?;
519 validate_phase1_scope(&scope)?;
520 validate_phase1_budget(&budget)?;
521 Ok(OptimizationPhase1Config {
522 objective_markdown: objective_markdown.trim().to_string(),
523 eval,
524 mutation_policy,
525 scope,
526 budget,
527 })
528}
529
530fn workflow_has_simple_retry_field(
531 workflow: &AutomationV2Spec,
532 field: OptimizationMutableField,
533) -> bool {
534 workflow.flow.nodes.iter().any(|node| {
535 node.retry_policy
536 .as_ref()
537 .and_then(Value::as_object)
538 .and_then(|obj| match field {
539 OptimizationMutableField::RetryPolicyMaxAttempts => obj.get("max_attempts"),
540 OptimizationMutableField::RetryPolicyRetries => obj.get("retries"),
541 _ => None,
542 })
543 .and_then(Value::as_i64)
544 .is_some()
545 })
546}
547
548pub fn validate_phase1_workflow_target(
549 workflow: &AutomationV2Spec,
550 phase1: &OptimizationPhase1Config,
551) -> Result<(), String> {
552 if workflow.flow.nodes.is_empty() {
553 return Err("phase 1 workflow target must contain at least one node".to_string());
554 }
555 if !workflow.flow.nodes.iter().any(|node| {
556 node.output_contract
557 .as_ref()
558 .and_then(|contract| contract.validator)
559 .is_some()
560 }) {
561 return Err(
562 "phase 1 workflow target must contain at least one validator-backed output contract"
563 .to_string(),
564 );
565 }
566 let has_objective_field = phase1
567 .mutation_policy
568 .allowed_text_fields
569 .contains(&OptimizationMutableField::Objective);
570 let has_summary_guidance_field = phase1
571 .mutation_policy
572 .allowed_text_fields
573 .contains(&OptimizationMutableField::OutputContractSummaryGuidance)
574 && workflow.flow.nodes.iter().any(|node| {
575 node.output_contract
576 .as_ref()
577 .and_then(|contract| contract.summary_guidance.as_ref())
578 .is_some()
579 });
580 let has_timeout_field = phase1
581 .mutation_policy
582 .allowed_knob_fields
583 .contains(&OptimizationMutableField::TimeoutMs)
584 && workflow
585 .flow
586 .nodes
587 .iter()
588 .any(|node| node.timeout_ms.is_some());
589 let has_retry_field = phase1
590 .mutation_policy
591 .allowed_knob_fields
592 .iter()
593 .copied()
594 .filter(|field| {
595 matches!(
596 field,
597 OptimizationMutableField::RetryPolicyMaxAttempts
598 | OptimizationMutableField::RetryPolicyRetries
599 )
600 })
601 .any(|field| workflow_has_simple_retry_field(workflow, field));
602 if !(has_objective_field || has_summary_guidance_field || has_timeout_field || has_retry_field)
603 {
604 return Err(
605 "phase 1 workflow target does not expose any mutable fields allowed by the mutation policy"
606 .to_string(),
607 );
608 }
609 Ok(())
610}
611
612fn mutable_field_label(field: OptimizationMutableField) -> &'static str {
613 match field {
614 OptimizationMutableField::Objective => "objective",
615 OptimizationMutableField::OutputContractSummaryGuidance => {
616 "output_contract.summary_guidance"
617 }
618 OptimizationMutableField::TimeoutMs => "timeout_ms",
619 OptimizationMutableField::RetryPolicyMaxAttempts => "retry_policy.max_attempts",
620 OptimizationMutableField::RetryPolicyRetries => "retry_policy.retries",
621 }
622}
623
624fn json_value<T: Serialize>(value: &T) -> Value {
625 serde_json::to_value(value).unwrap_or(Value::Null)
626}
627
628fn normalized_workflow_without_flow(snapshot: &AutomationV2Spec) -> Value {
629 let mut value = json_value(snapshot);
630 if let Some(obj) = value.as_object_mut() {
631 obj.remove("flow");
632 }
633 value
634}
635
636fn normalized_node_static_fields(node: &crate::AutomationFlowNode) -> Value {
637 let mut value = json_value(node);
638 if let Some(obj) = value.as_object_mut() {
639 obj.remove("objective");
640 obj.remove("output_contract");
641 obj.remove("retry_policy");
642 obj.remove("timeout_ms");
643 }
644 value
645}
646
647fn normalized_output_contract(contract: &Option<crate::AutomationFlowOutputContract>) -> Value {
648 let mut value = json_value(contract);
649 if let Some(obj) = value.as_object_mut() {
650 obj.remove("summary_guidance");
651 }
652 value
653}
654
655fn normalized_retry_policy(policy: &Option<Value>) -> Value {
656 let mut value = policy.clone().unwrap_or(Value::Null);
657 if let Some(obj) = value.as_object_mut() {
658 obj.remove("max_attempts");
659 obj.remove("retries");
660 }
661 value
662}
663
664fn retry_field_value(policy: &Option<Value>, field: OptimizationMutableField) -> Option<i64> {
665 policy
666 .as_ref()
667 .and_then(Value::as_object)
668 .and_then(|obj| match field {
669 OptimizationMutableField::RetryPolicyMaxAttempts => obj.get("max_attempts"),
670 OptimizationMutableField::RetryPolicyRetries => obj.get("retries"),
671 _ => None,
672 })
673 .and_then(Value::as_i64)
674}
675
/// Measures how much `after` differs from `before`, in characters.
///
/// The common prefix and the common suffix (which may not overlap the prefix)
/// are stripped from both strings; the result is the number of characters left
/// in `before` plus the number left in `after`. Counting is per `char`, not
/// per byte.
fn text_delta_chars(before: &str, after: &str) -> usize {
    let old: Vec<char> = before.chars().collect();
    let new: Vec<char> = after.chars().collect();

    let shared_prefix = old
        .iter()
        .zip(new.iter())
        .take_while(|(left, right)| left == right)
        .count();

    // Look for the shared suffix only in what follows the prefix, so that the
    // two matched regions can never overlap.
    let old_rest = &old[shared_prefix..];
    let new_rest = &new[shared_prefix..];
    let shared_suffix = old_rest
        .iter()
        .rev()
        .zip(new_rest.iter().rev())
        .take_while(|(left, right)| left == right)
        .count();

    (old_rest.len() - shared_suffix) + (new_rest.len() - shared_suffix)
}
697
698fn validate_text_mutation(
699 node_id: &str,
700 field: OptimizationMutableField,
701 before: &str,
702 after: &str,
703 policy: &OptimizationMutationPolicy,
704) -> Result<OptimizationValidatedMutation, String> {
705 if after.trim().is_empty() {
706 return Err(format!(
707 "node `{node_id}` {} must not become empty",
708 mutable_field_label(field)
709 ));
710 }
711 let delta_chars = text_delta_chars(before, after);
712 let baseline_len = before.chars().count().max(1);
713 let delta_ratio = delta_chars as f64 / baseline_len as f64;
714 if delta_chars == 0 {
715 return Err(format!(
716 "node `{node_id}` {} must change",
717 mutable_field_label(field)
718 ));
719 }
720 if delta_chars > policy.max_text_delta_chars as usize {
721 return Err(format!(
722 "node `{node_id}` {} exceeds phase 1 max_text_delta_chars",
723 mutable_field_label(field)
724 ));
725 }
726 if delta_ratio > policy.max_text_delta_ratio {
727 return Err(format!(
728 "node `{node_id}` {} exceeds phase 1 max_text_delta_ratio",
729 mutable_field_label(field)
730 ));
731 }
732 Ok(OptimizationValidatedMutation {
733 node_id: node_id.to_string(),
734 field,
735 summary: format!(
736 "mutate node `{node_id}` {} delta_chars={} delta_ratio={delta_ratio:.3}",
737 mutable_field_label(field),
738 delta_chars
739 ),
740 })
741}
742
743fn validate_timeout_mutation(
744 node_id: &str,
745 before: Option<u64>,
746 after: Option<u64>,
747 policy: &OptimizationMutationPolicy,
748) -> Result<OptimizationValidatedMutation, String> {
749 let before = before.ok_or_else(|| {
750 format!("node `{node_id}` timeout_ms is not mutable in phase 1 because it is absent")
751 })?;
752 let after = after
753 .ok_or_else(|| format!("node `{node_id}` timeout_ms may not be removed in phase 1"))?;
754 if after < policy.timeout_min_ms || after > policy.timeout_max_ms {
755 return Err(format!(
756 "node `{node_id}` timeout_ms must stay within {}..={} ms",
757 policy.timeout_min_ms, policy.timeout_max_ms
758 ));
759 }
760 let delta = after.abs_diff(before);
761 let allowed_percent_delta = ((before as f64) * policy.timeout_delta_percent).ceil() as u64;
762 if delta == 0 {
763 return Err(format!("node `{node_id}` timeout_ms must change"));
764 }
765 if delta > policy.timeout_delta_ms || delta > allowed_percent_delta {
766 return Err(format!(
767 "node `{node_id}` timeout_ms exceeds phase 1 timeout delta limits"
768 ));
769 }
770 Ok(OptimizationValidatedMutation {
771 node_id: node_id.to_string(),
772 field: OptimizationMutableField::TimeoutMs,
773 summary: format!(
774 "mutate node `{node_id}` timeout_ms from {before} to {after} (delta={delta})"
775 ),
776 })
777}
778
779fn validate_retry_mutation(
780 node_id: &str,
781 field: OptimizationMutableField,
782 before: Option<i64>,
783 after: Option<i64>,
784 policy: &OptimizationMutationPolicy,
785) -> Result<OptimizationValidatedMutation, String> {
786 let before = before.ok_or_else(|| {
787 format!(
788 "node `{node_id}` {} is not mutable in phase 1 because it is absent or non-integer",
789 mutable_field_label(field)
790 )
791 })?;
792 let after = after.ok_or_else(|| {
793 format!(
794 "node `{node_id}` {} may not be removed in phase 1",
795 mutable_field_label(field)
796 )
797 })?;
798 if after < policy.retry_min as i64 || after > policy.retry_max as i64 {
799 return Err(format!(
800 "node `{node_id}` {} must stay within {}..={}",
801 mutable_field_label(field),
802 policy.retry_min,
803 policy.retry_max
804 ));
805 }
806 let delta = (after - before).abs();
807 if delta == 0 {
808 return Err(format!(
809 "node `{node_id}` {} must change",
810 mutable_field_label(field)
811 ));
812 }
813 if delta > policy.retry_delta.abs() as i64 {
814 return Err(format!(
815 "node `{node_id}` {} exceeds phase 1 retry delta limit",
816 mutable_field_label(field)
817 ));
818 }
819 Ok(OptimizationValidatedMutation {
820 node_id: node_id.to_string(),
821 field,
822 summary: format!(
823 "mutate node `{node_id}` {} from {before} to {after}",
824 mutable_field_label(field)
825 ),
826 })
827}
828
/// Validates that `candidate` differs from `baseline` by exactly one allowed
/// phase 1 mutation and returns a description of that mutation.
///
/// The candidate must be identical to the baseline outside `flow`, must have
/// the same nodes in the same order, and for each node may differ only in
/// `objective`, `output_contract.summary_guidance`, `timeout_ms`, and the
/// `max_attempts` / `retries` keys of `retry_policy`. Every differing field
/// must be allowed by `phase1.mutation_policy` and pass its per-field limits.
///
/// # Errors
///
/// Returns a message for the first violated rule. Checks run node by node, so
/// a per-field problem on an early node is reported before the "too many
/// nodes / too many fields" checks, which only run after all nodes were
/// scanned.
pub fn validate_phase1_candidate_mutation(
    baseline: &AutomationV2Spec,
    candidate: &AutomationV2Spec,
    phase1: &OptimizationPhase1Config,
) -> Result<OptimizationValidatedMutation, String> {
    // Everything outside `flow` must serialize to identical JSON.
    if normalized_workflow_without_flow(baseline) != normalized_workflow_without_flow(candidate) {
        return Err(
            "phase 1 candidate may not mutate workflow fields outside flow.nodes".to_string(),
        );
    }
    if baseline.flow.nodes.len() != candidate.flow.nodes.len() {
        return Err("phase 1 candidate may not add or remove workflow nodes".to_string());
    }
    // Collects one entry per validated field change across all nodes.
    let mut changes = Vec::new();
    // Nodes are compared pairwise by position; the lengths were checked above.
    for (baseline_node, candidate_node) in
        baseline.flow.nodes.iter().zip(candidate.flow.nodes.iter())
    {
        if baseline_node.node_id != candidate_node.node_id {
            return Err("phase 1 candidate may not reorder or replace workflow nodes".to_string());
        }
        // Structural guards: with the mutable parts stripped out, the node,
        // its output contract and its retry policy must be unchanged.
        if normalized_node_static_fields(baseline_node)
            != normalized_node_static_fields(candidate_node)
        {
            return Err(format!(
                "phase 1 candidate may not mutate node `{}` outside the allowed field families",
                baseline_node.node_id
            ));
        }
        if normalized_output_contract(&baseline_node.output_contract)
            != normalized_output_contract(&candidate_node.output_contract)
        {
            return Err(format!(
                "phase 1 candidate may not mutate node `{}` output_contract outside summary_guidance",
                baseline_node.node_id
            ));
        }
        if normalized_retry_policy(&baseline_node.retry_policy)
            != normalized_retry_policy(&candidate_node.retry_policy)
        {
            return Err(format!(
                "phase 1 candidate may not mutate node `{}` retry_policy outside max_attempts/retries",
                baseline_node.node_id
            ));
        }
        // Text field: objective.
        if baseline_node.objective != candidate_node.objective {
            if !phase1
                .mutation_policy
                .allowed_text_fields
                .contains(&OptimizationMutableField::Objective)
            {
                return Err(format!(
                    "node `{}` objective is not allowed by the phase 1 mutation policy",
                    baseline_node.node_id
                ));
            }
            changes.push(validate_text_mutation(
                &baseline_node.node_id,
                OptimizationMutableField::Objective,
                &baseline_node.objective,
                &candidate_node.objective,
                &phase1.mutation_policy,
            )?);
        }
        // Text field: output_contract.summary_guidance. `None` covers both a
        // missing contract and a contract without guidance.
        let baseline_summary = baseline_node
            .output_contract
            .as_ref()
            .and_then(|contract| contract.summary_guidance.as_deref());
        let candidate_summary = candidate_node
            .output_contract
            .as_ref()
            .and_then(|contract| contract.summary_guidance.as_deref());
        if baseline_summary != candidate_summary {
            if !phase1
                .mutation_policy
                .allowed_text_fields
                .contains(&OptimizationMutableField::OutputContractSummaryGuidance)
            {
                return Err(format!(
                    "node `{}` output_contract.summary_guidance is not allowed by the phase 1 mutation policy",
                    baseline_node.node_id
                ));
            }
            // Guidance may only be edited, never introduced or dropped.
            let before = baseline_summary.ok_or_else(|| {
                format!(
                    "node `{}` output_contract.summary_guidance may not be created in phase 1",
                    baseline_node.node_id
                )
            })?;
            let after = candidate_summary.ok_or_else(|| {
                format!(
                    "node `{}` output_contract.summary_guidance may not be removed in phase 1",
                    baseline_node.node_id
                )
            })?;
            changes.push(validate_text_mutation(
                &baseline_node.node_id,
                OptimizationMutableField::OutputContractSummaryGuidance,
                before,
                after,
                &phase1.mutation_policy,
            )?);
        }
        // Knob field: timeout_ms.
        if baseline_node.timeout_ms != candidate_node.timeout_ms {
            if !phase1
                .mutation_policy
                .allowed_knob_fields
                .contains(&OptimizationMutableField::TimeoutMs)
            {
                return Err(format!(
                    "node `{}` timeout_ms is not allowed by the phase 1 mutation policy",
                    baseline_node.node_id
                ));
            }
            changes.push(validate_timeout_mutation(
                &baseline_node.node_id,
                baseline_node.timeout_ms,
                candidate_node.timeout_ms,
                &phase1.mutation_policy,
            )?);
        }
        // Knob fields: the two retry keys are checked independently, so
        // changing both produces two entries in `changes`.
        for field in [
            OptimizationMutableField::RetryPolicyMaxAttempts,
            OptimizationMutableField::RetryPolicyRetries,
        ] {
            let before = retry_field_value(&baseline_node.retry_policy, field);
            let after = retry_field_value(&candidate_node.retry_policy, field);
            if before != after {
                if !phase1.mutation_policy.allowed_knob_fields.contains(&field) {
                    return Err(format!(
                        "node `{}` {} is not allowed by the phase 1 mutation policy",
                        baseline_node.node_id,
                        mutable_field_label(field)
                    ));
                }
                changes.push(validate_retry_mutation(
                    &baseline_node.node_id,
                    field,
                    before,
                    after,
                    &phase1.mutation_policy,
                )?);
            }
        }
    }
    if changes.is_empty() {
        return Err("phase 1 candidate must change exactly one allowed field family".to_string());
    }
    // Count distinct node ids among the changes.
    let changed_nodes = changes
        .iter()
        .map(|change| change.node_id.as_str())
        .collect::<std::collections::BTreeSet<_>>();
    if changed_nodes.len() > phase1.mutation_policy.max_nodes_changed_per_candidate as usize {
        return Err("phase 1 candidate may only change one node per experiment".to_string());
    }
    // The limit is compared against the number of individual field changes.
    if changes.len()
        > phase1
            .mutation_policy
            .max_field_families_changed_per_candidate as usize
    {
        return Err(
            "phase 1 candidate may only change one field family per experiment".to_string(),
        );
    }
    // `changes` was checked to be non-empty above, so this cannot panic.
    Ok(changes.into_iter().next().expect("non-empty change set"))
}
994
995fn metric_f64(metrics: &Value, key: &str) -> Option<f64> {
996 metrics.get(key).and_then(Value::as_f64)
997}
998
999pub fn parse_phase1_metrics(metrics: &Value) -> Result<OptimizationPhase1Metrics, String> {
1000 let artifact_validator_pass_rate = metric_f64(metrics, "artifact_validator_pass_rate")
1001 .or_else(|| metric_f64(metrics, "validator_pass_rate"))
1002 .ok_or_else(|| "phase 1 metrics require artifact_validator_pass_rate".to_string())?;
1003 let unmet_requirement_count = metric_f64(metrics, "unmet_requirement_count")
1004 .ok_or_else(|| "phase 1 metrics require unmet_requirement_count".to_string())?;
1005 let blocked_node_rate = metric_f64(metrics, "blocked_node_rate")
1006 .ok_or_else(|| "phase 1 metrics require blocked_node_rate".to_string())?;
1007 let budget_within_limits = metrics
1008 .get("budget_within_limits")
1009 .and_then(Value::as_bool)
1010 .ok_or_else(|| "phase 1 metrics require budget_within_limits".to_string())?;
1011 Ok(OptimizationPhase1Metrics {
1012 artifact_validator_pass_rate,
1013 unmet_requirement_count,
1014 blocked_node_rate,
1015 budget_within_limits,
1016 })
1017}
1018
1019pub fn derive_phase1_metrics_from_run(
1020 run: &AutomationV2RunRecord,
1021 baseline_snapshot: &AutomationV2Spec,
1022 phase1: &OptimizationPhase1Config,
1023) -> Result<OptimizationPhase1Metrics, String> {
1024 let total_nodes = baseline_snapshot.flow.nodes.len().max(1) as f64;
1025 let validator_outputs = run
1026 .checkpoint
1027 .node_outputs
1028 .values()
1029 .filter_map(Value::as_object)
1030 .filter_map(|row| row.get("validator_summary"))
1031 .filter_map(Value::as_object)
1032 .collect::<Vec<_>>();
1033 if validator_outputs.is_empty() {
1034 return Err("automation run does not contain validator-backed outputs".to_string());
1035 }
1036 let passed_count = validator_outputs
1037 .iter()
1038 .filter(|summary| {
1039 summary
1040 .get("outcome")
1041 .and_then(Value::as_str)
1042 .is_some_and(|value| value.eq_ignore_ascii_case("passed"))
1043 })
1044 .count() as f64;
1045 let unmet_requirement_count = validator_outputs
1046 .iter()
1047 .map(|summary| {
1048 summary
1049 .get("unmet_requirements")
1050 .and_then(Value::as_array)
1051 .map(|rows| rows.len() as f64)
1052 .unwrap_or(0.0)
1053 })
1054 .sum::<f64>();
1055 let runtime_ms = run
1056 .finished_at_ms
1057 .or(Some(run.updated_at_ms))
1058 .unwrap_or(run.updated_at_ms)
1059 .saturating_sub(run.started_at_ms.unwrap_or(run.created_at_ms));
1060 let within_tokens = phase1
1061 .budget
1062 .max_total_tokens
1063 .is_none_or(|limit| run.total_tokens <= limit);
1064 let within_cost = phase1
1065 .budget
1066 .max_total_cost_usd
1067 .is_none_or(|limit| run.estimated_cost_usd <= limit);
1068 let within_runtime =
1069 runtime_ms <= (phase1.budget.max_runtime_minutes as u64).saturating_mul(60_000);
1070 Ok(OptimizationPhase1Metrics {
1071 artifact_validator_pass_rate: passed_count / validator_outputs.len() as f64,
1072 unmet_requirement_count,
1073 blocked_node_rate: run.checkpoint.blocked_nodes.len() as f64 / total_nodes,
1074 budget_within_limits: within_tokens && within_cost && within_runtime,
1075 })
1076}
1077
1078pub fn derive_phase1_validator_case_outcomes_from_run(
1079 run: &AutomationV2RunRecord,
1080) -> std::collections::BTreeMap<String, String> {
1081 run.checkpoint
1082 .node_outputs
1083 .iter()
1084 .filter_map(|(node_id, output)| {
1085 let outcome = output
1086 .as_object()
1087 .and_then(|row| row.get("validator_summary"))
1088 .and_then(Value::as_object)
1089 .and_then(|summary| summary.get("outcome"))
1090 .and_then(Value::as_str)
1091 .map(str::trim)
1092 .filter(|value| !value.is_empty())?;
1093 Some((node_id.clone(), outcome.to_ascii_lowercase()))
1094 })
1095 .collect()
1096}
1097
1098pub fn evaluate_phase1_promotion(
1099 baseline: &OptimizationPhase1Metrics,
1100 candidate: &OptimizationPhase1Metrics,
1101) -> OptimizationPromotionDecision {
1102 if !candidate.budget_within_limits {
1103 return OptimizationPromotionDecision {
1104 decision: OptimizationPromotionDecisionKind::Discard,
1105 reason: "candidate exceeded phase 1 budget ceilings".to_string(),
1106 };
1107 }
1108 if candidate.blocked_node_rate > baseline.blocked_node_rate {
1109 return OptimizationPromotionDecision {
1110 decision: OptimizationPromotionDecisionKind::Discard,
1111 reason: "candidate increased blocked_node_rate".to_string(),
1112 };
1113 }
1114 if candidate.artifact_validator_pass_rate > baseline.artifact_validator_pass_rate {
1115 return OptimizationPromotionDecision {
1116 decision: OptimizationPromotionDecisionKind::Promote,
1117 reason: "candidate improved artifact_validator_pass_rate".to_string(),
1118 };
1119 }
1120 if (candidate.artifact_validator_pass_rate - baseline.artifact_validator_pass_rate).abs()
1121 <= f64::EPSILON
1122 && candidate.unmet_requirement_count < baseline.unmet_requirement_count
1123 {
1124 return OptimizationPromotionDecision {
1125 decision: OptimizationPromotionDecisionKind::Promote,
1126 reason: "candidate improved unmet_requirement_count on a primary-metric tie"
1127 .to_string(),
1128 };
1129 }
1130 OptimizationPromotionDecision {
1131 decision: OptimizationPromotionDecisionKind::Discard,
1132 reason: "candidate did not beat the current phase 1 baseline".to_string(),
1133 }
1134}
1135
1136pub fn establish_phase1_baseline(
1137 replays: &[OptimizationBaselineReplayRecord],
1138 phase1: &OptimizationPhase1Config,
1139) -> Result<OptimizationPhase1Metrics, String> {
1140 let required_runs = phase1.eval.campaign_start_baseline_runs.max(1) as usize;
1141 if replays.len() < required_runs {
1142 return Err(format!(
1143 "phase 1 baseline establishment requires at least {required_runs} replay runs"
1144 ));
1145 }
1146 let relevant = &replays[replays.len() - required_runs..];
1147 if relevant
1148 .iter()
1149 .any(|replay| !replay.phase1_metrics.budget_within_limits)
1150 {
1151 return Err("phase 1 baseline replay exceeded budget ceilings".to_string());
1152 }
1153 let validator_min = relevant
1154 .iter()
1155 .map(|replay| replay.phase1_metrics.artifact_validator_pass_rate)
1156 .fold(f64::INFINITY, f64::min);
1157 let validator_max = relevant
1158 .iter()
1159 .map(|replay| replay.phase1_metrics.artifact_validator_pass_rate)
1160 .fold(f64::NEG_INFINITY, f64::max);
1161 if validator_max - validator_min > 0.05 {
1162 return Err(
1163 "phase 1 baseline replay drift exceeded 5 percentage points for artifact_validator_pass_rate"
1164 .to_string(),
1165 );
1166 }
1167 let blocked_min = relevant
1168 .iter()
1169 .map(|replay| replay.phase1_metrics.blocked_node_rate)
1170 .fold(f64::INFINITY, f64::min);
1171 let blocked_max = relevant
1172 .iter()
1173 .map(|replay| replay.phase1_metrics.blocked_node_rate)
1174 .fold(f64::NEG_INFINITY, f64::max);
1175 if blocked_max - blocked_min > 0.05 {
1176 return Err(
1177 "phase 1 baseline replay drift exceeded 5 percentage points for blocked_node_rate"
1178 .to_string(),
1179 );
1180 }
1181 let mut case_outcomes =
1182 std::collections::BTreeMap::<String, std::collections::BTreeSet<String>>::new();
1183 for replay in relevant {
1184 for (case_id, outcome) in &replay.validator_case_outcomes {
1185 case_outcomes
1186 .entry(case_id.clone())
1187 .or_default()
1188 .insert(outcome.clone());
1189 }
1190 }
1191 let total_cases = case_outcomes.len();
1192 if total_cases > 0 {
1193 let flaky_cases = case_outcomes
1194 .values()
1195 .filter(|outcomes| outcomes.len() > 1)
1196 .count();
1197 if (flaky_cases as f64 / total_cases as f64) > 0.10 {
1198 return Err(format!(
1199 "phase 1 baseline replay flakiness exceeded 10% of validator cases ({flaky_cases}/{total_cases})"
1200 ));
1201 }
1202 }
1203 let count = relevant.len() as f64;
1204 Ok(OptimizationPhase1Metrics {
1205 artifact_validator_pass_rate: relevant
1206 .iter()
1207 .map(|replay| replay.phase1_metrics.artifact_validator_pass_rate)
1208 .sum::<f64>()
1209 / count,
1210 unmet_requirement_count: relevant
1211 .iter()
1212 .map(|replay| replay.phase1_metrics.unmet_requirement_count)
1213 .sum::<f64>()
1214 / count,
1215 blocked_node_rate: relevant
1216 .iter()
1217 .map(|replay| replay.phase1_metrics.blocked_node_rate)
1218 .sum::<f64>()
1219 / count,
1220 budget_within_limits: true,
1221 })
1222}
1223
1224pub fn phase1_baseline_replay_due(
1225 replays: &[OptimizationBaselineReplayRecord],
1226 pending_run_count: usize,
1227 phase1: &OptimizationPhase1Config,
1228 experiment_count: usize,
1229 now_ms: u64,
1230) -> bool {
1231 if pending_run_count > 0 {
1232 return false;
1233 }
1234 let required_runs = phase1.eval.campaign_start_baseline_runs.max(1) as usize;
1235 if replays.len() < required_runs {
1236 return true;
1237 }
1238 let Some(last_replay) = replays.last() else {
1239 return true;
1240 };
1241 let candidate_interval = phase1.eval.baseline_replay_every_candidates.max(1) as usize;
1242 let candidates_since_last =
1243 experiment_count.saturating_sub(last_replay.experiment_count_at_recording as usize);
1244 if candidates_since_last >= candidate_interval {
1245 return true;
1246 }
1247 let replay_interval_ms = (phase1.eval.baseline_replay_every_minutes.max(1) as u64) * 60_000;
1248 now_ms.saturating_sub(last_replay.recorded_at_ms) >= replay_interval_ms
1249}
1250
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;
    use std::collections::HashMap;

    /// Phase 1 configuration shared by every test in this module.
    fn sample_phase1() -> OptimizationPhase1Config {
        let eval = OptimizationEvalSpec {
            pack_ref: "eval-pack.jsonl".to_owned(),
            primary_metric: OptimizationMetricKind::ArtifactValidatorPassRate,
            secondary_metric: OptimizationMetricKind::UnmetRequirementCount,
            hard_guardrails: vec![
                OptimizationGuardrailKind::BlockedNodeRate,
                OptimizationGuardrailKind::BudgetCeilings,
            ],
            campaign_start_baseline_runs: 2,
            baseline_replay_every_candidates: 5,
            baseline_replay_every_minutes: 30,
        };
        let mutation_policy = OptimizationMutationPolicy {
            max_nodes_changed_per_candidate: 1,
            max_field_families_changed_per_candidate: 1,
            allowed_text_fields: vec![
                OptimizationMutableField::Objective,
                OptimizationMutableField::OutputContractSummaryGuidance,
            ],
            allowed_knob_fields: vec![
                OptimizationMutableField::TimeoutMs,
                OptimizationMutableField::RetryPolicyMaxAttempts,
                OptimizationMutableField::RetryPolicyRetries,
            ],
            max_text_delta_chars: 300,
            max_text_delta_ratio: 0.25,
            timeout_delta_percent: 0.15,
            timeout_delta_ms: 30_000,
            timeout_min_ms: 30_000,
            timeout_max_ms: 600_000,
            retry_delta: 1,
            retry_min: 0,
            retry_max: 3,
            allow_text_and_knob_bundle: false,
        };
        let scope = OptimizationSafetyScope {
            candidate_snapshot_only: true,
            allow_live_source_mutation: false,
            allow_external_side_effects_in_eval: false,
            promotion_requires_operator_approval: true,
            forbidden_fields: vec!["agents".to_owned()],
        };
        let budget = OptimizationBudgetPolicy {
            max_experiments: 10,
            max_runtime_minutes: 60,
            max_consecutive_failures: 3,
            max_total_tokens: Some(50_000),
            max_total_cost_usd: Some(10.0),
        };
        OptimizationPhase1Config {
            objective_markdown: "improve output quality".to_owned(),
            eval,
            mutation_policy,
            scope,
            budget,
        }
    }

    /// Single-agent, single-node workflow used as the optimization target.
    fn sample_workflow() -> AutomationV2Spec {
        let agent = crate::AutomationAgentProfile {
            agent_id: "agent-1".to_owned(),
            template_id: None,
            display_name: "Worker".to_owned(),
            avatar_url: None,
            model_policy: None,
            skills: Vec::new(),
            tool_policy: crate::AutomationAgentToolPolicy {
                allowlist: Vec::new(),
                denylist: Vec::new(),
            },
            mcp_policy: crate::AutomationAgentMcpPolicy {
                allowed_servers: Vec::new(),
                allowed_tools: None,
            },
            approval_policy: None,
        };
        let node = crate::AutomationFlowNode {
            knowledge: tandem_orchestrator::KnowledgeBinding::default(),
            node_id: "node-1".to_owned(),
            agent_id: "agent-1".to_owned(),
            objective: "Write a concise report for the user".to_owned(),
            depends_on: Vec::new(),
            input_refs: Vec::new(),
            output_contract: Some(crate::AutomationFlowOutputContract {
                kind: "report".to_owned(),
                validator: Some(crate::AutomationOutputValidatorKind::ResearchBrief),
                enforcement: None,
                schema: None,
                summary_guidance: Some("Summarize clearly.".to_owned()),
            }),
            retry_policy: Some(json!({ "max_attempts": 1, "retries": 0 })),
            timeout_ms: Some(60_000),
            max_tool_calls: None,
            stage_kind: None,
            gate: None,
            metadata: None,
        };
        AutomationV2Spec {
            automation_id: "wf-opt".to_owned(),
            name: "Optimization Target".to_owned(),
            description: Some("workflow".to_owned()),
            status: crate::AutomationV2Status::Draft,
            schedule: crate::AutomationV2Schedule {
                schedule_type: crate::AutomationV2ScheduleType::Manual,
                cron_expression: None,
                interval_seconds: None,
                timezone: "UTC".to_owned(),
                misfire_policy: crate::RoutineMisfirePolicy::Skip,
            },
            knowledge: tandem_orchestrator::KnowledgeBinding::default(),
            agents: vec![agent],
            flow: crate::AutomationFlowSpec { nodes: vec![node] },
            execution: crate::AutomationExecutionPolicy {
                max_parallel_agents: None,
                max_total_runtime_ms: None,
                max_total_tool_calls: None,
                max_total_tokens: None,
                max_total_cost_usd: None,
            },
            output_targets: Vec::new(),
            created_at_ms: 1,
            updated_at_ms: 1,
            creator_id: "test".to_owned(),
            workspace_root: Some("/tmp/workflow".to_owned()),
            metadata: None,
            next_fire_at_ms: None,
            last_fired_at_ms: None,
            scope_policy: None,
            watch_conditions: Vec::new(),
            handoff_config: None,
        }
    }

    /// Within-budget metrics with the given pass rate, unmet count and blocked rate.
    fn metrics(pass_rate: f64, unmet: f64, blocked_rate: f64) -> OptimizationPhase1Metrics {
        OptimizationPhase1Metrics {
            artifact_validator_pass_rate: pass_rate,
            unmet_requirement_count: unmet,
            blocked_node_rate: blocked_rate,
            budget_within_limits: true,
        }
    }

    /// Replay record taken at experiment count 0 with the given case outcomes.
    fn replay(
        replay_id: &str,
        phase1_metrics: OptimizationPhase1Metrics,
        outcomes: &[(&str, &str)],
        recorded_at_ms: u64,
    ) -> OptimizationBaselineReplayRecord {
        OptimizationBaselineReplayRecord {
            replay_id: replay_id.to_owned(),
            automation_run_id: None,
            phase1_metrics,
            validator_case_outcomes: outcomes
                .iter()
                .map(|&(case_id, outcome)| (case_id.to_owned(), outcome.to_owned()))
                .collect(),
            experiment_count_at_recording: 0,
            recorded_at_ms,
        }
    }

    /// Asserts two floats agree to within `1e-9`.
    fn assert_close(actual: f64, expected: f64) {
        assert!((actual - expected).abs() < 1e-9);
    }

    #[test]
    fn validate_phase1_candidate_accepts_single_objective_change() {
        let phase1 = sample_phase1();
        let baseline = sample_workflow();
        let mut candidate = baseline.clone();
        candidate.flow.nodes[0].objective = "Write a concise report for the team".to_owned();

        let mutation =
            validate_phase1_candidate_mutation(&baseline, &candidate, &phase1).expect("valid");

        assert_eq!(mutation.node_id, "node-1");
        assert_eq!(mutation.field, OptimizationMutableField::Objective);
    }

    #[test]
    fn validate_phase1_candidate_rejects_mutation_bundle() {
        let phase1 = sample_phase1();
        let baseline = sample_workflow();
        let mut candidate = baseline.clone();
        {
            let node = &mut candidate.flow.nodes[0];
            node.objective = "Write a concise report for the team".to_owned();
            node.timeout_ms = Some(65_000);
        }

        let error =
            validate_phase1_candidate_mutation(&baseline, &candidate, &phase1).expect_err("bundle");

        assert!(error.contains("one field family"));
    }

    #[test]
    fn validate_phase1_candidate_rejects_oversize_text_delta() {
        let phase1 = sample_phase1();
        let baseline = sample_workflow();
        let mut candidate = baseline.clone();
        candidate.flow.nodes[0].objective = "x".repeat(400);

        let error = validate_phase1_candidate_mutation(&baseline, &candidate, &phase1)
            .expect_err("oversize");

        assert!(error.contains("max_text_delta_chars") || error.contains("max_text_delta_ratio"));
    }

    #[test]
    fn evaluate_phase1_promotion_prefers_primary_metric() {
        let baseline = metrics(0.7, 2.0, 0.1);
        let candidate = metrics(0.8, 3.0, 0.1);

        let decision = evaluate_phase1_promotion(&baseline, &candidate);

        assert_eq!(
            decision.decision,
            OptimizationPromotionDecisionKind::Promote
        );
    }

    #[test]
    fn evaluate_phase1_promotion_uses_secondary_metric_on_tie() {
        let baseline = metrics(0.8, 2.0, 0.1);
        let candidate = metrics(0.8, 1.0, 0.1);

        let decision = evaluate_phase1_promotion(&baseline, &candidate);

        assert_eq!(
            decision.decision,
            OptimizationPromotionDecisionKind::Promote
        );
    }

    #[test]
    fn evaluate_phase1_promotion_rejects_guardrail_regression() {
        let baseline = metrics(0.8, 2.0, 0.1);
        let candidate = metrics(0.9, 1.0, 0.2);

        let decision = evaluate_phase1_promotion(&baseline, &candidate);

        assert_eq!(
            decision.decision,
            OptimizationPromotionDecisionKind::Discard
        );
    }

    #[test]
    fn establish_phase1_baseline_averages_stable_replays() {
        let phase1 = sample_phase1();
        let replays = vec![
            replay("replay-1", metrics(0.8, 1.0, 0.0), &[("node-1", "passed")], 1),
            replay("replay-2", metrics(0.84, 2.0, 0.02), &[("node-1", "passed")], 2),
        ];

        let baseline = establish_phase1_baseline(&replays, &phase1).expect("stable");

        assert_close(baseline.artifact_validator_pass_rate, 0.82);
        assert_close(baseline.unmet_requirement_count, 1.5);
        assert_close(baseline.blocked_node_rate, 0.01);
    }

    #[test]
    fn establish_phase1_baseline_rejects_validator_drift() {
        let phase1 = sample_phase1();
        let replays = vec![
            replay("replay-1", metrics(0.8, 1.0, 0.0), &[("node-1", "passed")], 1),
            replay("replay-2", metrics(0.9, 1.0, 0.0), &[("node-1", "passed")], 2),
        ];

        let error = establish_phase1_baseline(&replays, &phase1).expect_err("drift");

        assert!(error.contains("artifact_validator_pass_rate"));
    }

    #[test]
    fn establish_phase1_baseline_rejects_flaky_validator_cases() {
        let phase1 = sample_phase1();
        // Two of eight cases flip between the replays.
        let replays = vec![
            replay(
                "replay-1",
                metrics(1.0, 0.0, 0.0),
                &[
                    ("node-1", "passed"),
                    ("node-2", "passed"),
                    ("node-3", "passed"),
                    ("node-4", "passed"),
                    ("node-5", "passed"),
                    ("node-6", "passed"),
                    ("node-7", "passed"),
                    ("node-8", "passed"),
                ],
                1,
            ),
            replay(
                "replay-2",
                metrics(1.0, 0.0, 0.0),
                &[
                    ("node-1", "blocked"),
                    ("node-2", "blocked"),
                    ("node-3", "passed"),
                    ("node-4", "passed"),
                    ("node-5", "passed"),
                    ("node-6", "passed"),
                    ("node-7", "passed"),
                    ("node-8", "passed"),
                ],
                2,
            ),
        ];

        let error = establish_phase1_baseline(&replays, &phase1).expect_err("flaky");

        assert!(error.contains("flakiness"));
    }

    #[test]
    fn derive_phase1_metrics_from_run_uses_validator_outputs_and_budget() {
        let phase1 = sample_phase1();
        let workflow = sample_workflow();

        let passed_output = json!({
            "validator_summary": {
                "outcome": "passed",
                "unmet_requirements": []
            }
        });
        let blocked_output = json!({
            "validator_summary": {
                "outcome": "blocked",
                "unmet_requirements": ["citation_missing", "web_sources_reviewed_missing"]
            }
        });
        let checkpoint = crate::AutomationRunCheckpoint {
            completed_nodes: vec!["node-1".to_owned()],
            pending_nodes: Vec::new(),
            node_outputs: HashMap::from([
                ("node-1".to_owned(), passed_output),
                ("node-2".to_owned(), blocked_output),
            ]),
            node_attempts: HashMap::new(),
            blocked_nodes: vec!["node-2".to_owned()],
            awaiting_gate: None,
            gate_history: Vec::new(),
            lifecycle_history: Vec::new(),
            last_failure: None,
        };
        let run = AutomationV2RunRecord {
            run_id: "run-1".to_owned(),
            automation_id: workflow.automation_id.clone(),
            tenant_context: tandem_types::TenantContext::local_implicit(),
            trigger_type: "manual".to_owned(),
            status: crate::AutomationRunStatus::Completed,
            created_at_ms: 1,
            updated_at_ms: 10_000,
            started_at_ms: Some(1_000),
            finished_at_ms: Some(9_000),
            active_session_ids: Vec::new(),
            latest_session_id: None,
            active_instance_ids: Vec::new(),
            checkpoint,
            runtime_context: None,
            automation_snapshot: Some(workflow.clone()),
            pause_reason: None,
            resume_reason: None,
            detail: None,
            stop_kind: None,
            stop_reason: None,
            prompt_tokens: 0,
            completion_tokens: 0,
            total_tokens: 100,
            estimated_cost_usd: 0.5,
            scheduler: None,
            trigger_reason: None,
            consumed_handoff_id: None,
        };

        let metrics = derive_phase1_metrics_from_run(&run, &workflow, &phase1).expect("metrics");
        assert_close(metrics.artifact_validator_pass_rate, 0.5);
        assert_close(metrics.unmet_requirement_count, 2.0);
        assert_close(metrics.blocked_node_rate, 1.0);
        assert!(metrics.budget_within_limits);

        let case_outcomes = derive_phase1_validator_case_outcomes_from_run(&run);
        assert_eq!(
            case_outcomes.get("node-1").map(String::as_str),
            Some("passed")
        );
        assert_eq!(
            case_outcomes.get("node-2").map(String::as_str),
            Some("blocked")
        );
    }

    #[test]
    fn phase1_baseline_replay_due_requires_initial_replays() {
        let phase1 = sample_phase1();

        assert!(phase1_baseline_replay_due(&[], 0, &phase1, 0, 0));
        assert!(!phase1_baseline_replay_due(&[], 1, &phase1, 0, 0));
    }

    #[test]
    fn phase1_baseline_replay_due_uses_candidate_and_time_intervals() {
        let phase1 = sample_phase1();
        // Both replays were recorded after two experiments had run.
        let replays = vec![
            OptimizationBaselineReplayRecord {
                experiment_count_at_recording: 2,
                ..replay(
                    "replay-1",
                    metrics(1.0, 0.0, 0.0),
                    &[("node-1", "passed")],
                    1_000,
                )
            },
            OptimizationBaselineReplayRecord {
                experiment_count_at_recording: 2,
                ..replay(
                    "replay-2",
                    metrics(1.0, 0.0, 0.0),
                    &[("node-1", "passed")],
                    1_500,
                )
            },
        ];

        assert!(phase1_baseline_replay_due(&replays, 0, &phase1, 7, 2_000));
        assert!(phase1_baseline_replay_due(
            &replays, 0, &phase1, 3, 1_801_500
        ));
        assert!(!phase1_baseline_replay_due(&replays, 0, &phase1, 3, 2_000));
    }
}