use super::*;
use awaken_eval::{Expectation, ReplayReport};
#[test]
fn cell_timeout_outcome_reports_as_failed() {
let expect = Expectation::default();
let (outcome, failures) = cell_timeout_outcome("fx".into(), 5, &expect);
assert!(outcome.runtime_failure.is_some());
assert!(!failures.is_empty(), "timeout must produce failures");
let report = ReplayReport::from_outcome(&outcome, failures);
assert!(!report.passed, "timeout cell must report passed=false");
assert!(
report
.failures
.iter()
.any(|f| matches!(f, awaken_eval::Failure::ReplayRuntimeFailure { .. })),
"expected ReplayRuntimeFailure in failures: {:?}",
report.failures
);
}
#[test]
fn cell_error_outcome_reports_as_failed() {
let expect = Expectation::default();
let outcome = ReplayOutcome {
fixture_id: "fx".into(),
final_text: String::new(),
metrics: awaken_ext_observability::AgentMetrics::default(),
elapsed: std::time::Duration::ZERO,
error_type: None,
inference_error_count: 0,
runtime_failure: None,
revision_count: 0,
judge_score: None,
judge_reasoning: None,
};
let (outcome, failures) =
cell_error_outcome(outcome, "judge returned non-JSON".into(), &expect);
let report = ReplayReport::from_outcome(&outcome, failures);
assert!(!report.passed);
assert!(report.runtime_failure.is_some());
}
#[test]
fn cell_error_outcome_preserves_real_outcome_data() {
use awaken_eval::Failure;
use awaken_ext_observability::{AgentMetrics, GenAISpan, SpanContext};
let expect = Expectation {
final_answer_contains: vec!["42".into()],
..Expectation::default()
};
let mut inf_span = GenAISpan {
context: SpanContext {
run_id: "RUN-REAL".into(),
..SpanContext::default()
},
step_index: None,
model: "m".into(),
provider: "p".into(),
operation: "chat".into(),
response_model: None,
response_id: None,
finish_reasons: Vec::new(),
error_type: None,
error_class: None,
thinking_tokens: None,
input_tokens: Some(10),
output_tokens: Some(20),
total_tokens: Some(30),
cache_read_input_tokens: None,
cache_creation_input_tokens: None,
temperature: None,
top_p: None,
max_tokens: None,
stop_sequences: Vec::new(),
duration_ms: 5,
started_at_ms: 0,
ended_at_ms: 5,
response_content: None,
response_tool_calls: None,
request_messages: None,
};
inf_span.context.run_id = "RUN-REAL".into();
let metrics = AgentMetrics {
inferences: vec![inf_span],
session_duration_ms: 42,
..Default::default()
};
let real_outcome = ReplayOutcome {
fixture_id: "fx".into(),
final_text: "the answer is 42".into(),
metrics,
elapsed: std::time::Duration::from_millis(123),
error_type: None,
inference_error_count: 0,
runtime_failure: None,
revision_count: 2,
judge_score: None,
judge_reasoning: None,
};
let (outcome, failures) = cell_error_outcome(
real_outcome,
"scoring failed: judge timeout".into(),
&expect,
);
assert!(
!failures
.iter()
.any(|f| matches!(f, Failure::AnswerMissingPhrase { .. })),
"must not fabricate AnswerMissingPhrase from a blanked final_text: {failures:?}",
);
assert!(matches!(
outcome.runtime_failure,
Some(awaken_eval::ReplayRuntimeFailure::RuntimeError { .. })
));
let report = ReplayReport::from_outcome(&outcome, failures);
assert!(!report.passed);
assert_eq!(report.final_text, "the answer is 42");
assert_eq!(report.total_input_tokens, 10);
assert_eq!(report.total_output_tokens, 20);
assert_eq!(report.total_tokens, 30);
assert_eq!(report.inference_count, 1);
assert_eq!(report.session_duration_ms, 42);
assert_eq!(report.elapsed_ms, 123);
assert_eq!(report.revision_count, 2);
assert_eq!(outcome.trace_run_id(), Some("RUN-REAL"));
assert!(report.runtime_failure.is_some());
}
#[test]
fn cell_error_outcome_preserves_existing_runtime_failure() {
use awaken_eval::{Failure, ReplayRuntimeFailure};
let expect = Expectation::default();
let mut real_outcome = ReplayOutcome {
fixture_id: "fx".into(),
final_text: "partial reply".into(),
metrics: awaken_ext_observability::AgentMetrics::default(),
elapsed: std::time::Duration::from_millis(50),
error_type: None,
inference_error_count: 0,
runtime_failure: None,
revision_count: 0,
judge_score: None,
judge_reasoning: None,
};
real_outcome.runtime_failure = Some(ReplayRuntimeFailure::RuntimeError {
message: "token budget exceeded".into(),
});
let (outcome, failures) = cell_error_outcome(
real_outcome,
"scoring failed: judge returned non-JSON".into(),
&expect,
);
match &outcome.runtime_failure {
Some(ReplayRuntimeFailure::RuntimeError { message }) => {
assert_eq!(message, "token budget exceeded");
}
other => panic!("expected preserved RuntimeError, got {other:?}"),
}
let runtime_failure_messages: Vec<String> = failures
.iter()
.filter_map(|f| match f {
Failure::ReplayRuntimeFailure {
failure: ReplayRuntimeFailure::RuntimeError { message },
} => Some(message.clone()),
_ => None,
})
.collect();
assert!(
runtime_failure_messages
.iter()
.any(|m| m == "token budget exceeded"),
"expected original replay runtime_failure to remain in failures list: {runtime_failure_messages:?}",
);
assert!(
runtime_failure_messages
.iter()
.any(|m| m == "scoring failed: judge returned non-JSON"),
"expected scoring error to be appended to failures list: {runtime_failure_messages:?}",
);
}
#[tokio::test]
async fn score_outcome_returns_judge_result_so_caller_can_stamp_outcome() {
use awaken_eval::test_support::ScriptedExecutor;
use awaken_eval::{Failure, Fixture, LlmExecutorJudge, MockResponse};
let judge_executor = ScriptedExecutor::new(
"judge-stub",
vec![r#"{"score": 0.91, "reasoning": "thorough"}"#],
)
.arc();
let context = JudgeContext {
judge: LlmExecutorJudge::new(judge_executor, "judge-model"),
rubric: None,
revise_max_retries: None,
};
let fixture = Fixture {
id: "fx".into(),
description: None,
user_input: "explain it".into(),
provider_script: vec![],
provider_script_error: None,
source_run_id: None,
source_model_id: None,
allow_unused_provider_script: false,
mock_response: MockResponse::default(),
expect: Expectation {
min_judge_score: Some(0.7),
..Expectation::default()
},
continued_turns: vec![],
};
let outcome = ReplayOutcome {
fixture_id: "fx".into(),
final_text: "agent answer".into(),
metrics: awaken_ext_observability::AgentMetrics::default(),
elapsed: std::time::Duration::from_millis(10),
error_type: None,
inference_error_count: 0,
runtime_failure: None,
revision_count: 0,
judge_score: None,
judge_reasoning: None,
};
let (failures, judge_result) = score_outcome(&outcome, &fixture, Some(&context))
.await
.expect("judge invocation succeeded");
assert!(
!failures
.iter()
.any(|f| matches!(f, Failure::JudgeBelowThreshold { .. })),
"0.91 is above threshold; no JudgeBelowThreshold should fire"
);
let jr = judge_result.expect("score_outcome must surface the JudgeResult to the caller");
assert!((jr.score - 0.91).abs() < f32::EPSILON);
assert_eq!(jr.reasoning.as_deref(), Some("thorough"));
let mut outcome = outcome;
outcome.judge_score = Some(jr.score);
outcome.judge_reasoning = jr.reasoning;
let report = ReplayReport::from_outcome(&outcome, failures);
assert_eq!(report.judge_score, Some(0.91));
assert_eq!(report.judge_reasoning.as_deref(), Some("thorough"));
assert!(report.passed, "above threshold and no other failures");
}
#[test]
fn revise_tuple_for_treats_zero_retries_as_disabled() {
use awaken_eval::LlmExecutorJudge;
use awaken_eval::test_support::ScriptedExecutor;
let context = JudgeContext {
judge: LlmExecutorJudge::new(ScriptedExecutor::new("judge", vec!["{}"]).arc(), "judge"),
rubric: Some("grade".into()),
revise_max_retries: Some(0),
};
let expect = Expectation {
min_judge_score: Some(0.8),
..Expectation::default()
};
assert!(
revise_tuple_for(Some(&context), &expect).is_none(),
"0 retries means no replay-phase revise loop; judge remains in scoring phase"
);
}
#[tokio::test]
async fn zero_retry_judge_timeout_preserves_replay_outcome() {
use async_trait::async_trait;
use awaken_eval::test_support::ScriptedExecutor;
use awaken_eval::{Fixture, LlmExecutorJudge, MatrixCell, MockResponse, ReplayRuntimeFailure};
use awaken_server_contract::contract::content::ContentBlock;
use awaken_server_contract::contract::executor::{
InferenceExecutionError, InferenceRequest, LlmExecutor,
};
use awaken_server_contract::contract::inference::{StopReason, StreamResult, TokenUsage};
use awaken_server_contract::registry_spec::ModelSpec;
struct SlowJudgeExecutor;
#[async_trait]
impl LlmExecutor for SlowJudgeExecutor {
async fn execute(
&self,
_request: InferenceRequest,
) -> Result<StreamResult, InferenceExecutionError> {
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
Ok(StreamResult {
content: vec![ContentBlock::text(r#"{"score":1.0,"reasoning":"late"}"#)],
tool_calls: vec![],
usage: Some(TokenUsage::default()),
stop_reason: Some(StopReason::EndTurn),
has_incomplete_tool_calls: false,
})
}
fn name(&self) -> &str {
"slow-judge"
}
}
let fixture = Fixture {
id: "fx".into(),
description: None,
user_input: "say hi".into(),
provider_script: vec![],
provider_script_error: None,
source_run_id: None,
source_model_id: None,
allow_unused_provider_script: false,
mock_response: MockResponse::default(),
expect: Expectation {
final_answer_contains: vec!["hello".into()],
min_judge_score: Some(0.8),
..Expectation::default()
},
continued_turns: vec![],
};
let live_executor = ScriptedExecutor::new("live", vec!["hello"])
.with_tokens(TokenUsage {
prompt_tokens: Some(1),
completion_tokens: Some(1),
total_tokens: Some(2),
..Default::default()
})
.arc();
let resolved = ResolvedCell {
cell: MatrixCell {
model_id: Some("m".into()),
},
executor: live_executor,
upstream_model: "upstream".into(),
spec: ModelSpec::new("m", "provider", "upstream"),
};
let judge = JudgeContext {
judge: LlmExecutorJudge::new(std::sync::Arc::new(SlowJudgeExecutor), "judge-model"),
rubric: Some("grade correctness".into()),
revise_max_retries: Some(0),
};
let items = run_live_eval_cells(
&[fixture],
&[resolved],
LiveCellOptions {
samples: 1,
max_concurrent: 1,
max_walltime_secs: 1,
agent_base: None,
agent_overrides: None,
judge: Some(judge),
max_total_tokens: None,
trace_sink: None,
trace_store: None,
task_context: "test cell",
},
)
.await
.unwrap();
let report = &items[0].report;
assert_eq!(report.final_text, "hello");
assert_eq!(report.total_tokens, 2);
assert!(!report.passed);
assert!(matches!(
report.runtime_failure,
Some(ReplayRuntimeFailure::RuntimeError { ref message })
if message.contains("scoring timed out")
));
}
#[tokio::test]
async fn run_live_eval_cells_enforces_shared_token_budget() {
use awaken_eval::test_support::ScriptedExecutor;
use awaken_eval::{Fixture, MatrixCell, MockResponse, ReplayRuntimeFailure};
use awaken_server_contract::contract::inference::TokenUsage;
use awaken_server_contract::registry_spec::ModelSpec;
let fixture = Fixture {
id: "fx".into(),
description: None,
user_input: "say hi".into(),
provider_script: vec![],
provider_script_error: None,
source_run_id: None,
source_model_id: None,
allow_unused_provider_script: false,
mock_response: MockResponse::default(),
expect: Expectation::default(),
continued_turns: vec![],
};
let executor = ScriptedExecutor::new("live", vec!["hello"])
.with_tokens(TokenUsage {
prompt_tokens: Some(3),
completion_tokens: Some(2),
total_tokens: Some(5),
..Default::default()
})
.arc();
let resolved = ResolvedCell {
cell: MatrixCell {
model_id: Some("m".into()),
},
executor,
upstream_model: "upstream".into(),
spec: ModelSpec::new("m", "provider", "upstream"),
};
let items = run_live_eval_cells(
&[fixture],
&[resolved],
LiveCellOptions {
samples: 1,
max_concurrent: 1,
max_walltime_secs: 5,
agent_base: None,
agent_overrides: None,
judge: None,
max_total_tokens: Some(1),
trace_sink: None,
trace_store: None,
task_context: "test cell",
},
)
.await
.unwrap();
assert_eq!(items.len(), 1);
let report = &items[0].report;
assert!(!report.passed);
assert_eq!(report.total_tokens, 5);
assert!(matches!(
report.runtime_failure,
Some(ReplayRuntimeFailure::RuntimeError { ref message })
if message.contains("token budget exceeded")
));
}