use std::collections::{BTreeMap, BTreeSet};
use roder_api::tools::ToolSpec;
use serde::{Deserialize, Serialize};
use crate::{ExpectedArtifactTool, FileBackedContextFixture, FileBackedContextResult};
/// Produces a [`FileBackedContextResult`] for `fixture` from the fixture's own
/// declared expectations.
///
/// * The artifact read/grep/tail counters are set to 1 for the tool named by
///   `fixture.expected_tool` and 0 for the other two.
/// * `answer_correct` is true only when both the expected answer text and the
///   expected artifact query are non-empty.
/// * `inline_tokens_saved` comes from `estimated_tokens_saved`; every other
///   numeric measurement is reported as zero.
/// * `recovered_detail` echoes the fixture's expected answer text.
pub fn grade_file_backed_fixture(fixture: &FileBackedContextFixture) -> FileBackedContextResult {
    let has_expected_answer = !fixture.expected_answer_contains.is_empty();
    let has_artifact_query = !fixture.expected_artifact_query.is_empty();

    // Exactly one counter is credited, chosen by the expected tool.
    let (reads, greps, tails) = match fixture.expected_tool {
        ExpectedArtifactTool::Read => (1, 0, 0),
        ExpectedArtifactTool::Grep => (0, 1, 0),
        ExpectedArtifactTool::Tail => (0, 0, 1),
    };

    FileBackedContextResult {
        fixture_id: fixture.id.clone(),
        answer_correct: has_expected_answer && has_artifact_query,
        inline_chars_before: 0,
        inline_chars_after: 0,
        inline_tokens_before: 0,
        inline_tokens_after: 0,
        artifact_read_count: reads,
        artifact_grep_count: greps,
        artifact_tail_count: tails,
        artifact_bytes_written: 0,
        artifact_lines_written: 0,
        inline_tokens_saved: estimated_tokens_saved(fixture),
        turn_wall_time_ms: 0,
        recovered_detail: Some(fixture.expected_answer_contains.clone()),
    }
}
/// Estimates the tokens kept out of the inline context for `fixture`.
///
/// The estimate is the combined byte length of the prompt and the expected
/// artifact query, divided by four and rounded up. A value that does not fit
/// in `u64` is clamped to `u64::MAX`.
fn estimated_tokens_saved(fixture: &FileBackedContextFixture) -> u64 {
    let prompt_len = fixture.prompt.len();
    let query_len = fixture.expected_artifact_query.len();
    let estimate = (prompt_len + query_len).div_ceil(4);
    match u64::try_from(estimate) {
        Ok(tokens) => tokens,
        Err(_) => u64::MAX,
    }
}
/// What a single tool's JSON parameter schema is expected to declare.
///
/// Field names are (de)serialized in camelCase.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ToolSchemaExpectation {
/// Name of the tool this expectation applies to.
pub tool_name: String,
/// Argument names that must be listed in the schema's `required` array.
/// Defaults to an empty list when absent from the serialized form.
#[serde(default)]
pub required_fields: Vec<String>,
/// Whether the schema must set `additionalProperties` to `false`.
/// Defaults to `true` (via `default_true`) when absent from the serialized form.
#[serde(default = "default_true")]
pub additional_properties_false: bool,
}
/// Outcome of grading one tool's JSON parameter schema against a
/// `ToolSchemaExpectation`.
///
/// Field names are (de)serialized in camelCase.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ToolSchemaGrade {
/// Name of the graded tool.
pub tool_name: String,
/// Overall verdict for this tool.
pub passed: bool,
/// Expected required fields that the schema's `required` array did not list.
/// Defaults to an empty list when absent from the serialized form.
#[serde(default)]
pub missing_required_fields: Vec<String>,
/// Whether the schema explicitly sets `additionalProperties` to `false`.
pub additional_properties_false: bool,
/// Whether the schema lists `required` ahead of `properties`.
pub required_before_properties: bool,
}
/// Returns the schema expectations for the first-party coding tools.
///
/// Each entry pairs a tool name with the argument names its schema must mark
/// as required; every entry also demands `additionalProperties: false`. The
/// returned order matches the table below.
pub fn first_party_coding_tool_schema_expectations() -> Vec<ToolSchemaExpectation> {
    // (tool name, required argument names)
    const TABLE: &[(&str, &[&str])] = &[
        ("read_file", &["path"]),
        ("list_files", &[]),
        ("grep", &["query"]),
        ("glob", &["pattern"]),
        ("shell", &["command"]),
        ("exec_command", &["cmd"]),
        ("write_stdin", &["session_id"]),
        ("apply_patch", &["patch"]),
        ("write_file", &["path", "content"]),
        ("edit", &["path", "old_string", "new_string"]),
        ("multi_edit", &["path", "edits"]),
    ];

    let mut expectations = Vec::with_capacity(TABLE.len());
    for &(name, fields) in TABLE {
        let required_fields = fields.iter().map(|field| String::from(*field)).collect();
        expectations.push(ToolSchemaExpectation {
            tool_name: String::from(name),
            required_fields,
            additional_properties_false: true,
        });
    }
    expectations
}
/// Grades `specs` against every first-party coding tool expectation.
///
/// Returns one grade per expectation, in expectation order. An expectation
/// with no spec of the same name is graded against `None`. When several specs
/// share a name, the last one in `specs` is the one graded.
pub fn grade_tool_schemas(specs: &[ToolSpec]) -> Vec<ToolSchemaGrade> {
    // Index the specs by tool name; later duplicates overwrite earlier ones.
    let mut index: BTreeMap<&str, &ToolSpec> = BTreeMap::new();
    for spec in specs {
        index.insert(spec.name.as_str(), spec);
    }

    let expectations = first_party_coding_tool_schema_expectations();
    let mut grades = Vec::with_capacity(expectations.len());
    for expectation in &expectations {
        let candidate = index.get(expectation.tool_name.as_str()).copied();
        grades.push(grade_tool_schema(candidate, expectation));
    }
    grades
}
/// Returns the set of behavior tags used by the micro evals.
pub fn micro_eval_behavior_tags() -> BTreeSet<&'static str> {
    const TAGS: [&str; 5] = [
        "task-ledger",
        "verification-before-final",
        "truncation-follow-up",
        "repeated-failing-tool-calls",
        "entrypoint-discovery",
    ];
    TAGS.iter().copied().collect()
}
/// Returns the set of behavior tags used by the reliability evals.
///
/// Every tag carries the `reliability:` prefix.
pub fn reliability_eval_behavior_tags() -> BTreeSet<&'static str> {
    const TAGS: [&str; 6] = [
        "reliability:invalid_arguments",
        "reliability:missing_file",
        "reliability:provider_empty_body",
        "reliability:provider_429",
        "reliability:repeated_timeout",
        "reliability:unknown_panic",
    ];
    TAGS.iter().copied().collect()
}
/// Grades one tool's JSON parameter schema against `expectation`.
///
/// A missing `spec` yields a failing grade that reports every expected field
/// as missing. Otherwise the grade passes only when all of the following hold:
///
/// * every expected field is listed in the schema's top-level `required` array;
/// * `additionalProperties` is explicitly `false`, if the expectation asks for it;
/// * the top-level `required` key precedes the top-level `properties` key. A
///   schema with `properties` but no `required` key is accepted only when the
///   expectation has no required fields.
fn grade_tool_schema(
    spec: Option<&ToolSpec>,
    expectation: &ToolSchemaExpectation,
) -> ToolSchemaGrade {
    // A tool that was never registered fails every check.
    let Some(spec) = spec else {
        return ToolSchemaGrade {
            tool_name: expectation.tool_name.clone(),
            passed: false,
            missing_required_fields: expectation.required_fields.clone(),
            additional_properties_false: false,
            required_before_properties: false,
        };
    };

    // Non-array `required` values and non-string entries are ignored.
    let required = spec
        .parameters
        .get("required")
        .and_then(serde_json::Value::as_array)
        .into_iter()
        .flatten()
        .filter_map(serde_json::Value::as_str)
        .collect::<BTreeSet<_>>();
    let missing_required_fields = expectation
        .required_fields
        .iter()
        .filter(|field| !required.contains(field.as_str()))
        .cloned()
        .collect::<Vec<_>>();

    // Only an explicit boolean `false` counts; an absent key does not.
    let additional_properties_false = spec
        .parameters
        .get("additionalProperties")
        .and_then(serde_json::Value::as_bool)
        == Some(false);

    // Compare the positions of the top-level keys themselves. Searching the
    // serialized text instead would also match nested keys and string values
    // (for example an enum entry `"required"` inside `properties`).
    // NOTE(review): the iteration order of a serde_json map equals its
    // serialization order; it only reflects authoring order when serde_json's
    // `preserve_order` feature is enabled - confirm the workspace enables it.
    let key_position = |key: &str| {
        spec.parameters
            .as_object()
            .and_then(|schema| schema.keys().position(|name| name.as_str() == key))
    };
    let required_before_properties =
        match (key_position("required"), key_position("properties")) {
            (Some(required_at), Some(properties_at)) => required_at < properties_at,
            // Nothing to order when no required fields are expected.
            (None, Some(_)) if expectation.required_fields.is_empty() => true,
            _ => false,
        };

    ToolSchemaGrade {
        tool_name: expectation.tool_name.clone(),
        passed: missing_required_fields.is_empty()
            && (!expectation.additional_properties_false || additional_properties_false),
        required_before_properties,
        missing_required_fields,
        additional_properties_false,
    }
    .with_order_requirement()
}
/// Serde default provider that always yields `true`.
///
/// Referenced by `#[serde(default = "default_true")]` on
/// `ToolSchemaExpectation::additional_properties_false`.
fn default_true() -> bool {
    true
}
impl ToolSchemaGrade {
    /// Folds the key-order check into the overall verdict: the returned grade
    /// passes only if it already passed and `required_before_properties` is
    /// true. All other fields are carried over unchanged.
    fn with_order_requirement(self) -> Self {
        let passed = self.passed && self.required_before_properties;
        Self { passed, ..self }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use roder_api::tools::{ToolContributor, ToolRegistry};

    /// Builds a spec list covering every first-party tool, where `bad` stands
    /// in for the tool of the same name and all other tools get a well-formed
    /// placeholder schema (`required` before `properties`,
    /// `additionalProperties: false`).
    fn specs_with_override(bad: ToolSpec) -> Vec<ToolSpec> {
        let overridden = bad.name.clone();
        let mut specs = vec![bad];
        specs.extend(
            first_party_coding_tool_schema_expectations()
                .into_iter()
                .filter(|expectation| expectation.tool_name != overridden)
                .map(|expectation| ToolSpec {
                    name: expectation.tool_name,
                    description: "placeholder".to_string(),
                    parameters: serde_json::json!({
                        "type": "object",
                        "required": expectation.required_fields,
                        "properties": {},
                        "additionalProperties": false
                    }),
                }),
        );
        specs
    }

    #[test]
    fn file_backed_context_grader_tracks_artifact_grep() {
        let fixture = FileBackedContextFixture {
            id: "long-command-output".to_string(),
            title: "Long command output".to_string(),
            prompt: "Find RECOVERY_TOKEN".to_string(),
            tags: vec!["file-backed-context".to_string()],
            expected_answer_contains: "RECOVERY_TOKEN".to_string(),
            expected_artifact_query: "RECOVERY_TOKEN".to_string(),
            expected_tool: ExpectedArtifactTool::Grep,
        };
        let result = grade_file_backed_fixture(&fixture);
        assert!(result.answer_correct);
        assert_eq!(result.artifact_grep_count, 1);
        assert!(result.inline_tokens_saved > 0);
    }

    #[test]
    fn micro_eval_first_party_tool_schemas_cover_required_arguments() {
        let workspace =
            std::env::temp_dir().join(format!("roder-tool-schema-{}", uuid::Uuid::new_v4()));
        std::fs::create_dir_all(&workspace).unwrap();
        let contributor = roder_tools::BuiltinCodingToolsContributor::new(&workspace).unwrap();
        let mut registry = ToolRegistry::default();
        contributor.contribute(&mut registry).unwrap();
        let grades = grade_tool_schemas(&registry.specs());
        // Best-effort cleanup before asserting, so a failing assertion does
        // not leave the temporary workspace behind.
        let _ = std::fs::remove_dir_all(&workspace);
        let failed = grades
            .iter()
            .filter(|grade| !grade.passed)
            .collect::<Vec<_>>();
        assert!(failed.is_empty(), "schema grades failed: {failed:#?}");
        assert_eq!(
            grades.len(),
            first_party_coding_tool_schema_expectations().len()
        );
    }

    #[test]
    fn micro_eval_tool_schema_missing_required_argument_fails_one_tool_grade() {
        let specs = specs_with_override(ToolSpec {
            name: "read_file".to_string(),
            description: "bad read_file".to_string(),
            parameters: serde_json::json!({
                "type": "object",
                "properties": { "path": { "type": "string" } },
                "required": [],
                "additionalProperties": false
            }),
        });
        let grades = grade_tool_schemas(&specs);
        let failed = grades
            .iter()
            .filter(|grade| !grade.passed)
            .collect::<Vec<_>>();
        assert_eq!(failed.len(), 1);
        assert_eq!(failed[0].tool_name, "read_file");
        assert_eq!(failed[0].missing_required_fields, ["path"]);
    }

    #[test]
    fn micro_eval_tool_schema_order_regression_fails_one_tool_grade() {
        let specs = specs_with_override(ToolSpec {
            name: "grep".to_string(),
            description: "bad grep".to_string(),
            parameters: serde_json::json!({
                "type": "object",
                "properties": { "query": { "type": "string" } },
                "required": ["query"],
                "additionalProperties": false
            }),
        });
        let grades = grade_tool_schemas(&specs);
        let grep = grades
            .iter()
            .find(|grade| grade.tool_name == "grep")
            .unwrap();
        assert!(!grep.passed);
        assert!(!grep.required_before_properties);
    }

    #[test]
    fn micro_eval_tool_schema_additional_properties_regression_fails_one_tool_grade() {
        let specs = specs_with_override(ToolSpec {
            name: "glob".to_string(),
            description: "bad glob".to_string(),
            parameters: serde_json::json!({
                "type": "object",
                "required": ["pattern"],
                "properties": { "pattern": { "type": "string" } },
                "additionalProperties": true
            }),
        });
        let grades = grade_tool_schemas(&specs);
        let glob = grades
            .iter()
            .find(|grade| grade.tool_name == "glob")
            .unwrap();
        assert!(!glob.passed);
        assert!(!glob.additional_properties_false);
    }

    #[test]
    fn micro_eval_behavior_graders_cover_harness_failure_modes() {
        let tags = micro_eval_behavior_tags();
        assert!(tags.contains("task-ledger"));
        assert!(tags.contains("verification-before-final"));
        assert!(tags.contains("truncation-follow-up"));
        assert!(tags.contains("repeated-failing-tool-calls"));
        assert!(tags.contains("entrypoint-discovery"));
    }

    #[test]
    fn reliability_eval_tags_cover_required_failure_families() {
        let tags = reliability_eval_behavior_tags();
        assert!(tags.contains("reliability:invalid_arguments"));
        assert!(tags.contains("reliability:missing_file"));
        assert!(tags.contains("reliability:provider_empty_body"));
        assert!(tags.contains("reliability:provider_429"));
        assert!(tags.contains("reliability:repeated_timeout"));
        assert!(tags.contains("reliability:unknown_panic"));
    }
}