1use std::collections::BTreeMap;
13use std::path::{Path, PathBuf};
14
15use roder_api::inference::{
16 AgentInferenceRequest, EffectiveToolSearchMode, InstructionBundle, ModelSelection,
17 OutputConfig, ReasoningConfig, RuntimeHints, ToolSearchConfig, ToolSearchMode,
18 ToolSearchProviderVariant,
19};
20use roder_api::tools::{ToolChoice, ToolSpec};
21use serde::{Deserialize, Serialize};
22use serde_json::{Value, json};
23
24#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
25#[serde(rename_all = "camelCase")]
26pub struct ToolSearchEvalFixture {
27 pub id: String,
28 pub title: String,
29 pub provider: String,
31 pub model: String,
32 #[serde(default)]
33 pub mode: ToolSearchMode,
34 #[serde(default = "default_true")]
35 pub fallback_to_explicit_tools: bool,
36 #[serde(default)]
37 pub provider_variant: ToolSearchProviderVariant,
38 #[serde(default)]
39 pub catalog: ToolSearchCatalogFixture,
40 #[serde(default, skip_serializing_if = "Option::is_none")]
41 pub search: Option<ToolSearchScript>,
42 pub expected: ToolSearchExpectation,
43}
44
45#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
46#[serde(rename_all = "camelCase")]
47pub struct ToolSearchCatalogFixture {
48 #[serde(default)]
49 pub tools: Vec<ToolSearchCatalogTool>,
50 #[serde(default, skip_serializing_if = "Option::is_none")]
52 pub generated: Option<GeneratedCatalogFixture>,
53 #[serde(default, skip_serializing_if = "Option::is_none")]
54 pub max_items: Option<u32>,
55}
56
57#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
58#[serde(rename_all = "camelCase")]
59pub struct ToolSearchCatalogTool {
60 pub name: String,
61 pub description: String,
62 #[serde(default, skip_serializing_if = "Option::is_none")]
63 pub parameters: Option<Value>,
64 #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
70 pub internal_metadata: BTreeMap<String, String>,
71}
72
73#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
74#[serde(rename_all = "camelCase")]
75pub struct GeneratedCatalogFixture {
76 pub count: u32,
77 pub name_prefix: String,
78}
79
80#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
81#[serde(rename_all = "camelCase")]
82pub struct ToolSearchScript {
83 pub query: String,
84 pub results: Value,
86 #[serde(default)]
87 pub denied_tools: Vec<String>,
88}
89
90#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
91#[serde(rename_all = "camelCase")]
92pub struct ToolSearchExpectation {
93 pub outcome: ToolSearchExpectedOutcome,
94 #[serde(default)]
95 pub diagnostic_contains: Vec<String>,
96 #[serde(default, skip_serializing_if = "Option::is_none")]
97 pub catalog_items: Option<usize>,
98 #[serde(default, skip_serializing_if = "Option::is_none")]
99 pub deferred_tools: Option<usize>,
100 #[serde(default, skip_serializing_if = "Option::is_none")]
101 pub native_tool_search_entry: Option<bool>,
102 #[serde(default)]
103 pub executed_tools: Vec<String>,
104 #[serde(default)]
105 pub body_must_not_contain: Vec<String>,
106}
107
108#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
109#[serde(rename_all = "snake_case")]
110pub enum ToolSearchExpectedOutcome {
111 #[default]
113 RequestMapped,
114 Executed,
116 FailClosed,
118}
119
120#[derive(Debug, Clone, PartialEq, Eq)]
121pub enum ToolSearchOutcome {
122 RequestMapped {
123 effective_mode: EffectiveToolSearchMode,
124 body: Value,
125 catalog_items: usize,
126 deferred_tools: usize,
127 native_tool_search_entry: bool,
128 },
129 Executed {
130 executed_tools: Vec<String>,
131 body: Value,
132 },
133 FailClosed {
134 diagnostic: String,
135 },
136}
137
138impl ToolSearchOutcome {
139 pub fn diagnostic(&self) -> Option<&str> {
140 match self {
141 Self::FailClosed { diagnostic } => Some(diagnostic),
142 _ => None,
143 }
144 }
145}
146
147pub fn load_tool_search_fixtures(dir: &Path) -> anyhow::Result<Vec<ToolSearchEvalFixture>> {
148 let mut fixtures = Vec::new();
149 for entry in std::fs::read_dir(dir)? {
150 let path = entry?.path();
151 if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
152 continue;
153 }
154 if path.file_name().and_then(|name| name.to_str()) == Some("catalog_fixture.json") {
156 continue;
157 }
158 let text = std::fs::read_to_string(&path)?;
159 let fixture: ToolSearchEvalFixture = serde_json::from_str(&text)
160 .map_err(|err| anyhow::anyhow!("{}: {err}", path.display()))?;
161 fixtures.push(fixture);
162 }
163 fixtures.sort_by(|left, right| left.id.cmp(&right.id));
164 Ok(fixtures)
165}
166
167pub fn default_tool_search_fixture_dir() -> PathBuf {
168 PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../evals/tool_search")
169}
170
171#[derive(Debug, Clone, Deserialize)]
175#[serde(rename_all = "camelCase")]
176pub struct CatalogAdapterFixture {
177 pub id: String,
178 pub tools: Vec<ToolSpec>,
179 pub expected: CatalogAdapterExpectation,
180}
181
182#[derive(Debug, Clone, Deserialize)]
183#[serde(rename_all = "camelCase")]
184pub struct CatalogAdapterExpectation {
185 pub ids: Vec<String>,
186 pub sources: Vec<String>,
187 pub forbidden_needles: Vec<String>,
188 pub search_query: String,
189 pub search_top_hit: String,
190}
191
192pub fn load_catalog_adapter_fixture() -> anyhow::Result<CatalogAdapterFixture> {
193 let path = default_tool_search_fixture_dir().join("catalog_fixture.json");
194 let text = std::fs::read_to_string(&path)?;
195 serde_json::from_str(&text).map_err(|err| anyhow::anyhow!("{}: {err}", path.display()))
196}
197
198pub fn build_provider_safe_catalog(catalog: &ToolSearchCatalogFixture) -> Vec<ToolSpec> {
204 let mut tools: Vec<ToolSearchCatalogTool> = catalog.tools.clone();
205 if let Some(generated) = &catalog.generated {
206 for index in 0..generated.count {
207 tools.push(ToolSearchCatalogTool {
208 name: format!("{}_{index:04}", generated.name_prefix),
209 description: format!(
210 "Generated catalog tool {index} for {}",
211 generated.name_prefix
212 ),
213 parameters: None,
214 internal_metadata: BTreeMap::new(),
215 });
216 }
217 }
218 tools.sort_by(|left, right| left.name.cmp(&right.name));
219 tools.dedup_by(|left, right| left.name == right.name);
220 if let Some(max_items) = catalog.max_items {
221 tools.truncate(max_items as usize);
222 }
223 tools
224 .into_iter()
225 .map(|tool| ToolSpec {
226 name: tool.name,
227 description: tool.description,
228 parameters: tool
229 .parameters
230 .unwrap_or_else(|| json!({ "type": "object", "properties": {} })),
231 })
232 .collect()
233}
234
235pub fn run_tool_search_fixture(
236 fixture: &ToolSearchEvalFixture,
237) -> anyhow::Result<ToolSearchOutcome> {
238 let catalog = build_provider_safe_catalog(&fixture.catalog);
239 let config = ToolSearchConfig {
240 mode: fixture.mode,
241 max_catalog_items: fixture.catalog.max_items,
242 fallback_to_explicit_tools: fixture.fallback_to_explicit_tools,
243 provider_variant: fixture.provider_variant,
244 ..ToolSearchConfig::default()
245 };
246 let provider_native_supported = match fixture.provider.as_str() {
247 "openai" => roder_ext_openai_responses::openai_model_supports_tool_search(&fixture.model),
248 "anthropic" => roder_ext_anthropic::anthropic_model_supports_tool_search(&fixture.model),
249 other => anyhow::bail!("unsupported fixture provider: {other}"),
250 };
251 let effective_mode = match config.resolve_effective_mode(provider_native_supported) {
252 Ok(mode) => mode,
253 Err(error) => {
254 return Ok(ToolSearchOutcome::FailClosed {
255 diagnostic: error.to_string(),
256 });
257 }
258 };
259
260 let request = inference_request(fixture, &catalog, &config);
261 let body = match fixture.provider.as_str() {
262 "openai" => roder_ext_openai_responses::OpenAiResponsesEngine::map_request(&request),
263 "anthropic" => roder_ext_anthropic::AnthropicEngine::map_request(&request),
264 other => anyhow::bail!("unsupported fixture provider: {other}"),
265 };
266 let (deferred_tools, native_tool_search_entry) = body_tool_search_shape(&body);
267
268 let Some(search) = &fixture.search else {
269 return Ok(ToolSearchOutcome::RequestMapped {
270 effective_mode,
271 body,
272 catalog_items: catalog.len(),
273 deferred_tools,
274 native_tool_search_entry,
275 });
276 };
277
278 match resolve_search_selection(search, &catalog) {
279 Ok(executed_tools) => Ok(ToolSearchOutcome::Executed {
280 executed_tools,
281 body,
282 }),
283 Err(diagnostic) => Ok(ToolSearchOutcome::FailClosed { diagnostic }),
284 }
285}
286
287fn resolve_search_selection(
293 search: &ToolSearchScript,
294 catalog: &[ToolSpec],
295) -> Result<Vec<String>, String> {
296 let Some(results) = search.results.as_array() else {
297 return Err(format!(
298 "malformed provider tool-search results for query {:?}: expected an array of \
299 {{\"name\": ...}} objects, got {}; failing closed without executing any tool",
300 search.query, search.results
301 ));
302 };
303 let mut executed = Vec::new();
304 for result in results {
305 let Some(name) = result.get("name").and_then(Value::as_str) else {
306 return Err(format!(
307 "malformed provider tool-search result entry {result} for query {:?}: missing \
308 string \"name\"; failing closed without executing any tool",
309 search.query
310 ));
311 };
312 if !catalog.iter().any(|tool| tool.name == name) {
313 return Err(format!(
314 "provider selected unknown tool id {name:?} that is not in the provider-safe \
315 catalog; failing closed without executing any tool"
316 ));
317 }
318 if search.denied_tools.iter().any(|denied| denied == name) {
319 return Err(format!(
320 "permission denied for searched tool {name:?}; provider-native tool search does \
321 not bypass Roder permission checks"
322 ));
323 }
324 executed.push(name.to_string());
325 }
326 Ok(executed)
327}
328
329fn inference_request(
330 fixture: &ToolSearchEvalFixture,
331 catalog: &[ToolSpec],
332 config: &ToolSearchConfig,
333) -> AgentInferenceRequest {
334 AgentInferenceRequest {
335 model: ModelSelection {
336 provider: fixture.provider.clone(),
337 model: fixture.model.clone(),
338 },
339 instructions: InstructionBundle {
340 system: Some("offline tool-search eval".to_string()),
341 developer: None,
342 developer_context: None,
343 },
344 transcript: vec![roder_api::transcript::TranscriptItem::UserMessage(
345 roder_api::transcript::UserMessage::text("run the fixture task"),
346 )],
347 tools: catalog.to_vec(),
348 tool_choice: ToolChoice::Auto,
349 reasoning: ReasoningConfig {
350 enabled: false,
351 level: None,
352 },
353 output: OutputConfig {
354 max_tokens: Some(512),
355 temperature: None,
356 top_p: None,
357 response_format: None,
358 },
359 runtime: RuntimeHints {
360 tool_search: config.clone(),
361 ..RuntimeHints::default()
362 },
363 metadata: json!({}),
364 }
365}
366
367fn body_tool_search_shape(body: &Value) -> (usize, bool) {
370 let Some(tools) = body.get("tools").and_then(Value::as_array) else {
371 return (0, false);
372 };
373 let deferred = tools
374 .iter()
375 .filter(|tool| tool.get("defer_loading").and_then(Value::as_bool) == Some(true))
376 .count();
377 let native_entry = tools.iter().any(|tool| {
378 tool.get("type")
379 .and_then(Value::as_str)
380 .is_some_and(|kind| kind == "tool_search" || kind.starts_with("tool_search_tool_"))
381 });
382 (deferred, native_entry)
383}
384
385pub fn assert_tool_search_fixture(fixture: &ToolSearchEvalFixture) -> anyhow::Result<()> {
386 let outcome = run_tool_search_fixture(fixture)?;
387 let expected = &fixture.expected;
388 let body = match &outcome {
389 ToolSearchOutcome::RequestMapped { body, .. }
390 | ToolSearchOutcome::Executed { body, .. } => Some(body.clone()),
391 ToolSearchOutcome::FailClosed { .. } => None,
392 };
393
394 match (expected.outcome, &outcome) {
395 (
396 ToolSearchExpectedOutcome::RequestMapped,
397 ToolSearchOutcome::RequestMapped {
398 catalog_items,
399 deferred_tools,
400 native_tool_search_entry,
401 ..
402 },
403 ) => {
404 if let Some(expected_items) = expected.catalog_items {
405 anyhow::ensure!(
406 *catalog_items == expected_items,
407 "{}: expected {expected_items} catalog items, got {catalog_items}",
408 fixture.id
409 );
410 }
411 if let Some(expected_deferred) = expected.deferred_tools {
412 anyhow::ensure!(
413 *deferred_tools == expected_deferred,
414 "{}: expected {expected_deferred} deferred tools, got {deferred_tools}",
415 fixture.id
416 );
417 }
418 if let Some(expected_entry) = expected.native_tool_search_entry {
419 anyhow::ensure!(
420 *native_tool_search_entry == expected_entry,
421 "{}: expected native tool-search entry = {expected_entry}",
422 fixture.id
423 );
424 }
425 }
426 (
427 ToolSearchExpectedOutcome::Executed,
428 ToolSearchOutcome::Executed { executed_tools, .. },
429 ) => {
430 anyhow::ensure!(
431 *executed_tools == expected.executed_tools,
432 "{}: expected executed tools {:?}, got {executed_tools:?}",
433 fixture.id,
434 expected.executed_tools
435 );
436 }
437 (ToolSearchExpectedOutcome::FailClosed, ToolSearchOutcome::FailClosed { diagnostic }) => {
438 for needle in &expected.diagnostic_contains {
439 anyhow::ensure!(
440 diagnostic.contains(needle),
441 "{}: diagnostic {diagnostic:?} missing {needle:?}",
442 fixture.id
443 );
444 }
445 }
446 (expected_outcome, actual) => anyhow::bail!(
447 "{}: expected outcome {expected_outcome:?}, got {actual:?}",
448 fixture.id
449 ),
450 }
451
452 if let Some(body) = body {
453 let serialized = body.to_string();
454 for marker in &expected.body_must_not_contain {
455 anyhow::ensure!(
456 !serialized.contains(marker),
457 "{}: provider request body leaked redacted marker {marker:?}",
458 fixture.id
459 );
460 }
461 for tool in fixture
462 .catalog
463 .tools
464 .iter()
465 .flat_map(|tool| tool.internal_metadata.values())
466 {
467 anyhow::ensure!(
468 !serialized.contains(tool.as_str()),
469 "{}: provider request body leaked internal metadata value {tool:?}",
470 fixture.id
471 );
472 }
473 }
474 Ok(())
475}
476
/// Serde default helper: yields `true` for boolean fields that default to on.
fn default_true() -> bool {
    true
}
480
481#[cfg(test)]
482#[path = "tool_search_tests.rs"]
483mod tool_search_tests;