Skip to main content

roder_evals/
tool_search.rs

1//! Offline provider-native tool-search eval harness (roadmap phase 79).
2//!
3//! Fixtures under `evals/tool_search/` describe a searchable tool catalog,
4//! the requested tool-search mode, and a scripted provider search/selection
5//! outcome. The harness builds the provider-safe catalog, resolves the
6//! effective mode through the canonical `ToolSearchConfig` contract, maps the
7//! request body through the real OpenAI Responses / Anthropic Messages
8//! mappers, and simulates searched-tool selection so error cases (unknown
9//! tool ids, malformed results, denied permissions, redaction) fail closed
10//! with actionable diagnostics.
11
12use std::collections::BTreeMap;
13use std::path::{Path, PathBuf};
14
15use roder_api::inference::{
16    AgentInferenceRequest, EffectiveToolSearchMode, InstructionBundle, ModelSelection,
17    OutputConfig, ReasoningConfig, RuntimeHints, ToolSearchConfig, ToolSearchMode,
18    ToolSearchProviderVariant,
19};
20use roder_api::tools::{ToolChoice, ToolSpec};
21use serde::{Deserialize, Serialize};
22use serde_json::{Value, json};
23
24#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
25#[serde(rename_all = "camelCase")]
26pub struct ToolSearchEvalFixture {
27    pub id: String,
28    pub title: String,
29    /// `openai` or `anthropic`.
30    pub provider: String,
31    pub model: String,
32    #[serde(default)]
33    pub mode: ToolSearchMode,
34    #[serde(default = "default_true")]
35    pub fallback_to_explicit_tools: bool,
36    #[serde(default)]
37    pub provider_variant: ToolSearchProviderVariant,
38    #[serde(default)]
39    pub catalog: ToolSearchCatalogFixture,
40    #[serde(default, skip_serializing_if = "Option::is_none")]
41    pub search: Option<ToolSearchScript>,
42    pub expected: ToolSearchExpectation,
43}
44
45#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
46#[serde(rename_all = "camelCase")]
47pub struct ToolSearchCatalogFixture {
48    #[serde(default)]
49    pub tools: Vec<ToolSearchCatalogTool>,
50    /// Synthesizes `count` extra tools so large-catalog fixtures stay small.
51    #[serde(default, skip_serializing_if = "Option::is_none")]
52    pub generated: Option<GeneratedCatalogFixture>,
53    #[serde(default, skip_serializing_if = "Option::is_none")]
54    pub max_items: Option<u32>,
55}
56
57#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
58#[serde(rename_all = "camelCase")]
59pub struct ToolSearchCatalogTool {
60    pub name: String,
61    pub description: String,
62    #[serde(default, skip_serializing_if = "Option::is_none")]
63    pub parameters: Option<Value>,
64    /**
65     * Internal-only metadata (credentials, auth headers, local paths) that
66     * the provider-safe catalog must never forward. Values listed here must
67     * not appear anywhere in the serialized provider request body.
68     */
69    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
70    pub internal_metadata: BTreeMap<String, String>,
71}
72
73#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
74#[serde(rename_all = "camelCase")]
75pub struct GeneratedCatalogFixture {
76    pub count: u32,
77    pub name_prefix: String,
78}
79
80#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
81#[serde(rename_all = "camelCase")]
82pub struct ToolSearchScript {
83    pub query: String,
84    /// Raw provider search-results payload; may be intentionally malformed.
85    pub results: Value,
86    #[serde(default)]
87    pub denied_tools: Vec<String>,
88}
89
90#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
91#[serde(rename_all = "camelCase")]
92pub struct ToolSearchExpectation {
93    pub outcome: ToolSearchExpectedOutcome,
94    #[serde(default)]
95    pub diagnostic_contains: Vec<String>,
96    #[serde(default, skip_serializing_if = "Option::is_none")]
97    pub catalog_items: Option<usize>,
98    #[serde(default, skip_serializing_if = "Option::is_none")]
99    pub deferred_tools: Option<usize>,
100    #[serde(default, skip_serializing_if = "Option::is_none")]
101    pub native_tool_search_entry: Option<bool>,
102    #[serde(default)]
103    pub executed_tools: Vec<String>,
104    #[serde(default)]
105    pub body_must_not_contain: Vec<String>,
106}
107
108#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
109#[serde(rename_all = "snake_case")]
110pub enum ToolSearchExpectedOutcome {
111    /// The request mapped (explicit or provider-native) without selection.
112    #[default]
113    RequestMapped,
114    /// Scripted search selection resolved and executed permitted tools.
115    Executed,
116    /// The turn failed closed with an actionable diagnostic.
117    FailClosed,
118}
119
120#[derive(Debug, Clone, PartialEq, Eq)]
121pub enum ToolSearchOutcome {
122    RequestMapped {
123        effective_mode: EffectiveToolSearchMode,
124        body: Value,
125        catalog_items: usize,
126        deferred_tools: usize,
127        native_tool_search_entry: bool,
128    },
129    Executed {
130        executed_tools: Vec<String>,
131        body: Value,
132    },
133    FailClosed {
134        diagnostic: String,
135    },
136}
137
138impl ToolSearchOutcome {
139    pub fn diagnostic(&self) -> Option<&str> {
140        match self {
141            Self::FailClosed { diagnostic } => Some(diagnostic),
142            _ => None,
143        }
144    }
145}
146
147pub fn load_tool_search_fixtures(dir: &Path) -> anyhow::Result<Vec<ToolSearchEvalFixture>> {
148    let mut fixtures = Vec::new();
149    for entry in std::fs::read_dir(dir)? {
150        let path = entry?.path();
151        if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
152            continue;
153        }
154        // The catalog-adapter snapshot has its own schema and test.
155        if path.file_name().and_then(|name| name.to_str()) == Some("catalog_fixture.json") {
156            continue;
157        }
158        let text = std::fs::read_to_string(&path)?;
159        let fixture: ToolSearchEvalFixture = serde_json::from_str(&text)
160            .map_err(|err| anyhow::anyhow!("{}: {err}", path.display()))?;
161        fixtures.push(fixture);
162    }
163    fixtures.sort_by(|left, right| left.id.cmp(&right.id));
164    Ok(fixtures)
165}
166
167pub fn default_tool_search_fixture_dir() -> PathBuf {
168    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../evals/tool_search")
169}
170
171/// Catalog-adapter snapshot fixture (phase 79 Task 2): a fixed toolset with
172/// the expected stable catalog ids, source classification, redaction
173/// needles, and a ranked search expectation.
174#[derive(Debug, Clone, Deserialize)]
175#[serde(rename_all = "camelCase")]
176pub struct CatalogAdapterFixture {
177    pub id: String,
178    pub tools: Vec<ToolSpec>,
179    pub expected: CatalogAdapterExpectation,
180}
181
182#[derive(Debug, Clone, Deserialize)]
183#[serde(rename_all = "camelCase")]
184pub struct CatalogAdapterExpectation {
185    pub ids: Vec<String>,
186    pub sources: Vec<String>,
187    pub forbidden_needles: Vec<String>,
188    pub search_query: String,
189    pub search_top_hit: String,
190}
191
192pub fn load_catalog_adapter_fixture() -> anyhow::Result<CatalogAdapterFixture> {
193    let path = default_tool_search_fixture_dir().join("catalog_fixture.json");
194    let text = std::fs::read_to_string(&path)?;
195    serde_json::from_str(&text).map_err(|err| anyhow::anyhow!("{}: {err}", path.display()))
196}
197
198/**
199 * Build the provider-safe searchable catalog: deterministic name ordering,
200 * duplicate-name removal, max-item limiting, and redaction of internal-only
201 * metadata. Only name/description/parameters ever reach the provider.
202 */
203pub fn build_provider_safe_catalog(catalog: &ToolSearchCatalogFixture) -> Vec<ToolSpec> {
204    let mut tools: Vec<ToolSearchCatalogTool> = catalog.tools.clone();
205    if let Some(generated) = &catalog.generated {
206        for index in 0..generated.count {
207            tools.push(ToolSearchCatalogTool {
208                name: format!("{}_{index:04}", generated.name_prefix),
209                description: format!(
210                    "Generated catalog tool {index} for {}",
211                    generated.name_prefix
212                ),
213                parameters: None,
214                internal_metadata: BTreeMap::new(),
215            });
216        }
217    }
218    tools.sort_by(|left, right| left.name.cmp(&right.name));
219    tools.dedup_by(|left, right| left.name == right.name);
220    if let Some(max_items) = catalog.max_items {
221        tools.truncate(max_items as usize);
222    }
223    tools
224        .into_iter()
225        .map(|tool| ToolSpec {
226            name: tool.name,
227            description: tool.description,
228            parameters: tool
229                .parameters
230                .unwrap_or_else(|| json!({ "type": "object", "properties": {} })),
231        })
232        .collect()
233}
234
235pub fn run_tool_search_fixture(
236    fixture: &ToolSearchEvalFixture,
237) -> anyhow::Result<ToolSearchOutcome> {
238    let catalog = build_provider_safe_catalog(&fixture.catalog);
239    let config = ToolSearchConfig {
240        mode: fixture.mode,
241        max_catalog_items: fixture.catalog.max_items,
242        fallback_to_explicit_tools: fixture.fallback_to_explicit_tools,
243        provider_variant: fixture.provider_variant,
244        ..ToolSearchConfig::default()
245    };
246    let provider_native_supported = match fixture.provider.as_str() {
247        "openai" => roder_ext_openai_responses::openai_model_supports_tool_search(&fixture.model),
248        "anthropic" => roder_ext_anthropic::anthropic_model_supports_tool_search(&fixture.model),
249        other => anyhow::bail!("unsupported fixture provider: {other}"),
250    };
251    let effective_mode = match config.resolve_effective_mode(provider_native_supported) {
252        Ok(mode) => mode,
253        Err(error) => {
254            return Ok(ToolSearchOutcome::FailClosed {
255                diagnostic: error.to_string(),
256            });
257        }
258    };
259
260    let request = inference_request(fixture, &catalog, &config);
261    let body = match fixture.provider.as_str() {
262        "openai" => roder_ext_openai_responses::OpenAiResponsesEngine::map_request(&request),
263        "anthropic" => roder_ext_anthropic::AnthropicEngine::map_request(&request),
264        other => anyhow::bail!("unsupported fixture provider: {other}"),
265    };
266    let (deferred_tools, native_tool_search_entry) = body_tool_search_shape(&body);
267
268    let Some(search) = &fixture.search else {
269        return Ok(ToolSearchOutcome::RequestMapped {
270            effective_mode,
271            body,
272            catalog_items: catalog.len(),
273            deferred_tools,
274            native_tool_search_entry,
275        });
276    };
277
278    match resolve_search_selection(search, &catalog) {
279        Ok(executed_tools) => Ok(ToolSearchOutcome::Executed {
280            executed_tools,
281            body,
282        }),
283        Err(diagnostic) => Ok(ToolSearchOutcome::FailClosed { diagnostic }),
284    }
285}
286
287/**
288 * Resolve a scripted provider search-result payload into executed catalog
289 * tools, failing closed for malformed payloads, unknown selected tool ids,
290 * and permission-denied selections.
291 */
292fn resolve_search_selection(
293    search: &ToolSearchScript,
294    catalog: &[ToolSpec],
295) -> Result<Vec<String>, String> {
296    let Some(results) = search.results.as_array() else {
297        return Err(format!(
298            "malformed provider tool-search results for query {:?}: expected an array of \
299             {{\"name\": ...}} objects, got {}; failing closed without executing any tool",
300            search.query, search.results
301        ));
302    };
303    let mut executed = Vec::new();
304    for result in results {
305        let Some(name) = result.get("name").and_then(Value::as_str) else {
306            return Err(format!(
307                "malformed provider tool-search result entry {result} for query {:?}: missing \
308                 string \"name\"; failing closed without executing any tool",
309                search.query
310            ));
311        };
312        if !catalog.iter().any(|tool| tool.name == name) {
313            return Err(format!(
314                "provider selected unknown tool id {name:?} that is not in the provider-safe \
315                 catalog; failing closed without executing any tool"
316            ));
317        }
318        if search.denied_tools.iter().any(|denied| denied == name) {
319            return Err(format!(
320                "permission denied for searched tool {name:?}; provider-native tool search does \
321                 not bypass Roder permission checks"
322            ));
323        }
324        executed.push(name.to_string());
325    }
326    Ok(executed)
327}
328
329fn inference_request(
330    fixture: &ToolSearchEvalFixture,
331    catalog: &[ToolSpec],
332    config: &ToolSearchConfig,
333) -> AgentInferenceRequest {
334    AgentInferenceRequest {
335        model: ModelSelection {
336            provider: fixture.provider.clone(),
337            model: fixture.model.clone(),
338        },
339        instructions: InstructionBundle {
340            system: Some("offline tool-search eval".to_string()),
341            developer: None,
342            developer_context: None,
343        },
344        transcript: vec![roder_api::transcript::TranscriptItem::UserMessage(
345            roder_api::transcript::UserMessage::text("run the fixture task"),
346        )],
347        tools: catalog.to_vec(),
348        tool_choice: ToolChoice::Auto,
349        reasoning: ReasoningConfig {
350            enabled: false,
351            level: None,
352        },
353        output: OutputConfig {
354            max_tokens: Some(512),
355            temperature: None,
356            top_p: None,
357            response_format: None,
358        },
359        runtime: RuntimeHints {
360            tool_search: config.clone(),
361            ..RuntimeHints::default()
362        },
363        metadata: json!({}),
364    }
365}
366
367/// Count deferred tool entries and detect the provider-native search entry in
368/// a mapped OpenAI Responses or Anthropic Messages request body.
369fn body_tool_search_shape(body: &Value) -> (usize, bool) {
370    let Some(tools) = body.get("tools").and_then(Value::as_array) else {
371        return (0, false);
372    };
373    let deferred = tools
374        .iter()
375        .filter(|tool| tool.get("defer_loading").and_then(Value::as_bool) == Some(true))
376        .count();
377    let native_entry = tools.iter().any(|tool| {
378        tool.get("type")
379            .and_then(Value::as_str)
380            .is_some_and(|kind| kind == "tool_search" || kind.starts_with("tool_search_tool_"))
381    });
382    (deferred, native_entry)
383}
384
385pub fn assert_tool_search_fixture(fixture: &ToolSearchEvalFixture) -> anyhow::Result<()> {
386    let outcome = run_tool_search_fixture(fixture)?;
387    let expected = &fixture.expected;
388    let body = match &outcome {
389        ToolSearchOutcome::RequestMapped { body, .. }
390        | ToolSearchOutcome::Executed { body, .. } => Some(body.clone()),
391        ToolSearchOutcome::FailClosed { .. } => None,
392    };
393
394    match (expected.outcome, &outcome) {
395        (
396            ToolSearchExpectedOutcome::RequestMapped,
397            ToolSearchOutcome::RequestMapped {
398                catalog_items,
399                deferred_tools,
400                native_tool_search_entry,
401                ..
402            },
403        ) => {
404            if let Some(expected_items) = expected.catalog_items {
405                anyhow::ensure!(
406                    *catalog_items == expected_items,
407                    "{}: expected {expected_items} catalog items, got {catalog_items}",
408                    fixture.id
409                );
410            }
411            if let Some(expected_deferred) = expected.deferred_tools {
412                anyhow::ensure!(
413                    *deferred_tools == expected_deferred,
414                    "{}: expected {expected_deferred} deferred tools, got {deferred_tools}",
415                    fixture.id
416                );
417            }
418            if let Some(expected_entry) = expected.native_tool_search_entry {
419                anyhow::ensure!(
420                    *native_tool_search_entry == expected_entry,
421                    "{}: expected native tool-search entry = {expected_entry}",
422                    fixture.id
423                );
424            }
425        }
426        (
427            ToolSearchExpectedOutcome::Executed,
428            ToolSearchOutcome::Executed { executed_tools, .. },
429        ) => {
430            anyhow::ensure!(
431                *executed_tools == expected.executed_tools,
432                "{}: expected executed tools {:?}, got {executed_tools:?}",
433                fixture.id,
434                expected.executed_tools
435            );
436        }
437        (ToolSearchExpectedOutcome::FailClosed, ToolSearchOutcome::FailClosed { diagnostic }) => {
438            for needle in &expected.diagnostic_contains {
439                anyhow::ensure!(
440                    diagnostic.contains(needle),
441                    "{}: diagnostic {diagnostic:?} missing {needle:?}",
442                    fixture.id
443                );
444            }
445        }
446        (expected_outcome, actual) => anyhow::bail!(
447            "{}: expected outcome {expected_outcome:?}, got {actual:?}",
448            fixture.id
449        ),
450    }
451
452    if let Some(body) = body {
453        let serialized = body.to_string();
454        for marker in &expected.body_must_not_contain {
455            anyhow::ensure!(
456                !serialized.contains(marker),
457                "{}: provider request body leaked redacted marker {marker:?}",
458                fixture.id
459            );
460        }
461        for tool in fixture
462            .catalog
463            .tools
464            .iter()
465            .flat_map(|tool| tool.internal_metadata.values())
466        {
467            anyhow::ensure!(
468                !serialized.contains(tool.as_str()),
469                "{}: provider request body leaked internal metadata value {tool:?}",
470                fixture.id
471            );
472        }
473    }
474    Ok(())
475}
476
/// Serde default provider for boolean fields that are `true` when omitted
/// (used by `fallback_to_explicit_tools`).
fn default_true() -> bool {
    true
}
480
481#[cfg(test)]
482#[path = "tool_search_tests.rs"]
483mod tool_search_tests;