Skip to main content

harn_vm/llm/
tool_conformance.rs

//! One-tool provider conformance probe for local/runtime tool calling.
//!
//! The probe is deliberately tiny: define one harmless `echo_marker` tool,
//! ask the model to call it with a fixed marker, and classify what came back.
//! The classification is the stable contract eval harnesses consume; the live
//! HTTP runner is a convenience around that classifier.
7
8use std::collections::BTreeMap;
9
10use serde::{Deserialize, Serialize};
11use serde_json::{json, Value};
12
13use crate::llm_config::{self, ProviderDef};
14use crate::value::VmValue;
15
/// Value written to `schema_version` on every report this module builds.
pub const TOOL_CONFORMANCE_SCHEMA_VERSION: u32 = 1;
/// Name of the single tool the probe defines and expects the model to call.
pub const TOOL_PROBE_TOOL_NAME: &str = "echo_marker";
/// Marker used by `ToolConformanceProbeOptions::new` when no other is set.
pub const DEFAULT_TOOL_PROBE_MARKER: &str = "harn_tool_probe_marker";
19
20#[derive(Debug, Clone)]
21pub struct ToolConformanceProbeOptions {
22    pub provider: String,
23    pub model: String,
24    pub base_url: Option<String>,
25    pub modes: Vec<ToolProbeMode>,
26    pub marker: String,
27    pub timeout_secs: u64,
28}
29
30impl ToolConformanceProbeOptions {
31    pub fn new(provider: impl Into<String>, model: impl Into<String>) -> Self {
32        Self {
33            provider: provider.into(),
34            model: model.into(),
35            base_url: None,
36            modes: vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming],
37            marker: DEFAULT_TOOL_PROBE_MARKER.to_string(),
38            timeout_secs: 120,
39        }
40    }
41}
42
43#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
44#[serde(rename_all = "snake_case")]
45pub enum ToolProbeMode {
46    NonStreaming,
47    Streaming,
48}
49
50impl ToolProbeMode {
51    pub fn as_str(self) -> &'static str {
52        match self {
53            Self::NonStreaming => "non_streaming",
54            Self::Streaming => "streaming",
55        }
56    }
57}
58
59#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
60#[serde(rename_all = "snake_case")]
61pub enum ToolProbeClassification {
62    StructuredNativeToolCall,
63    ParseableHarnTextToolCall,
64    RawModelToolTag,
65    ProseOnlyNonTool,
66    MalformedJsonArguments,
67    EmptySilent,
68    HttpError,
69    TransportError,
70}
71
72#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
73#[serde(rename_all = "snake_case")]
74pub enum ToolProbeStatus {
75    Pass,
76    Fail,
77    Unknown,
78}
79
80impl ToolProbeStatus {
81    pub fn as_str(&self) -> &'static str {
82        match self {
83            Self::Pass => "pass",
84            Self::Fail => "fail",
85            Self::Unknown => "unknown",
86        }
87    }
88}
89
90#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
91#[serde(rename_all = "snake_case")]
92pub enum ToolProbeFallbackMode {
93    Native,
94    Text,
95    Disabled,
96}
97
98impl ToolProbeFallbackMode {
99    pub fn as_str(&self) -> &'static str {
100        match self {
101            Self::Native => "native",
102            Self::Text => "text",
103            Self::Disabled => "disabled",
104        }
105    }
106}
107
108#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct ToolConformanceReport {
110    pub schema_version: u32,
111    pub provider: String,
112    pub model: String,
113    #[serde(skip_serializing_if = "Option::is_none")]
114    pub base_url: Option<String>,
115    pub tool_name: String,
116    pub marker: String,
117    pub cases: Vec<ToolConformanceCase>,
118    pub tool_calling: ToolCallingConformanceSummary,
119}
120
121#[derive(Debug, Clone, Serialize, Deserialize)]
122pub struct ToolCallingConformanceSummary {
123    pub native: ToolProbeStatus,
124    pub text: ToolProbeStatus,
125    pub streaming_native: ToolProbeStatus,
126    pub fallback_mode: ToolProbeFallbackMode,
127    #[serde(skip_serializing_if = "Option::is_none")]
128    pub failure_reason: Option<String>,
129}
130
131#[derive(Debug, Clone, Serialize, Deserialize)]
132pub struct ToolConformanceCase {
133    pub mode: ToolProbeMode,
134    pub ok: bool,
135    pub classification: ToolProbeClassification,
136    pub fallback_mode: ToolProbeFallbackMode,
137    #[serde(skip_serializing_if = "Option::is_none")]
138    pub failure_reason: Option<String>,
139    #[serde(skip_serializing_if = "Option::is_none")]
140    pub http_status: Option<u16>,
141    #[serde(skip_serializing_if = "Option::is_none")]
142    pub elapsed_ms: Option<u64>,
143    pub native_tool_call_count: usize,
144    pub text_tool_call_count: usize,
145    #[serde(skip_serializing_if = "Vec::is_empty")]
146    pub parser_errors: Vec<String>,
147    #[serde(skip_serializing_if = "Vec::is_empty")]
148    pub protocol_violations: Vec<String>,
149    #[serde(skip_serializing_if = "Option::is_none")]
150    pub content_sample: Option<String>,
151}
152
153impl ToolConformanceCase {
154    fn transport_error(mode: ToolProbeMode, message: String, elapsed_ms: Option<u64>) -> Self {
155        Self {
156            mode,
157            ok: false,
158            classification: ToolProbeClassification::TransportError,
159            fallback_mode: ToolProbeFallbackMode::Disabled,
160            failure_reason: Some(message),
161            http_status: None,
162            elapsed_ms,
163            native_tool_call_count: 0,
164            text_tool_call_count: 0,
165            parser_errors: Vec::new(),
166            protocol_violations: Vec::new(),
167            content_sample: None,
168        }
169    }
170
171    fn http_error(
172        mode: ToolProbeMode,
173        status: u16,
174        message: String,
175        elapsed_ms: Option<u64>,
176    ) -> Self {
177        Self {
178            mode,
179            ok: false,
180            classification: ToolProbeClassification::HttpError,
181            fallback_mode: ToolProbeFallbackMode::Disabled,
182            failure_reason: Some(message),
183            http_status: Some(status),
184            elapsed_ms,
185            native_tool_call_count: 0,
186            text_tool_call_count: 0,
187            parser_errors: Vec::new(),
188            protocol_violations: Vec::new(),
189            content_sample: None,
190        }
191    }
192}
193
194pub async fn run_tool_conformance_probe(
195    options: ToolConformanceProbeOptions,
196) -> ToolConformanceReport {
197    let model = llm_config::resolve_model_info(&options.model);
198    let provider = if options.provider.trim().is_empty() {
199        model.provider.clone()
200    } else {
201        options.provider.clone()
202    };
203    let model_id = model.id;
204    let base_url = options.base_url.clone().or_else(|| {
205        llm_config::provider_config(&provider).map(|def| llm_config::resolve_base_url(&def))
206    });
207    let mut cases = Vec::new();
208    for mode in normalized_modes(&options.modes) {
209        cases.push(
210            execute_live_probe_case(
211                &provider,
212                &model_id,
213                base_url.as_deref(),
214                mode,
215                &options.marker,
216                options.timeout_secs,
217            )
218            .await,
219        );
220    }
221    report_from_cases(provider, model_id, base_url, options.marker, cases)
222}
223
224pub fn classify_tool_conformance_fixture(
225    provider: impl Into<String>,
226    model: impl Into<String>,
227    mode: ToolProbeMode,
228    marker: impl Into<String>,
229    raw: &str,
230) -> ToolConformanceReport {
231    let marker = marker.into();
232    let response = serde_json::from_str::<Value>(raw).unwrap_or_else(|_| json!({ "content": raw }));
233    let case = classify_tool_probe_response(mode, &response, &marker, None, None);
234    report_from_cases(provider.into(), model.into(), None, marker, vec![case])
235}
236
237pub fn report_satisfies_required_probe(report: &ToolConformanceReport, requirement: &str) -> bool {
238    match requirement {
239        "tool_probe" | "tool_call_probe" => {
240            report.tool_calling.fallback_mode != ToolProbeFallbackMode::Disabled
241                && report.cases.iter().any(|case| case.ok)
242        }
243        "native_tool_probe" => report.tool_calling.native == ToolProbeStatus::Pass,
244        "streaming_tool_probe" => report.tool_calling.streaming_native == ToolProbeStatus::Pass,
245        _ => false,
246    }
247}
248
249fn normalized_modes(modes: &[ToolProbeMode]) -> Vec<ToolProbeMode> {
250    if modes.is_empty() {
251        return vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming];
252    }
253    let mut out = Vec::new();
254    for mode in modes {
255        if !out.contains(mode) {
256            out.push(*mode);
257        }
258    }
259    out
260}
261
262fn report_from_cases(
263    provider: String,
264    model: String,
265    base_url: Option<String>,
266    marker: String,
267    cases: Vec<ToolConformanceCase>,
268) -> ToolConformanceReport {
269    let summary = summarize_cases(&cases);
270    ToolConformanceReport {
271        schema_version: TOOL_CONFORMANCE_SCHEMA_VERSION,
272        provider,
273        model,
274        base_url,
275        tool_name: TOOL_PROBE_TOOL_NAME.to_string(),
276        marker,
277        cases,
278        tool_calling: summary,
279    }
280}
281
282fn summarize_cases(cases: &[ToolConformanceCase]) -> ToolCallingConformanceSummary {
283    let mut native = ToolProbeStatus::Unknown;
284    let mut streaming_native = ToolProbeStatus::Unknown;
285    let mut text = ToolProbeStatus::Unknown;
286
287    for case in cases {
288        if case.classification == ToolProbeClassification::StructuredNativeToolCall {
289            if case.mode == ToolProbeMode::Streaming {
290                streaming_native = if case.ok {
291                    ToolProbeStatus::Pass
292                } else {
293                    ToolProbeStatus::Fail
294                };
295            } else {
296                native = if case.ok {
297                    ToolProbeStatus::Pass
298                } else {
299                    ToolProbeStatus::Fail
300                };
301            }
302        } else if case.mode == ToolProbeMode::Streaming
303            && streaming_native == ToolProbeStatus::Unknown
304        {
305            streaming_native = ToolProbeStatus::Fail;
306        } else if case.mode == ToolProbeMode::NonStreaming && native == ToolProbeStatus::Unknown {
307            native = ToolProbeStatus::Fail;
308        }
309
310        if case.classification == ToolProbeClassification::ParseableHarnTextToolCall {
311            text = if case.ok {
312                ToolProbeStatus::Pass
313            } else {
314                ToolProbeStatus::Fail
315            };
316        } else if text == ToolProbeStatus::Unknown && case.text_tool_call_count > 0 {
317            text = ToolProbeStatus::Fail;
318        }
319    }
320
321    let fallback_mode =
322        if native == ToolProbeStatus::Pass || streaming_native == ToolProbeStatus::Pass {
323            ToolProbeFallbackMode::Native
324        } else if text == ToolProbeStatus::Pass {
325            ToolProbeFallbackMode::Text
326        } else {
327            ToolProbeFallbackMode::Disabled
328        };
329
330    let failure_reason = if fallback_mode == ToolProbeFallbackMode::Disabled {
331        cases.iter().find_map(|case| case.failure_reason.clone())
332    } else {
333        None
334    };
335
336    ToolCallingConformanceSummary {
337        native,
338        text,
339        streaming_native,
340        fallback_mode,
341        failure_reason,
342    }
343}
344
345async fn execute_live_probe_case(
346    provider: &str,
347    model: &str,
348    base_url: Option<&str>,
349    mode: ToolProbeMode,
350    marker: &str,
351    timeout_secs: u64,
352) -> ToolConformanceCase {
353    let clock = harn_clock::RealClock::arc();
354    let started_ms = clock.monotonic_ms();
355    let Some(def) = llm_config::provider_config(provider) else {
356        return ToolConformanceCase::transport_error(
357            mode,
358            format!("unknown provider: {provider}"),
359            Some(elapsed_ms(&*clock, started_ms)),
360        );
361    };
362    let base_url = base_url
363        .filter(|value| !value.trim().is_empty())
364        .map(str::to_string)
365        .unwrap_or_else(|| llm_config::resolve_base_url(&def));
366    let url = match chat_url(&def, &base_url) {
367        Ok(url) => url,
368        Err(message) => {
369            return ToolConformanceCase::transport_error(
370                mode,
371                message,
372                Some(elapsed_ms(&*clock, started_ms)),
373            );
374        }
375    };
376    let body = probe_request_body(provider, model, mode, marker);
377    let client = if mode == ToolProbeMode::Streaming {
378        crate::llm::shared_streaming_client().clone()
379    } else {
380        crate::llm::shared_blocking_client().clone()
381    };
382    let api_key = crate::llm::helpers::resolve_api_key(provider).unwrap_or_default();
383    let request = client
384        .post(&url)
385        .header("Content-Type", "application/json")
386        .timeout(std::time::Duration::from_secs(timeout_secs))
387        .json(&body);
388    let mut request = crate::llm::api::apply_auth_headers(request, &api_key, Some(&def));
389    for (name, value) in &def.extra_headers {
390        request = request.header(name.as_str(), value.as_str());
391    }
392
393    let response = match request.send().await {
394        Ok(response) => response,
395        Err(error) => {
396            return ToolConformanceCase::transport_error(
397                mode,
398                format!("provider request failed: {error}"),
399                Some(elapsed_ms(&*clock, started_ms)),
400            );
401        }
402    };
403    let status = response.status();
404    let text = match response.text().await {
405        Ok(text) => text,
406        Err(error) => {
407            return ToolConformanceCase::transport_error(
408                mode,
409                format!("provider response was unreadable: {error}"),
410                Some(elapsed_ms(&*clock, started_ms)),
411            );
412        }
413    };
414    let elapsed = Some(elapsed_ms(&*clock, started_ms));
415    if !status.is_success() {
416        return ToolConformanceCase::http_error(
417            mode,
418            status.as_u16(),
419            sample_failure(&text, "provider returned non-success HTTP status"),
420            elapsed,
421        );
422    }
423    let response_value = if mode == ToolProbeMode::Streaming {
424        aggregate_stream_text(&text, provider)
425    } else {
426        serde_json::from_str::<Value>(&text).unwrap_or_else(|_| json!({ "content": text }))
427    };
428    classify_tool_probe_response(
429        mode,
430        &response_value,
431        marker,
432        Some(status.as_u16()),
433        elapsed,
434    )
435}
436
437fn classify_tool_probe_response(
438    mode: ToolProbeMode,
439    response: &Value,
440    marker: &str,
441    http_status: Option<u16>,
442    elapsed_ms: Option<u64>,
443) -> ToolConformanceCase {
444    let native = extract_native_tool_calls(response);
445    let native_count = native.len();
446    let mut malformed_native = false;
447    for call in &native {
448        if call.name == TOOL_PROBE_TOOL_NAME {
449            match &call.arguments {
450                Some(Value::Object(map))
451                    if map.get("value").and_then(Value::as_str) == Some(marker) =>
452                {
453                    return ToolConformanceCase {
454                        mode,
455                        ok: true,
456                        classification: ToolProbeClassification::StructuredNativeToolCall,
457                        fallback_mode: ToolProbeFallbackMode::Native,
458                        failure_reason: None,
459                        http_status,
460                        elapsed_ms,
461                        native_tool_call_count: native_count,
462                        text_tool_call_count: 0,
463                        parser_errors: Vec::new(),
464                        protocol_violations: Vec::new(),
465                        content_sample: content_sample(response),
466                    };
467                }
468                Some(Value::Object(_)) => {}
469                _ => malformed_native = true,
470            }
471        }
472    }
473
474    let content = extract_content(response);
475    let tools = probe_tool_registry();
476    let parsed = crate::llm::tools::parse_text_tool_calls_with_tools(&content, Some(&tools));
477    let text_count = parsed.calls.len();
478    let text_pass = parsed.calls.iter().any(|call| {
479        call.get("name").and_then(Value::as_str) == Some(TOOL_PROBE_TOOL_NAME)
480            && call
481                .get("arguments")
482                .and_then(|args| args.get("value"))
483                .and_then(Value::as_str)
484                == Some(marker)
485    });
486    if text_pass {
487        return ToolConformanceCase {
488            mode,
489            ok: true,
490            classification: ToolProbeClassification::ParseableHarnTextToolCall,
491            fallback_mode: ToolProbeFallbackMode::Text,
492            failure_reason: None,
493            http_status,
494            elapsed_ms,
495            native_tool_call_count: native_count,
496            text_tool_call_count: text_count,
497            parser_errors: parsed.errors,
498            protocol_violations: parsed.violations,
499            content_sample: sample_content(&content),
500        };
501    }
502
503    let (classification, failure_reason) = if malformed_native || !parsed.errors.is_empty() {
504        (
505            ToolProbeClassification::MalformedJsonArguments,
506            Some(first_non_empty(
507                parsed.errors.first().cloned(),
508                "malformed_tool_arguments",
509            )),
510        )
511    } else if content.trim().is_empty() && native_count == 0 {
512        (
513            ToolProbeClassification::EmptySilent,
514            Some("empty_silent_response".to_string()),
515        )
516    } else if has_raw_model_tool_tag(&content) {
517        (
518            ToolProbeClassification::RawModelToolTag,
519            Some("raw_tool_tag_no_structured_calls".to_string()),
520        )
521    } else {
522        (
523            ToolProbeClassification::ProseOnlyNonTool,
524            Some("no_executable_tool_call".to_string()),
525        )
526    };
527
528    ToolConformanceCase {
529        mode,
530        ok: false,
531        classification,
532        fallback_mode: ToolProbeFallbackMode::Disabled,
533        failure_reason,
534        http_status,
535        elapsed_ms,
536        native_tool_call_count: native_count,
537        text_tool_call_count: text_count,
538        parser_errors: parsed.errors,
539        protocol_violations: parsed.violations,
540        content_sample: sample_content(&content),
541    }
542}
543
544fn chat_url(def: &ProviderDef, base_url: &str) -> Result<String, String> {
545    let endpoint = if def.chat_endpoint.trim().is_empty() {
546        "/v1/chat/completions"
547    } else {
548        def.chat_endpoint.as_str()
549    };
550    let url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
551        endpoint.to_string()
552    } else if endpoint.starts_with('/') {
553        format!("{}{}", base_url.trim_end_matches('/'), endpoint)
554    } else {
555        format!("{}/{}", base_url.trim_end_matches('/'), endpoint)
556    };
557    reqwest::Url::parse(&url)
558        .map(|_| url.clone())
559        .map_err(|error| format!("invalid provider chat URL '{url}': {error}"))
560}
561
562fn probe_request_body(provider: &str, model: &str, mode: ToolProbeMode, marker: &str) -> Value {
563    let prompt = format!(
564        "Call the {TOOL_PROBE_TOOL_NAME} tool exactly once with value {marker:?}. Do not answer in prose."
565    );
566    let tool = json!({
567        "type": "function",
568        "function": {
569            "name": TOOL_PROBE_TOOL_NAME,
570            "description": "Echo the probe marker exactly.",
571            "parameters": {
572                "type": "object",
573                "properties": {
574                    "value": {
575                        "type": "string",
576                        "description": "The marker value to echo."
577                    }
578                },
579                "required": ["value"],
580                "additionalProperties": false
581            }
582        }
583    });
584    let mut body = json!({
585        "model": model,
586        "messages": [{"role": "user", "content": prompt}],
587        "tools": [tool],
588        "stream": mode == ToolProbeMode::Streaming,
589        "temperature": 0,
590    });
591    if !crate::llm::provider::provider_uses_ollama_messages(provider, model) {
592        body["tool_choice"] = json!({
593            "type": "function",
594            "function": {"name": TOOL_PROBE_TOOL_NAME}
595        });
596    }
597    body
598}
599
600#[derive(Debug)]
601struct NativeToolCall {
602    name: String,
603    arguments: Option<Value>,
604}
605
606fn extract_native_tool_calls(response: &Value) -> Vec<NativeToolCall> {
607    let mut calls = Vec::new();
608    visit_native_tool_call_arrays(response, &mut calls);
609    calls
610}
611
612fn visit_native_tool_call_arrays(value: &Value, calls: &mut Vec<NativeToolCall>) {
613    match value {
614        Value::Object(map) => {
615            if let Some(tool_calls) = map.get("tool_calls").and_then(Value::as_array) {
616                for item in tool_calls {
617                    if let Some(call) = parse_native_tool_call(item) {
618                        calls.push(call);
619                    }
620                }
621            }
622            for child in map.values() {
623                visit_native_tool_call_arrays(child, calls);
624            }
625        }
626        Value::Array(items) => {
627            for item in items {
628                visit_native_tool_call_arrays(item, calls);
629            }
630        }
631        _ => {}
632    }
633}
634
635fn parse_native_tool_call(item: &Value) -> Option<NativeToolCall> {
636    let obj = item.as_object()?;
637    let function = obj.get("function").and_then(Value::as_object);
638    let name = function
639        .and_then(|function| function.get("name"))
640        .or_else(|| obj.get("name"))
641        .and_then(Value::as_str)?
642        .to_string();
643    let raw_args = function
644        .and_then(|function| function.get("arguments"))
645        .or_else(|| obj.get("arguments"));
646    let arguments = match raw_args {
647        Some(Value::String(raw)) => serde_json::from_str::<Value>(raw).ok(),
648        Some(value @ Value::Object(_)) => Some(value.clone()),
649        Some(_) => None,
650        None => Some(json!({})),
651    };
652    Some(NativeToolCall { name, arguments })
653}
654
655fn extract_content(response: &Value) -> String {
656    let mut parts = Vec::new();
657    visit_content(response, &mut parts);
658    parts
659        .into_iter()
660        .filter(|part| !part.trim().is_empty())
661        .collect::<Vec<_>>()
662        .join("\n")
663}
664
665fn visit_content(value: &Value, parts: &mut Vec<String>) {
666    match value {
667        Value::Object(map) => {
668            for key in ["content", "response", "text"] {
669                if let Some(text) = map.get(key).and_then(Value::as_str) {
670                    parts.push(text.to_string());
671                }
672            }
673            for child in map.values() {
674                visit_content(child, parts);
675            }
676        }
677        Value::Array(items) => {
678            for item in items {
679                visit_content(item, parts);
680            }
681        }
682        _ => {}
683    }
684}
685
686fn aggregate_stream_text(text: &str, _provider: &str) -> Value {
687    let mut content = String::new();
688    let mut calls: BTreeMap<String, PartialStreamCall> = BTreeMap::new();
689    let mut frames = Vec::new();
690    for raw_line in text.lines() {
691        let line = raw_line.trim();
692        if line.is_empty() {
693            continue;
694        }
695        let payload = line.strip_prefix("data:").map(str::trim).unwrap_or(line);
696        if payload == "[DONE]" {
697            continue;
698        }
699        let Ok(frame) = serde_json::from_str::<Value>(payload) else {
700            continue;
701        };
702        collect_stream_content_and_calls(&frame, &mut content, &mut calls);
703        frames.push(frame);
704    }
705    let tool_calls: Vec<Value> = calls
706        .into_values()
707        .map(|call| {
708            json!({
709                "id": call.id.unwrap_or_else(|| "stream_tool".to_string()),
710                "type": "function",
711                "function": {
712                    "name": call.name.unwrap_or_default(),
713                    "arguments": call.arguments,
714                }
715            })
716        })
717        .collect();
718    json!({
719        "content": content,
720        "tool_calls": tool_calls,
721        "frames": frames,
722    })
723}
724
/// A tool call being assembled from streamed deltas.
#[derive(Debug, Default)]
struct PartialStreamCall {
    /// Most recent id seen for this call, if any.
    id: Option<String>,
    /// Most recent function name seen for this call, if any.
    name: Option<String>,
    /// Argument text accumulated so far (string deltas are appended; an
    /// object replaces the text with its JSON rendering).
    arguments: String,
}
731
732fn collect_stream_content_and_calls(
733    frame: &Value,
734    content: &mut String,
735    calls: &mut BTreeMap<String, PartialStreamCall>,
736) {
737    if let Some(text) = frame
738        .pointer("/message/content")
739        .or_else(|| frame.pointer("/choices/0/delta/content"))
740        .or_else(|| frame.pointer("/choices/0/message/content"))
741        .or_else(|| frame.get("response"))
742        .and_then(Value::as_str)
743    {
744        content.push_str(text);
745    }
746    for item in frame
747        .pointer("/message/tool_calls")
748        .or_else(|| frame.pointer("/choices/0/delta/tool_calls"))
749        .or_else(|| frame.pointer("/choices/0/message/tool_calls"))
750        .and_then(Value::as_array)
751        .into_iter()
752        .flatten()
753    {
754        let key = item
755            .get("index")
756            .and_then(Value::as_u64)
757            .map(|index| index.to_string())
758            .or_else(|| item.get("id").and_then(Value::as_str).map(str::to_string))
759            .unwrap_or_else(|| calls.len().to_string());
760        let slot = calls.entry(key).or_default();
761        if let Some(id) = item.get("id").and_then(Value::as_str) {
762            slot.id = Some(id.to_string());
763        }
764        if let Some(name) = item
765            .pointer("/function/name")
766            .or_else(|| item.get("name"))
767            .and_then(Value::as_str)
768        {
769            slot.name = Some(name.to_string());
770        }
771        if let Some(arguments) = item
772            .pointer("/function/arguments")
773            .or_else(|| item.get("arguments"))
774        {
775            match arguments {
776                Value::String(delta) => slot.arguments.push_str(delta),
777                Value::Object(_) => slot.arguments = arguments.to_string(),
778                _ => {}
779            }
780        }
781    }
782}
783
784fn probe_tool_registry() -> VmValue {
785    let mut value_param = BTreeMap::new();
786    value_param.insert("type".to_string(), vm_str("string"));
787    value_param.insert(
788        "description".to_string(),
789        vm_str("The marker value to echo."),
790    );
791    let mut params = BTreeMap::new();
792    params.insert(
793        "value".to_string(),
794        VmValue::Dict(std::sync::Arc::new(value_param)),
795    );
796    let tool = vm_dict(&[
797        ("name", vm_str(TOOL_PROBE_TOOL_NAME)),
798        ("description", vm_str("Echo the probe marker exactly.")),
799        ("parameters", VmValue::Dict(std::sync::Arc::new(params))),
800    ]);
801    vm_dict(&[("tools", VmValue::List(std::sync::Arc::new(vec![tool])))])
802}
803
804fn vm_str(value: &str) -> VmValue {
805    VmValue::String(std::sync::Arc::from(value))
806}
807
808fn vm_dict(pairs: &[(&str, VmValue)]) -> VmValue {
809    let mut map = BTreeMap::new();
810    for (key, value) in pairs {
811        map.insert((*key).to_string(), value.clone());
812    }
813    VmValue::Dict(std::sync::Arc::new(map))
814}
815
/// Reports whether `content` contains any raw tool-tag-like token, compared
/// ASCII case-insensitively.
fn has_raw_model_tool_tag(content: &str) -> bool {
    // Note: "call:" also matches everything "tool_call:" matches.
    const RAW_TAG_NEEDLES: [&str; 6] = [
        "<tool_call",
        "<toolcall",
        "tool_code:",
        "tool_call:",
        "call:",
        "<function",
    ];
    let lowered = content.to_ascii_lowercase();
    RAW_TAG_NEEDLES
        .iter()
        .any(|needle| lowered.contains(*needle))
}
825
826fn content_sample(response: &Value) -> Option<String> {
827    sample_content(&extract_content(response))
828}
829
/// Returns the first 240 characters of the trimmed `content`, or `None` when
/// nothing but whitespace is left.
fn sample_content(content: &str) -> Option<String> {
    match content.trim() {
        "" => None,
        trimmed => Some(trimmed.chars().take(240).collect()),
    }
}
838
/// Formats a failure message: `fallback` alone when `text` is blank, otherwise
/// `fallback` followed by `": "` and the first 240 characters of the trimmed
/// text.
fn sample_failure(text: &str, fallback: &str) -> String {
    match text.trim() {
        "" => fallback.to_string(),
        body => {
            let excerpt: String = body.chars().take(240).collect();
            format!("{fallback}: {excerpt}")
        }
    }
}
850
/// Returns `value` when it holds non-blank text, otherwise `fallback`.
fn first_non_empty(value: Option<String>, fallback: &str) -> String {
    match value {
        Some(text) if !text.trim().is_empty() => text,
        _ => fallback.to_string(),
    }
}
856
857fn elapsed_ms(clock: &dyn harn_clock::Clock, started_ms: i64) -> u64 {
858    clock.monotonic_ms().saturating_sub(started_ms).max(0) as u64
859}
860
#[cfg(test)]
mod tests {
    use super::*;

    /// Classifies `raw` as a non-streaming fixture with the default marker.
    fn non_streaming_report(provider: &str, model: &str, raw: &str) -> ToolConformanceReport {
        classify_tool_conformance_fixture(
            provider,
            model,
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            raw,
        )
    }

    #[test]
    fn classify_openai_native_tool_call_as_pass() {
        let report = non_streaming_report(
            "local",
            "model",
            r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker","arguments":"{\"value\":\"harn_tool_probe_marker\"}"}}]}}]}"#,
        );
        let summary = &report.tool_calling;
        assert_eq!(summary.native, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Native);
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    #[test]
    fn classify_gemma_raw_json_tool_call_content_as_text_fallback() {
        let report = non_streaming_report(
            "ollama",
            "gemma4:26b",
            r#"{"message":{"content":"<tool_call>{\"name\":\"echo_marker\",\"arguments\":{\"value\":\"harn_tool_probe_marker\"}}</tool_call>"}}"#,
        );
        let summary = &report.tool_calling;
        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Text);
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::ParseableHarnTextToolCall
        );
    }

    #[test]
    fn classify_qwen_call_colon_marker_as_text_fallback() {
        let report = non_streaming_report(
            "llamacpp",
            "qwen",
            r#"{"content":"call:echo_marker{ value: \"harn_tool_probe_marker\" }"}"#,
        );
        let summary = &report.tool_calling;
        assert_eq!(summary.text, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Text);
    }

    #[test]
    fn classify_prose_only_as_disabled() {
        let report = non_streaming_report(
            "ollama",
            "gemma4:26b",
            r#"{"message":{"content":"The comment has been added. I will now verify it."}}"#,
        );
        let case = &report.cases[0];
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Disabled
        );
        assert_eq!(
            case.classification,
            ToolProbeClassification::ProseOnlyNonTool
        );
        assert_eq!(
            case.failure_reason.as_deref(),
            Some("no_executable_tool_call")
        );
    }

    #[test]
    fn aggregates_openai_streaming_tool_call_deltas() {
        let raw = concat!(
            "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"call_1\",\"function\":{\"name\":\"echo_marker\",\"arguments\":\"{\\\"value\\\":\"}}]}}]}\n",
            "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"function\":{\"arguments\":\"\\\"harn_tool_probe_marker\\\"}\"}}]}}]}\n",
            "data: [DONE]\n",
        );
        let response = aggregate_stream_text(raw, "local");
        let case = classify_tool_probe_response(
            ToolProbeMode::Streaming,
            &response,
            DEFAULT_TOOL_PROBE_MARKER,
            None,
            None,
        );
        assert!(case.ok, "{case:?}");
        assert_eq!(
            case.classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    #[test]
    fn report_satisfies_tool_probe_when_text_fallback_passes() {
        let report = non_streaming_report(
            "llamacpp",
            "qwen",
            r#"{"content":"echo_marker({ value: \"harn_tool_probe_marker\" })"}"#,
        );
        assert!(report_satisfies_required_probe(&report, "tool_probe"));
        assert!(!report_satisfies_required_probe(
            &report,
            "native_tool_probe"
        ));
    }
}