Skip to main content

harn_vm/llm/
tool_conformance.rs

//! One-tool provider conformance probe for local/runtime tool calling.
//!
//! The probe is deliberately tiny: define one harmless `echo_marker` tool,
//! ask the model to call it with a fixed marker, and classify what came back.
//! The classification is the stable contract eval harnesses consume; the live
//! HTTP runner is a convenience around that classifier.
7
8use std::collections::BTreeMap;
9use std::rc::Rc;
10
11use serde::{Deserialize, Serialize};
12use serde_json::{json, Value};
13
14use crate::llm_config::{self, ProviderDef};
15use crate::value::VmValue;
16
/// Version number written into the `schema_version` field of every report built by this module.
pub const TOOL_CONFORMANCE_SCHEMA_VERSION: u32 = 1;
/// Name of the single tool the probe declares and expects the model to call.
pub const TOOL_PROBE_TOOL_NAME: &str = "echo_marker";
/// Marker value that `ToolConformanceProbeOptions::new` installs by default.
pub const DEFAULT_TOOL_PROBE_MARKER: &str = "harn_tool_probe_marker";
20
/// Inputs for one live run of the tool conformance probe.
#[derive(Debug, Clone)]
pub struct ToolConformanceProbeOptions {
    /// Provider name; when blank, the provider resolved from `model` is used instead.
    pub provider: String,
    /// Model identifier or alias, resolved through `llm_config::resolve_model_info`.
    pub model: String,
    /// Optional base URL override; when absent the provider configuration supplies it.
    pub base_url: Option<String>,
    /// Modes to probe. An empty list means both modes; duplicates are dropped.
    pub modes: Vec<ToolProbeMode>,
    /// Value the model is asked to echo back through the probe tool.
    pub marker: String,
    /// Per-request timeout, in seconds.
    pub timeout_secs: u64,
}
30
31impl ToolConformanceProbeOptions {
32    pub fn new(provider: impl Into<String>, model: impl Into<String>) -> Self {
33        Self {
34            provider: provider.into(),
35            model: model.into(),
36            base_url: None,
37            modes: vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming],
38            marker: DEFAULT_TOOL_PROBE_MARKER.to_string(),
39            timeout_secs: 120,
40        }
41    }
42}
43
44#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
45#[serde(rename_all = "snake_case")]
46pub enum ToolProbeMode {
47    NonStreaming,
48    Streaming,
49}
50
51impl ToolProbeMode {
52    pub fn as_str(self) -> &'static str {
53        match self {
54            Self::NonStreaming => "non_streaming",
55            Self::Streaming => "streaming",
56        }
57    }
58}
59
/// Outcome category assigned to a single probe response.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ToolProbeClassification {
    /// A native `tool_calls` entry named the probe tool and carried the marker
    /// under `value` in an arguments object.
    StructuredNativeToolCall,
    /// The text tool-call parser found a probe-tool call with the marker in the
    /// response content.
    ParseableHarnTextToolCall,
    /// The content contains a raw tool-call tag or prefix but no passing call.
    RawModelToolTag,
    /// None of the other categories applied; the response had no usable tool call.
    ProseOnlyNonTool,
    /// A native probe-tool call had arguments that were not a JSON object, or
    /// the text parser reported errors.
    MalformedJsonArguments,
    /// The response had no content and no native tool calls.
    EmptySilent,
    /// The provider answered with a non-success HTTP status.
    HttpError,
    /// The request could not be prepared, sent, or its body read.
    TransportError,
}
72
73#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
74#[serde(rename_all = "snake_case")]
75pub enum ToolProbeStatus {
76    Pass,
77    Fail,
78    Unknown,
79}
80
81impl ToolProbeStatus {
82    pub fn as_str(&self) -> &'static str {
83        match self {
84            Self::Pass => "pass",
85            Self::Fail => "fail",
86            Self::Unknown => "unknown",
87        }
88    }
89}
90
91#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
92#[serde(rename_all = "snake_case")]
93pub enum ToolProbeFallbackMode {
94    Native,
95    Text,
96    Disabled,
97}
98
99impl ToolProbeFallbackMode {
100    pub fn as_str(&self) -> &'static str {
101        match self {
102            Self::Native => "native",
103            Self::Text => "text",
104            Self::Disabled => "disabled",
105        }
106    }
107}
108
/// Full result of a probe run: identifying metadata, one entry per probed
/// mode, and the summary derived from those entries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolConformanceReport {
    /// Set to `TOOL_CONFORMANCE_SCHEMA_VERSION` when the report is built.
    pub schema_version: u32,
    /// Provider the probe was run against.
    pub provider: String,
    /// Model identifier the probe was run against.
    pub model: String,
    /// Base URL associated with the run; omitted from serialized output when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub base_url: Option<String>,
    /// Name of the probe tool (`TOOL_PROBE_TOOL_NAME`).
    pub tool_name: String,
    /// Marker value the model was asked to echo.
    pub marker: String,
    /// Per-mode classification results.
    pub cases: Vec<ToolConformanceCase>,
    /// Summary computed from `cases`.
    pub tool_calling: ToolCallingConformanceSummary,
}
121
/// Aggregate verdict over all probe cases.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolCallingConformanceSummary {
    /// Verdict for structured tool calls in non-streaming mode.
    pub native: ToolProbeStatus,
    /// Verdict for text-format tool calls.
    pub text: ToolProbeStatus,
    /// Verdict for structured tool calls in streaming mode.
    pub streaming_native: ToolProbeStatus,
    /// `Native` if either native verdict passed, else `Text` if the text
    /// verdict passed, else `Disabled`.
    pub fallback_mode: ToolProbeFallbackMode,
    /// First failure reason found among the cases; only set when
    /// `fallback_mode` is `Disabled`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub failure_reason: Option<String>,
}
131
132#[derive(Debug, Clone, Serialize, Deserialize)]
133pub struct ToolConformanceCase {
134    pub mode: ToolProbeMode,
135    pub ok: bool,
136    pub classification: ToolProbeClassification,
137    pub fallback_mode: ToolProbeFallbackMode,
138    #[serde(skip_serializing_if = "Option::is_none")]
139    pub failure_reason: Option<String>,
140    #[serde(skip_serializing_if = "Option::is_none")]
141    pub http_status: Option<u16>,
142    #[serde(skip_serializing_if = "Option::is_none")]
143    pub elapsed_ms: Option<u64>,
144    pub native_tool_call_count: usize,
145    pub text_tool_call_count: usize,
146    #[serde(skip_serializing_if = "Vec::is_empty")]
147    pub parser_errors: Vec<String>,
148    #[serde(skip_serializing_if = "Vec::is_empty")]
149    pub protocol_violations: Vec<String>,
150    #[serde(skip_serializing_if = "Option::is_none")]
151    pub content_sample: Option<String>,
152}
153
154impl ToolConformanceCase {
155    fn transport_error(mode: ToolProbeMode, message: String, elapsed_ms: Option<u64>) -> Self {
156        Self {
157            mode,
158            ok: false,
159            classification: ToolProbeClassification::TransportError,
160            fallback_mode: ToolProbeFallbackMode::Disabled,
161            failure_reason: Some(message),
162            http_status: None,
163            elapsed_ms,
164            native_tool_call_count: 0,
165            text_tool_call_count: 0,
166            parser_errors: Vec::new(),
167            protocol_violations: Vec::new(),
168            content_sample: None,
169        }
170    }
171
172    fn http_error(
173        mode: ToolProbeMode,
174        status: u16,
175        message: String,
176        elapsed_ms: Option<u64>,
177    ) -> Self {
178        Self {
179            mode,
180            ok: false,
181            classification: ToolProbeClassification::HttpError,
182            fallback_mode: ToolProbeFallbackMode::Disabled,
183            failure_reason: Some(message),
184            http_status: Some(status),
185            elapsed_ms,
186            native_tool_call_count: 0,
187            text_tool_call_count: 0,
188            parser_errors: Vec::new(),
189            protocol_violations: Vec::new(),
190            content_sample: None,
191        }
192    }
193}
194
195pub async fn run_tool_conformance_probe(
196    options: ToolConformanceProbeOptions,
197) -> ToolConformanceReport {
198    let model = llm_config::resolve_model_info(&options.model);
199    let provider = if options.provider.trim().is_empty() {
200        model.provider.clone()
201    } else {
202        options.provider.clone()
203    };
204    let model_id = model.id;
205    let base_url = options.base_url.clone().or_else(|| {
206        llm_config::provider_config(&provider).map(|def| llm_config::resolve_base_url(&def))
207    });
208    let mut cases = Vec::new();
209    for mode in normalized_modes(&options.modes) {
210        cases.push(
211            execute_live_probe_case(
212                &provider,
213                &model_id,
214                base_url.as_deref(),
215                mode,
216                &options.marker,
217                options.timeout_secs,
218            )
219            .await,
220        );
221    }
222    report_from_cases(provider, model_id, base_url, options.marker, cases)
223}
224
225pub fn classify_tool_conformance_fixture(
226    provider: impl Into<String>,
227    model: impl Into<String>,
228    mode: ToolProbeMode,
229    marker: impl Into<String>,
230    raw: &str,
231) -> ToolConformanceReport {
232    let marker = marker.into();
233    let response = serde_json::from_str::<Value>(raw).unwrap_or_else(|_| json!({ "content": raw }));
234    let case = classify_tool_probe_response(mode, &response, &marker, None, None);
235    report_from_cases(provider.into(), model.into(), None, marker, vec![case])
236}
237
238pub fn report_satisfies_required_probe(report: &ToolConformanceReport, requirement: &str) -> bool {
239    match requirement {
240        "tool_probe" | "tool_call_probe" => {
241            report.tool_calling.fallback_mode != ToolProbeFallbackMode::Disabled
242                && report.cases.iter().any(|case| case.ok)
243        }
244        "native_tool_probe" => report.tool_calling.native == ToolProbeStatus::Pass,
245        "streaming_tool_probe" => report.tool_calling.streaming_native == ToolProbeStatus::Pass,
246        _ => false,
247    }
248}
249
250fn normalized_modes(modes: &[ToolProbeMode]) -> Vec<ToolProbeMode> {
251    if modes.is_empty() {
252        return vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming];
253    }
254    let mut out = Vec::new();
255    for mode in modes {
256        if !out.contains(mode) {
257            out.push(*mode);
258        }
259    }
260    out
261}
262
263fn report_from_cases(
264    provider: String,
265    model: String,
266    base_url: Option<String>,
267    marker: String,
268    cases: Vec<ToolConformanceCase>,
269) -> ToolConformanceReport {
270    let summary = summarize_cases(&cases);
271    ToolConformanceReport {
272        schema_version: TOOL_CONFORMANCE_SCHEMA_VERSION,
273        provider,
274        model,
275        base_url,
276        tool_name: TOOL_PROBE_TOOL_NAME.to_string(),
277        marker,
278        cases,
279        tool_calling: summary,
280    }
281}
282
283fn summarize_cases(cases: &[ToolConformanceCase]) -> ToolCallingConformanceSummary {
284    let mut native = ToolProbeStatus::Unknown;
285    let mut streaming_native = ToolProbeStatus::Unknown;
286    let mut text = ToolProbeStatus::Unknown;
287
288    for case in cases {
289        if case.classification == ToolProbeClassification::StructuredNativeToolCall {
290            if case.mode == ToolProbeMode::Streaming {
291                streaming_native = if case.ok {
292                    ToolProbeStatus::Pass
293                } else {
294                    ToolProbeStatus::Fail
295                };
296            } else {
297                native = if case.ok {
298                    ToolProbeStatus::Pass
299                } else {
300                    ToolProbeStatus::Fail
301                };
302            }
303        } else if case.mode == ToolProbeMode::Streaming
304            && streaming_native == ToolProbeStatus::Unknown
305        {
306            streaming_native = ToolProbeStatus::Fail;
307        } else if case.mode == ToolProbeMode::NonStreaming && native == ToolProbeStatus::Unknown {
308            native = ToolProbeStatus::Fail;
309        }
310
311        if case.classification == ToolProbeClassification::ParseableHarnTextToolCall {
312            text = if case.ok {
313                ToolProbeStatus::Pass
314            } else {
315                ToolProbeStatus::Fail
316            };
317        } else if text == ToolProbeStatus::Unknown && case.text_tool_call_count > 0 {
318            text = ToolProbeStatus::Fail;
319        }
320    }
321
322    let fallback_mode =
323        if native == ToolProbeStatus::Pass || streaming_native == ToolProbeStatus::Pass {
324            ToolProbeFallbackMode::Native
325        } else if text == ToolProbeStatus::Pass {
326            ToolProbeFallbackMode::Text
327        } else {
328            ToolProbeFallbackMode::Disabled
329        };
330
331    let failure_reason = if fallback_mode == ToolProbeFallbackMode::Disabled {
332        cases.iter().find_map(|case| case.failure_reason.clone())
333    } else {
334        None
335    };
336
337    ToolCallingConformanceSummary {
338        native,
339        text,
340        streaming_native,
341        fallback_mode,
342        failure_reason,
343    }
344}
345
/// Sends one probe request to the provider's chat endpoint and classifies the
/// response.
///
/// Returns a `TransportError` case when the provider is unknown, the chat URL
/// is invalid, the request fails to send, or the body cannot be read. Returns
/// an `HttpError` case for a non-success status. Otherwise the body is parsed
/// (stream frames are aggregated first in streaming mode) and handed to
/// `classify_tool_probe_response`.
async fn execute_live_probe_case(
    provider: &str,
    model: &str,
    base_url: Option<&str>,
    mode: ToolProbeMode,
    marker: &str,
    timeout_secs: u64,
) -> ToolConformanceCase {
    let clock = harn_clock::RealClock::arc();
    let started_ms = clock.monotonic_ms();
    let Some(def) = llm_config::provider_config(provider) else {
        return ToolConformanceCase::transport_error(
            mode,
            format!("unknown provider: {provider}"),
            Some(elapsed_ms(&*clock, started_ms)),
        );
    };
    // A missing or blank override falls back to the provider's configured base URL.
    let base_url = base_url
        .filter(|value| !value.trim().is_empty())
        .map(str::to_string)
        .unwrap_or_else(|| llm_config::resolve_base_url(&def));
    let url = match chat_url(&def, &base_url) {
        Ok(url) => url,
        Err(message) => {
            return ToolConformanceCase::transport_error(
                mode,
                message,
                Some(elapsed_ms(&*clock, started_ms)),
            );
        }
    };
    let body = probe_request_body(provider, model, mode, marker);
    // Streaming and non-streaming requests use different shared clients.
    let client = if mode == ToolProbeMode::Streaming {
        crate::llm::shared_streaming_client().clone()
    } else {
        crate::llm::shared_blocking_client().clone()
    };
    // A provider without a resolvable API key proceeds with an empty key.
    let api_key = crate::llm::helpers::resolve_api_key(provider)
        .map(|value| value.to_string())
        .unwrap_or_default();
    let request = client
        .post(&url)
        .header("Content-Type", "application/json")
        .timeout(std::time::Duration::from_secs(timeout_secs))
        .json(&body);
    let mut request = crate::llm::api::apply_auth_headers(request, &api_key, Some(&def));
    for (name, value) in &def.extra_headers {
        request = request.header(name.as_str(), value.as_str());
    }

    let response = match request.send().await {
        Ok(response) => response,
        Err(error) => {
            return ToolConformanceCase::transport_error(
                mode,
                format!("provider request failed: {error}"),
                Some(elapsed_ms(&*clock, started_ms)),
            );
        }
    };
    let status = response.status();
    // The whole body is read before classification, including in streaming mode.
    let text = match response.text().await {
        Ok(text) => text,
        Err(error) => {
            return ToolConformanceCase::transport_error(
                mode,
                format!("provider response was unreadable: {error}"),
                Some(elapsed_ms(&*clock, started_ms)),
            );
        }
    };
    let elapsed = Some(elapsed_ms(&*clock, started_ms));
    if !status.is_success() {
        return ToolConformanceCase::http_error(
            mode,
            status.as_u16(),
            sample_failure(&text, "provider returned non-success HTTP status"),
            elapsed,
        );
    }
    // A non-JSON body in non-streaming mode is treated as plain content.
    let response_value = if mode == ToolProbeMode::Streaming {
        aggregate_stream_text(&text, provider)
    } else {
        serde_json::from_str::<Value>(&text).unwrap_or_else(|_| json!({ "content": text }))
    };
    classify_tool_probe_response(
        mode,
        &response_value,
        marker,
        Some(status.as_u16()),
        elapsed,
    )
}
439
440fn classify_tool_probe_response(
441    mode: ToolProbeMode,
442    response: &Value,
443    marker: &str,
444    http_status: Option<u16>,
445    elapsed_ms: Option<u64>,
446) -> ToolConformanceCase {
447    let native = extract_native_tool_calls(response);
448    let native_count = native.len();
449    let mut malformed_native = false;
450    for call in &native {
451        if call.name == TOOL_PROBE_TOOL_NAME {
452            match &call.arguments {
453                Some(Value::Object(map))
454                    if map.get("value").and_then(Value::as_str) == Some(marker) =>
455                {
456                    return ToolConformanceCase {
457                        mode,
458                        ok: true,
459                        classification: ToolProbeClassification::StructuredNativeToolCall,
460                        fallback_mode: ToolProbeFallbackMode::Native,
461                        failure_reason: None,
462                        http_status,
463                        elapsed_ms,
464                        native_tool_call_count: native_count,
465                        text_tool_call_count: 0,
466                        parser_errors: Vec::new(),
467                        protocol_violations: Vec::new(),
468                        content_sample: content_sample(response),
469                    };
470                }
471                Some(Value::Object(_)) => {}
472                _ => malformed_native = true,
473            }
474        }
475    }
476
477    let content = extract_content(response);
478    let tools = probe_tool_registry();
479    let parsed = crate::llm::tools::parse_text_tool_calls_with_tools(&content, Some(&tools));
480    let text_count = parsed.calls.len();
481    let text_pass = parsed.calls.iter().any(|call| {
482        call.get("name").and_then(Value::as_str) == Some(TOOL_PROBE_TOOL_NAME)
483            && call
484                .get("arguments")
485                .and_then(|args| args.get("value"))
486                .and_then(Value::as_str)
487                == Some(marker)
488    });
489    if text_pass {
490        return ToolConformanceCase {
491            mode,
492            ok: true,
493            classification: ToolProbeClassification::ParseableHarnTextToolCall,
494            fallback_mode: ToolProbeFallbackMode::Text,
495            failure_reason: None,
496            http_status,
497            elapsed_ms,
498            native_tool_call_count: native_count,
499            text_tool_call_count: text_count,
500            parser_errors: parsed.errors,
501            protocol_violations: parsed.violations,
502            content_sample: sample_content(&content),
503        };
504    }
505
506    let (classification, failure_reason) = if malformed_native || !parsed.errors.is_empty() {
507        (
508            ToolProbeClassification::MalformedJsonArguments,
509            Some(first_non_empty(
510                parsed.errors.first().cloned(),
511                "malformed_tool_arguments",
512            )),
513        )
514    } else if content.trim().is_empty() && native_count == 0 {
515        (
516            ToolProbeClassification::EmptySilent,
517            Some("empty_silent_response".to_string()),
518        )
519    } else if has_raw_model_tool_tag(&content) {
520        (
521            ToolProbeClassification::RawModelToolTag,
522            Some("raw_tool_tag_no_structured_calls".to_string()),
523        )
524    } else {
525        (
526            ToolProbeClassification::ProseOnlyNonTool,
527            Some("no_executable_tool_call".to_string()),
528        )
529    };
530
531    ToolConformanceCase {
532        mode,
533        ok: false,
534        classification,
535        fallback_mode: ToolProbeFallbackMode::Disabled,
536        failure_reason,
537        http_status,
538        elapsed_ms,
539        native_tool_call_count: native_count,
540        text_tool_call_count: text_count,
541        parser_errors: parsed.errors,
542        protocol_violations: parsed.violations,
543        content_sample: sample_content(&content),
544    }
545}
546
547fn chat_url(def: &ProviderDef, base_url: &str) -> Result<String, String> {
548    let endpoint = if def.chat_endpoint.trim().is_empty() {
549        "/v1/chat/completions"
550    } else {
551        def.chat_endpoint.as_str()
552    };
553    let url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
554        endpoint.to_string()
555    } else if endpoint.starts_with('/') {
556        format!("{}{}", base_url.trim_end_matches('/'), endpoint)
557    } else {
558        format!("{}/{}", base_url.trim_end_matches('/'), endpoint)
559    };
560    reqwest::Url::parse(&url)
561        .map(|_| url.clone())
562        .map_err(|error| format!("invalid provider chat URL '{url}': {error}"))
563}
564
565fn probe_request_body(provider: &str, model: &str, mode: ToolProbeMode, marker: &str) -> Value {
566    let prompt = format!(
567        "Call the {TOOL_PROBE_TOOL_NAME} tool exactly once with value {marker:?}. Do not answer in prose."
568    );
569    let tool = json!({
570        "type": "function",
571        "function": {
572            "name": TOOL_PROBE_TOOL_NAME,
573            "description": "Echo the probe marker exactly.",
574            "parameters": {
575                "type": "object",
576                "properties": {
577                    "value": {
578                        "type": "string",
579                        "description": "The marker value to echo."
580                    }
581                },
582                "required": ["value"],
583                "additionalProperties": false
584            }
585        }
586    });
587    let mut body = json!({
588        "model": model,
589        "messages": [{"role": "user", "content": prompt}],
590        "tools": [tool],
591        "stream": mode == ToolProbeMode::Streaming,
592        "temperature": 0,
593    });
594    if !crate::llm::provider::provider_uses_ollama_messages(provider, model) {
595        body["tool_choice"] = json!({
596            "type": "function",
597            "function": {"name": TOOL_PROBE_TOOL_NAME}
598        });
599    }
600    body
601}
602
/// A tool call read from a `tool_calls` array in a provider response.
#[derive(Debug)]
struct NativeToolCall {
    /// Tool name, taken from `function.name` or the entry's own `name`.
    name: String,
    /// Parsed arguments. `None` means the arguments were present but could not
    /// be read as JSON (unparseable string, or a non-string, non-object value).
    arguments: Option<Value>,
}
608
609fn extract_native_tool_calls(response: &Value) -> Vec<NativeToolCall> {
610    let mut calls = Vec::new();
611    visit_native_tool_call_arrays(response, &mut calls);
612    calls
613}
614
615fn visit_native_tool_call_arrays(value: &Value, calls: &mut Vec<NativeToolCall>) {
616    match value {
617        Value::Object(map) => {
618            if let Some(tool_calls) = map.get("tool_calls").and_then(Value::as_array) {
619                for item in tool_calls {
620                    if let Some(call) = parse_native_tool_call(item) {
621                        calls.push(call);
622                    }
623                }
624            }
625            for child in map.values() {
626                visit_native_tool_call_arrays(child, calls);
627            }
628        }
629        Value::Array(items) => {
630            for item in items {
631                visit_native_tool_call_arrays(item, calls);
632            }
633        }
634        _ => {}
635    }
636}
637
638fn parse_native_tool_call(item: &Value) -> Option<NativeToolCall> {
639    let obj = item.as_object()?;
640    let function = obj.get("function").and_then(Value::as_object);
641    let name = function
642        .and_then(|function| function.get("name"))
643        .or_else(|| obj.get("name"))
644        .and_then(Value::as_str)?
645        .to_string();
646    let raw_args = function
647        .and_then(|function| function.get("arguments"))
648        .or_else(|| obj.get("arguments"));
649    let arguments = match raw_args {
650        Some(Value::String(raw)) => serde_json::from_str::<Value>(raw).ok(),
651        Some(value @ Value::Object(_)) => Some(value.clone()),
652        Some(_) => None,
653        None => Some(json!({})),
654    };
655    Some(NativeToolCall { name, arguments })
656}
657
658fn extract_content(response: &Value) -> String {
659    let mut parts = Vec::new();
660    visit_content(response, &mut parts);
661    parts
662        .into_iter()
663        .filter(|part| !part.trim().is_empty())
664        .collect::<Vec<_>>()
665        .join("\n")
666}
667
668fn visit_content(value: &Value, parts: &mut Vec<String>) {
669    match value {
670        Value::Object(map) => {
671            for key in ["content", "response", "text"] {
672                if let Some(text) = map.get(key).and_then(Value::as_str) {
673                    parts.push(text.to_string());
674                }
675            }
676            for child in map.values() {
677                visit_content(child, parts);
678            }
679        }
680        Value::Array(items) => {
681            for item in items {
682                visit_content(item, parts);
683            }
684        }
685        _ => {}
686    }
687}
688
689fn aggregate_stream_text(text: &str, _provider: &str) -> Value {
690    let mut content = String::new();
691    let mut calls: BTreeMap<String, PartialStreamCall> = BTreeMap::new();
692    let mut frames = Vec::new();
693    for raw_line in text.lines() {
694        let line = raw_line.trim();
695        if line.is_empty() {
696            continue;
697        }
698        let payload = line.strip_prefix("data:").map(str::trim).unwrap_or(line);
699        if payload == "[DONE]" {
700            continue;
701        }
702        let Ok(frame) = serde_json::from_str::<Value>(payload) else {
703            continue;
704        };
705        collect_stream_content_and_calls(&frame, &mut content, &mut calls);
706        frames.push(frame);
707    }
708    let tool_calls: Vec<Value> = calls
709        .into_values()
710        .map(|call| {
711            json!({
712                "id": call.id.unwrap_or_else(|| "stream_tool".to_string()),
713                "type": "function",
714                "function": {
715                    "name": call.name.unwrap_or_default(),
716                    "arguments": call.arguments,
717                }
718            })
719        })
720        .collect();
721    json!({
722        "content": content,
723        "tool_calls": tool_calls,
724        "frames": frames,
725    })
726}
727
/// Accumulator for one tool call that arrives across several stream frames.
#[derive(Debug, Default)]
struct PartialStreamCall {
    /// Most recent call id seen for this slot, if any.
    id: Option<String>,
    /// Most recent tool name seen for this slot, if any.
    name: Option<String>,
    /// Argument text: string fragments appended in arrival order, or the
    /// serialized form of an arguments object.
    arguments: String,
}
734
735fn collect_stream_content_and_calls(
736    frame: &Value,
737    content: &mut String,
738    calls: &mut BTreeMap<String, PartialStreamCall>,
739) {
740    if let Some(text) = frame
741        .pointer("/message/content")
742        .or_else(|| frame.pointer("/choices/0/delta/content"))
743        .or_else(|| frame.pointer("/choices/0/message/content"))
744        .or_else(|| frame.get("response"))
745        .and_then(Value::as_str)
746    {
747        content.push_str(text);
748    }
749    for item in frame
750        .pointer("/message/tool_calls")
751        .or_else(|| frame.pointer("/choices/0/delta/tool_calls"))
752        .or_else(|| frame.pointer("/choices/0/message/tool_calls"))
753        .and_then(Value::as_array)
754        .into_iter()
755        .flatten()
756    {
757        let key = item
758            .get("index")
759            .and_then(Value::as_u64)
760            .map(|index| index.to_string())
761            .or_else(|| item.get("id").and_then(Value::as_str).map(str::to_string))
762            .unwrap_or_else(|| calls.len().to_string());
763        let slot = calls.entry(key).or_default();
764        if let Some(id) = item.get("id").and_then(Value::as_str) {
765            slot.id = Some(id.to_string());
766        }
767        if let Some(name) = item
768            .pointer("/function/name")
769            .or_else(|| item.get("name"))
770            .and_then(Value::as_str)
771        {
772            slot.name = Some(name.to_string());
773        }
774        if let Some(arguments) = item
775            .pointer("/function/arguments")
776            .or_else(|| item.get("arguments"))
777        {
778            match arguments {
779                Value::String(delta) => slot.arguments.push_str(delta),
780                Value::Object(_) => slot.arguments = arguments.to_string(),
781                _ => {}
782            }
783        }
784    }
785}
786
787fn probe_tool_registry() -> VmValue {
788    let mut value_param = BTreeMap::new();
789    value_param.insert("type".to_string(), vm_str("string"));
790    value_param.insert(
791        "description".to_string(),
792        vm_str("The marker value to echo."),
793    );
794    let mut params = BTreeMap::new();
795    params.insert("value".to_string(), VmValue::Dict(Rc::new(value_param)));
796    let tool = vm_dict(&[
797        ("name", vm_str(TOOL_PROBE_TOOL_NAME)),
798        ("description", vm_str("Echo the probe marker exactly.")),
799        ("parameters", VmValue::Dict(Rc::new(params))),
800    ]);
801    vm_dict(&[("tools", VmValue::List(Rc::new(vec![tool])))])
802}
803
804fn vm_str(value: &str) -> VmValue {
805    VmValue::String(Rc::from(value))
806}
807
808fn vm_dict(pairs: &[(&str, VmValue)]) -> VmValue {
809    let mut map = BTreeMap::new();
810    for (key, value) in pairs {
811        map.insert((*key).to_string(), value.clone());
812    }
813    VmValue::Dict(Rc::new(map))
814}
815
/// Reports whether `content` contains any of the known raw tool-call tags or
/// prefixes, compared ASCII case-insensitively.
fn has_raw_model_tool_tag(content: &str) -> bool {
    const RAW_TAG_MARKERS: [&str; 6] = [
        "<tool_call",
        "<toolcall",
        "tool_code:",
        "tool_call:",
        "call:",
        "<function",
    ];
    let lowered = content.to_ascii_lowercase();
    RAW_TAG_MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker))
}
825
826fn content_sample(response: &Value) -> Option<String> {
827    sample_content(&extract_content(response))
828}
829
/// Returns the first 240 characters of the trimmed `content`, or `None` when
/// the trimmed text is empty.
fn sample_content(content: &str) -> Option<String> {
    let trimmed = content.trim();
    (!trimmed.is_empty()).then(|| trimmed.chars().take(240).collect())
}
838
/// Formats a failure message from a response body: `fallback` alone when the
/// trimmed body is empty, otherwise `fallback` followed by `": "` and the
/// first 240 characters of the trimmed body.
fn sample_failure(text: &str, fallback: &str) -> String {
    match text.trim() {
        "" => fallback.to_string(),
        body => {
            let excerpt: String = body.chars().take(240).collect();
            format!("{fallback}: {excerpt}")
        }
    }
}
850
/// Returns `value` unchanged when it holds non-blank text, otherwise an owned
/// copy of `fallback`.
fn first_non_empty(value: Option<String>, fallback: &str) -> String {
    match value {
        Some(text) if !text.trim().is_empty() => text,
        _ => fallback.to_string(),
    }
}
856
857fn elapsed_ms(clock: &dyn harn_clock::Clock, started_ms: i64) -> u64 {
858    clock.monotonic_ms().saturating_sub(started_ms).max(0) as u64
859}
860
// Fixture-based tests: each feeds a recorded response body to the classifier
// and checks the resulting classification or summary. No network is used.
#[cfg(test)]
mod tests {
    use super::*;

    // An OpenAI-shaped response with a structured tool call carrying the
    // marker must pass as a native tool call.
    #[test]
    fn classify_openai_native_tool_call_as_pass() {
        let report = classify_tool_conformance_fixture(
            "local",
            "model",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker","arguments":"{\"value\":\"harn_tool_probe_marker\"}"}}]}}]}"#,
        );
        assert_eq!(report.tool_calling.native, ToolProbeStatus::Pass);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Native
        );
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    // A `<tool_call>` JSON block inside the message content is not a native
    // call, but it parses as a text tool call and enables the text fallback.
    #[test]
    fn classify_gemma_raw_json_tool_call_content_as_text_fallback() {
        let report = classify_tool_conformance_fixture(
            "ollama",
            "gemma4:26b",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            r#"{"message":{"content":"<tool_call>{\"name\":\"echo_marker\",\"arguments\":{\"value\":\"harn_tool_probe_marker\"}}</tool_call>"}}"#,
        );
        assert_eq!(report.tool_calling.native, ToolProbeStatus::Fail);
        assert_eq!(report.tool_calling.text, ToolProbeStatus::Pass);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Text
        );
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::ParseableHarnTextToolCall
        );
    }

    // The `call:name{...}` text form is also accepted as a text tool call.
    #[test]
    fn classify_qwen_call_colon_marker_as_text_fallback() {
        let report = classify_tool_conformance_fixture(
            "llamacpp",
            "qwen",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            r#"{"content":"call:echo_marker{ value: \"harn_tool_probe_marker\" }"}"#,
        );
        assert_eq!(report.tool_calling.text, ToolProbeStatus::Pass);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Text
        );
    }

    // Plain prose with no tool call disables tool calling and reports the
    // `no_executable_tool_call` reason.
    #[test]
    fn classify_prose_only_as_disabled() {
        let report = classify_tool_conformance_fixture(
            "ollama",
            "gemma4:26b",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            r#"{"message":{"content":"The comment has been added. I will now verify it."}}"#,
        );
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Disabled
        );
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::ProseOnlyNonTool
        );
        assert_eq!(
            report.cases[0].failure_reason.as_deref(),
            Some("no_executable_tool_call")
        );
    }

    // Argument fragments split across two streamed deltas with the same
    // index must be reassembled into one passing native call.
    #[test]
    fn aggregates_openai_streaming_tool_call_deltas() {
        let raw = "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"call_1\",\"function\":{\"name\":\"echo_marker\",\"arguments\":\"{\\\"value\\\":\"}}]}}]}\n\
                   data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"function\":{\"arguments\":\"\\\"harn_tool_probe_marker\\\"}\"}}]}}]}\n\
                   data: [DONE]\n";
        let response = aggregate_stream_text(raw, "local");
        let case = classify_tool_probe_response(
            ToolProbeMode::Streaming,
            &response,
            DEFAULT_TOOL_PROBE_MARKER,
            None,
            None,
        );
        assert!(case.ok, "{case:?}");
        assert_eq!(
            case.classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    // A passing text fallback satisfies the generic `tool_probe` requirement
    // but not the stricter `native_tool_probe` one.
    #[test]
    fn report_satisfies_tool_probe_when_text_fallback_passes() {
        let report = classify_tool_conformance_fixture(
            "llamacpp",
            "qwen",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            r#"{"content":"echo_marker({ value: \"harn_tool_probe_marker\" })"}"#,
        );
        assert!(report_satisfies_required_probe(&report, "tool_probe"));
        assert!(!report_satisfies_required_probe(
            &report,
            "native_tool_probe"
        ));
    }
}