Skip to main content

harn_vm/llm/
tool_conformance.rs

1//! One-tool provider conformance probe for local/runtime tool calling.
2//!
3//! The probe is deliberately tiny: define one harmless `echo_marker` tool,
4//! ask the model to call it with a fixed marker, and classify what came back.
5//! The classification is the stable contract eval harnesses consume; the live
6//! HTTP runner is a convenience around that classifier.
7
8use std::collections::BTreeMap;
9use std::rc::Rc;
10
11use serde::{Deserialize, Serialize};
12use serde_json::{json, Value};
13
14use crate::llm_config::{self, ProviderDef};
15use crate::value::VmValue;
16
/// Version written into the `schema_version` field of every report built here.
pub const TOOL_CONFORMANCE_SCHEMA_VERSION: u32 = 1;
/// Name of the single tool the probe advertises and expects the model to call.
pub const TOOL_PROBE_TOOL_NAME: &str = "echo_marker";
/// Marker value used by `ToolConformanceProbeOptions::new` when none is supplied.
pub const DEFAULT_TOOL_PROBE_MARKER: &str = "harn_tool_probe_marker";
20
21#[derive(Debug, Clone)]
22pub struct ToolConformanceProbeOptions {
23    pub provider: String,
24    pub model: String,
25    pub base_url: Option<String>,
26    pub modes: Vec<ToolProbeMode>,
27    pub marker: String,
28    pub timeout_secs: u64,
29}
30
31impl ToolConformanceProbeOptions {
32    pub fn new(provider: impl Into<String>, model: impl Into<String>) -> Self {
33        Self {
34            provider: provider.into(),
35            model: model.into(),
36            base_url: None,
37            modes: vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming],
38            marker: DEFAULT_TOOL_PROBE_MARKER.to_string(),
39            timeout_secs: 120,
40        }
41    }
42}
43
44#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
45#[serde(rename_all = "snake_case")]
46pub enum ToolProbeMode {
47    NonStreaming,
48    Streaming,
49}
50
51impl ToolProbeMode {
52    pub fn as_str(self) -> &'static str {
53        match self {
54            Self::NonStreaming => "non_streaming",
55            Self::Streaming => "streaming",
56        }
57    }
58}
59
60#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
61#[serde(rename_all = "snake_case")]
62pub enum ToolProbeClassification {
63    StructuredNativeToolCall,
64    ParseableHarnTextToolCall,
65    RawModelToolTag,
66    ProseOnlyNonTool,
67    MalformedJsonArguments,
68    EmptySilent,
69    HttpError,
70    TransportError,
71}
72
73#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
74#[serde(rename_all = "snake_case")]
75pub enum ToolProbeStatus {
76    Pass,
77    Fail,
78    Unknown,
79}
80
81impl ToolProbeStatus {
82    pub fn as_str(&self) -> &'static str {
83        match self {
84            Self::Pass => "pass",
85            Self::Fail => "fail",
86            Self::Unknown => "unknown",
87        }
88    }
89}
90
91#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
92#[serde(rename_all = "snake_case")]
93pub enum ToolProbeFallbackMode {
94    Native,
95    Text,
96    Disabled,
97}
98
99impl ToolProbeFallbackMode {
100    pub fn as_str(&self) -> &'static str {
101        match self {
102            Self::Native => "native",
103            Self::Text => "text",
104            Self::Disabled => "disabled",
105        }
106    }
107}
108
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct ToolConformanceReport {
111    pub schema_version: u32,
112    pub provider: String,
113    pub model: String,
114    #[serde(skip_serializing_if = "Option::is_none")]
115    pub base_url: Option<String>,
116    pub tool_name: String,
117    pub marker: String,
118    pub cases: Vec<ToolConformanceCase>,
119    pub tool_calling: ToolCallingConformanceSummary,
120}
121
122#[derive(Debug, Clone, Serialize, Deserialize)]
123pub struct ToolCallingConformanceSummary {
124    pub native: ToolProbeStatus,
125    pub text: ToolProbeStatus,
126    pub streaming_native: ToolProbeStatus,
127    pub fallback_mode: ToolProbeFallbackMode,
128    #[serde(skip_serializing_if = "Option::is_none")]
129    pub failure_reason: Option<String>,
130}
131
132#[derive(Debug, Clone, Serialize, Deserialize)]
133pub struct ToolConformanceCase {
134    pub mode: ToolProbeMode,
135    pub ok: bool,
136    pub classification: ToolProbeClassification,
137    pub fallback_mode: ToolProbeFallbackMode,
138    #[serde(skip_serializing_if = "Option::is_none")]
139    pub failure_reason: Option<String>,
140    #[serde(skip_serializing_if = "Option::is_none")]
141    pub http_status: Option<u16>,
142    #[serde(skip_serializing_if = "Option::is_none")]
143    pub elapsed_ms: Option<u64>,
144    pub native_tool_call_count: usize,
145    pub text_tool_call_count: usize,
146    #[serde(skip_serializing_if = "Vec::is_empty")]
147    pub parser_errors: Vec<String>,
148    #[serde(skip_serializing_if = "Vec::is_empty")]
149    pub protocol_violations: Vec<String>,
150    #[serde(skip_serializing_if = "Option::is_none")]
151    pub content_sample: Option<String>,
152}
153
154impl ToolConformanceCase {
155    fn transport_error(mode: ToolProbeMode, message: String, elapsed_ms: Option<u64>) -> Self {
156        Self {
157            mode,
158            ok: false,
159            classification: ToolProbeClassification::TransportError,
160            fallback_mode: ToolProbeFallbackMode::Disabled,
161            failure_reason: Some(message),
162            http_status: None,
163            elapsed_ms,
164            native_tool_call_count: 0,
165            text_tool_call_count: 0,
166            parser_errors: Vec::new(),
167            protocol_violations: Vec::new(),
168            content_sample: None,
169        }
170    }
171
172    fn http_error(
173        mode: ToolProbeMode,
174        status: u16,
175        message: String,
176        elapsed_ms: Option<u64>,
177    ) -> Self {
178        Self {
179            mode,
180            ok: false,
181            classification: ToolProbeClassification::HttpError,
182            fallback_mode: ToolProbeFallbackMode::Disabled,
183            failure_reason: Some(message),
184            http_status: Some(status),
185            elapsed_ms,
186            native_tool_call_count: 0,
187            text_tool_call_count: 0,
188            parser_errors: Vec::new(),
189            protocol_violations: Vec::new(),
190            content_sample: None,
191        }
192    }
193}
194
195pub async fn run_tool_conformance_probe(
196    options: ToolConformanceProbeOptions,
197) -> ToolConformanceReport {
198    let model = llm_config::resolve_model_info(&options.model);
199    let provider = if options.provider.trim().is_empty() {
200        model.provider.clone()
201    } else {
202        options.provider.clone()
203    };
204    let model_id = model.id;
205    let base_url = options.base_url.clone().or_else(|| {
206        llm_config::provider_config(&provider).map(|def| llm_config::resolve_base_url(&def))
207    });
208    let mut cases = Vec::new();
209    for mode in normalized_modes(&options.modes) {
210        cases.push(
211            execute_live_probe_case(
212                &provider,
213                &model_id,
214                base_url.as_deref(),
215                mode,
216                &options.marker,
217                options.timeout_secs,
218            )
219            .await,
220        );
221    }
222    report_from_cases(provider, model_id, base_url, options.marker, cases)
223}
224
225pub fn classify_tool_conformance_fixture(
226    provider: impl Into<String>,
227    model: impl Into<String>,
228    mode: ToolProbeMode,
229    marker: impl Into<String>,
230    raw: &str,
231) -> ToolConformanceReport {
232    let marker = marker.into();
233    let response = serde_json::from_str::<Value>(raw).unwrap_or_else(|_| json!({ "content": raw }));
234    let case = classify_tool_probe_response(mode, &response, &marker, None, None);
235    report_from_cases(provider.into(), model.into(), None, marker, vec![case])
236}
237
238pub fn report_satisfies_required_probe(report: &ToolConformanceReport, requirement: &str) -> bool {
239    match requirement {
240        "tool_probe" | "tool_call_probe" => {
241            report.tool_calling.fallback_mode != ToolProbeFallbackMode::Disabled
242                && report.cases.iter().any(|case| case.ok)
243        }
244        "native_tool_probe" => report.tool_calling.native == ToolProbeStatus::Pass,
245        "streaming_tool_probe" => report.tool_calling.streaming_native == ToolProbeStatus::Pass,
246        _ => false,
247    }
248}
249
250fn normalized_modes(modes: &[ToolProbeMode]) -> Vec<ToolProbeMode> {
251    if modes.is_empty() {
252        return vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming];
253    }
254    let mut out = Vec::new();
255    for mode in modes {
256        if !out.contains(mode) {
257            out.push(*mode);
258        }
259    }
260    out
261}
262
263fn report_from_cases(
264    provider: String,
265    model: String,
266    base_url: Option<String>,
267    marker: String,
268    cases: Vec<ToolConformanceCase>,
269) -> ToolConformanceReport {
270    let summary = summarize_cases(&cases);
271    ToolConformanceReport {
272        schema_version: TOOL_CONFORMANCE_SCHEMA_VERSION,
273        provider,
274        model,
275        base_url,
276        tool_name: TOOL_PROBE_TOOL_NAME.to_string(),
277        marker,
278        cases,
279        tool_calling: summary,
280    }
281}
282
283fn summarize_cases(cases: &[ToolConformanceCase]) -> ToolCallingConformanceSummary {
284    let mut native = ToolProbeStatus::Unknown;
285    let mut streaming_native = ToolProbeStatus::Unknown;
286    let mut text = ToolProbeStatus::Unknown;
287
288    for case in cases {
289        if case.classification == ToolProbeClassification::StructuredNativeToolCall {
290            if case.mode == ToolProbeMode::Streaming {
291                streaming_native = if case.ok {
292                    ToolProbeStatus::Pass
293                } else {
294                    ToolProbeStatus::Fail
295                };
296            } else {
297                native = if case.ok {
298                    ToolProbeStatus::Pass
299                } else {
300                    ToolProbeStatus::Fail
301                };
302            }
303        } else if case.mode == ToolProbeMode::Streaming
304            && streaming_native == ToolProbeStatus::Unknown
305        {
306            streaming_native = ToolProbeStatus::Fail;
307        } else if case.mode == ToolProbeMode::NonStreaming && native == ToolProbeStatus::Unknown {
308            native = ToolProbeStatus::Fail;
309        }
310
311        if case.classification == ToolProbeClassification::ParseableHarnTextToolCall {
312            text = if case.ok {
313                ToolProbeStatus::Pass
314            } else {
315                ToolProbeStatus::Fail
316            };
317        } else if text == ToolProbeStatus::Unknown && case.text_tool_call_count > 0 {
318            text = ToolProbeStatus::Fail;
319        }
320    }
321
322    let fallback_mode =
323        if native == ToolProbeStatus::Pass || streaming_native == ToolProbeStatus::Pass {
324            ToolProbeFallbackMode::Native
325        } else if text == ToolProbeStatus::Pass {
326            ToolProbeFallbackMode::Text
327        } else {
328            ToolProbeFallbackMode::Disabled
329        };
330
331    let failure_reason = if fallback_mode == ToolProbeFallbackMode::Disabled {
332        cases.iter().find_map(|case| case.failure_reason.clone())
333    } else {
334        None
335    };
336
337    ToolCallingConformanceSummary {
338        native,
339        text,
340        streaming_native,
341        fallback_mode,
342        failure_reason,
343    }
344}
345
346async fn execute_live_probe_case(
347    provider: &str,
348    model: &str,
349    base_url: Option<&str>,
350    mode: ToolProbeMode,
351    marker: &str,
352    timeout_secs: u64,
353) -> ToolConformanceCase {
354    let clock = harn_clock::RealClock::arc();
355    let started_ms = clock.monotonic_ms();
356    let Some(def) = llm_config::provider_config(provider) else {
357        return ToolConformanceCase::transport_error(
358            mode,
359            format!("unknown provider: {provider}"),
360            Some(elapsed_ms(&*clock, started_ms)),
361        );
362    };
363    let base_url = base_url
364        .filter(|value| !value.trim().is_empty())
365        .map(str::to_string)
366        .unwrap_or_else(|| llm_config::resolve_base_url(&def));
367    let url = match chat_url(&def, &base_url) {
368        Ok(url) => url,
369        Err(message) => {
370            return ToolConformanceCase::transport_error(
371                mode,
372                message,
373                Some(elapsed_ms(&*clock, started_ms)),
374            );
375        }
376    };
377    let body = probe_request_body(provider, model, mode, marker);
378    let client = if mode == ToolProbeMode::Streaming {
379        crate::llm::shared_streaming_client().clone()
380    } else {
381        crate::llm::shared_blocking_client().clone()
382    };
383    let api_key = crate::llm::helpers::resolve_api_key(provider).unwrap_or_default();
384    let request = client
385        .post(&url)
386        .header("Content-Type", "application/json")
387        .timeout(std::time::Duration::from_secs(timeout_secs))
388        .json(&body);
389    let mut request = crate::llm::api::apply_auth_headers(request, &api_key, Some(&def));
390    for (name, value) in &def.extra_headers {
391        request = request.header(name.as_str(), value.as_str());
392    }
393
394    let response = match request.send().await {
395        Ok(response) => response,
396        Err(error) => {
397            return ToolConformanceCase::transport_error(
398                mode,
399                format!("provider request failed: {error}"),
400                Some(elapsed_ms(&*clock, started_ms)),
401            );
402        }
403    };
404    let status = response.status();
405    let text = match response.text().await {
406        Ok(text) => text,
407        Err(error) => {
408            return ToolConformanceCase::transport_error(
409                mode,
410                format!("provider response was unreadable: {error}"),
411                Some(elapsed_ms(&*clock, started_ms)),
412            );
413        }
414    };
415    let elapsed = Some(elapsed_ms(&*clock, started_ms));
416    if !status.is_success() {
417        return ToolConformanceCase::http_error(
418            mode,
419            status.as_u16(),
420            sample_failure(&text, "provider returned non-success HTTP status"),
421            elapsed,
422        );
423    }
424    let response_value = if mode == ToolProbeMode::Streaming {
425        aggregate_stream_text(&text, provider)
426    } else {
427        serde_json::from_str::<Value>(&text).unwrap_or_else(|_| json!({ "content": text }))
428    };
429    classify_tool_probe_response(
430        mode,
431        &response_value,
432        marker,
433        Some(status.as_u16()),
434        elapsed,
435    )
436}
437
438fn classify_tool_probe_response(
439    mode: ToolProbeMode,
440    response: &Value,
441    marker: &str,
442    http_status: Option<u16>,
443    elapsed_ms: Option<u64>,
444) -> ToolConformanceCase {
445    let native = extract_native_tool_calls(response);
446    let native_count = native.len();
447    let mut malformed_native = false;
448    for call in &native {
449        if call.name == TOOL_PROBE_TOOL_NAME {
450            match &call.arguments {
451                Some(Value::Object(map))
452                    if map.get("value").and_then(Value::as_str) == Some(marker) =>
453                {
454                    return ToolConformanceCase {
455                        mode,
456                        ok: true,
457                        classification: ToolProbeClassification::StructuredNativeToolCall,
458                        fallback_mode: ToolProbeFallbackMode::Native,
459                        failure_reason: None,
460                        http_status,
461                        elapsed_ms,
462                        native_tool_call_count: native_count,
463                        text_tool_call_count: 0,
464                        parser_errors: Vec::new(),
465                        protocol_violations: Vec::new(),
466                        content_sample: content_sample(response),
467                    };
468                }
469                Some(Value::Object(_)) => {}
470                _ => malformed_native = true,
471            }
472        }
473    }
474
475    let content = extract_content(response);
476    let tools = probe_tool_registry();
477    let parsed = crate::llm::tools::parse_text_tool_calls_with_tools(&content, Some(&tools));
478    let text_count = parsed.calls.len();
479    let text_pass = parsed.calls.iter().any(|call| {
480        call.get("name").and_then(Value::as_str) == Some(TOOL_PROBE_TOOL_NAME)
481            && call
482                .get("arguments")
483                .and_then(|args| args.get("value"))
484                .and_then(Value::as_str)
485                == Some(marker)
486    });
487    if text_pass {
488        return ToolConformanceCase {
489            mode,
490            ok: true,
491            classification: ToolProbeClassification::ParseableHarnTextToolCall,
492            fallback_mode: ToolProbeFallbackMode::Text,
493            failure_reason: None,
494            http_status,
495            elapsed_ms,
496            native_tool_call_count: native_count,
497            text_tool_call_count: text_count,
498            parser_errors: parsed.errors,
499            protocol_violations: parsed.violations,
500            content_sample: sample_content(&content),
501        };
502    }
503
504    let (classification, failure_reason) = if malformed_native || !parsed.errors.is_empty() {
505        (
506            ToolProbeClassification::MalformedJsonArguments,
507            Some(first_non_empty(
508                parsed.errors.first().cloned(),
509                "malformed_tool_arguments",
510            )),
511        )
512    } else if content.trim().is_empty() && native_count == 0 {
513        (
514            ToolProbeClassification::EmptySilent,
515            Some("empty_silent_response".to_string()),
516        )
517    } else if has_raw_model_tool_tag(&content) {
518        (
519            ToolProbeClassification::RawModelToolTag,
520            Some("raw_tool_tag_no_structured_calls".to_string()),
521        )
522    } else {
523        (
524            ToolProbeClassification::ProseOnlyNonTool,
525            Some("no_executable_tool_call".to_string()),
526        )
527    };
528
529    ToolConformanceCase {
530        mode,
531        ok: false,
532        classification,
533        fallback_mode: ToolProbeFallbackMode::Disabled,
534        failure_reason,
535        http_status,
536        elapsed_ms,
537        native_tool_call_count: native_count,
538        text_tool_call_count: text_count,
539        parser_errors: parsed.errors,
540        protocol_violations: parsed.violations,
541        content_sample: sample_content(&content),
542    }
543}
544
545fn chat_url(def: &ProviderDef, base_url: &str) -> Result<String, String> {
546    let endpoint = if def.chat_endpoint.trim().is_empty() {
547        "/v1/chat/completions"
548    } else {
549        def.chat_endpoint.as_str()
550    };
551    let url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
552        endpoint.to_string()
553    } else if endpoint.starts_with('/') {
554        format!("{}{}", base_url.trim_end_matches('/'), endpoint)
555    } else {
556        format!("{}/{}", base_url.trim_end_matches('/'), endpoint)
557    };
558    reqwest::Url::parse(&url)
559        .map(|_| url.clone())
560        .map_err(|error| format!("invalid provider chat URL '{url}': {error}"))
561}
562
563fn probe_request_body(provider: &str, model: &str, mode: ToolProbeMode, marker: &str) -> Value {
564    let prompt = format!(
565        "Call the {TOOL_PROBE_TOOL_NAME} tool exactly once with value {marker:?}. Do not answer in prose."
566    );
567    let tool = json!({
568        "type": "function",
569        "function": {
570            "name": TOOL_PROBE_TOOL_NAME,
571            "description": "Echo the probe marker exactly.",
572            "parameters": {
573                "type": "object",
574                "properties": {
575                    "value": {
576                        "type": "string",
577                        "description": "The marker value to echo."
578                    }
579                },
580                "required": ["value"],
581                "additionalProperties": false
582            }
583        }
584    });
585    let mut body = json!({
586        "model": model,
587        "messages": [{"role": "user", "content": prompt}],
588        "tools": [tool],
589        "stream": mode == ToolProbeMode::Streaming,
590        "temperature": 0,
591    });
592    if !crate::llm::provider::provider_uses_ollama_messages(provider, model) {
593        body["tool_choice"] = json!({
594            "type": "function",
595            "function": {"name": TOOL_PROBE_TOOL_NAME}
596        });
597    }
598    body
599}
600
601#[derive(Debug)]
602struct NativeToolCall {
603    name: String,
604    arguments: Option<Value>,
605}
606
607fn extract_native_tool_calls(response: &Value) -> Vec<NativeToolCall> {
608    let mut calls = Vec::new();
609    visit_native_tool_call_arrays(response, &mut calls);
610    calls
611}
612
613fn visit_native_tool_call_arrays(value: &Value, calls: &mut Vec<NativeToolCall>) {
614    match value {
615        Value::Object(map) => {
616            if let Some(tool_calls) = map.get("tool_calls").and_then(Value::as_array) {
617                for item in tool_calls {
618                    if let Some(call) = parse_native_tool_call(item) {
619                        calls.push(call);
620                    }
621                }
622            }
623            for child in map.values() {
624                visit_native_tool_call_arrays(child, calls);
625            }
626        }
627        Value::Array(items) => {
628            for item in items {
629                visit_native_tool_call_arrays(item, calls);
630            }
631        }
632        _ => {}
633    }
634}
635
636fn parse_native_tool_call(item: &Value) -> Option<NativeToolCall> {
637    let obj = item.as_object()?;
638    let function = obj.get("function").and_then(Value::as_object);
639    let name = function
640        .and_then(|function| function.get("name"))
641        .or_else(|| obj.get("name"))
642        .and_then(Value::as_str)?
643        .to_string();
644    let raw_args = function
645        .and_then(|function| function.get("arguments"))
646        .or_else(|| obj.get("arguments"));
647    let arguments = match raw_args {
648        Some(Value::String(raw)) => serde_json::from_str::<Value>(raw).ok(),
649        Some(value @ Value::Object(_)) => Some(value.clone()),
650        Some(_) => None,
651        None => Some(json!({})),
652    };
653    Some(NativeToolCall { name, arguments })
654}
655
656fn extract_content(response: &Value) -> String {
657    let mut parts = Vec::new();
658    visit_content(response, &mut parts);
659    parts
660        .into_iter()
661        .filter(|part| !part.trim().is_empty())
662        .collect::<Vec<_>>()
663        .join("\n")
664}
665
666fn visit_content(value: &Value, parts: &mut Vec<String>) {
667    match value {
668        Value::Object(map) => {
669            for key in ["content", "response", "text"] {
670                if let Some(text) = map.get(key).and_then(Value::as_str) {
671                    parts.push(text.to_string());
672                }
673            }
674            for child in map.values() {
675                visit_content(child, parts);
676            }
677        }
678        Value::Array(items) => {
679            for item in items {
680                visit_content(item, parts);
681            }
682        }
683        _ => {}
684    }
685}
686
687fn aggregate_stream_text(text: &str, _provider: &str) -> Value {
688    let mut content = String::new();
689    let mut calls: BTreeMap<String, PartialStreamCall> = BTreeMap::new();
690    let mut frames = Vec::new();
691    for raw_line in text.lines() {
692        let line = raw_line.trim();
693        if line.is_empty() {
694            continue;
695        }
696        let payload = line.strip_prefix("data:").map(str::trim).unwrap_or(line);
697        if payload == "[DONE]" {
698            continue;
699        }
700        let Ok(frame) = serde_json::from_str::<Value>(payload) else {
701            continue;
702        };
703        collect_stream_content_and_calls(&frame, &mut content, &mut calls);
704        frames.push(frame);
705    }
706    let tool_calls: Vec<Value> = calls
707        .into_values()
708        .map(|call| {
709            json!({
710                "id": call.id.unwrap_or_else(|| "stream_tool".to_string()),
711                "type": "function",
712                "function": {
713                    "name": call.name.unwrap_or_default(),
714                    "arguments": call.arguments,
715                }
716            })
717        })
718        .collect();
719    json!({
720        "content": content,
721        "tool_calls": tool_calls,
722        "frames": frames,
723    })
724}
725
/// A tool call being assembled from streamed deltas.
#[derive(Debug, Default)]
struct PartialStreamCall {
    /// Call id, once a frame has supplied one.
    id: Option<String>,
    /// Tool name, once a frame has supplied one.
    name: Option<String>,
    /// Argument text accumulated so far, as a JSON string.
    arguments: String,
}
732
733fn collect_stream_content_and_calls(
734    frame: &Value,
735    content: &mut String,
736    calls: &mut BTreeMap<String, PartialStreamCall>,
737) {
738    if let Some(text) = frame
739        .pointer("/message/content")
740        .or_else(|| frame.pointer("/choices/0/delta/content"))
741        .or_else(|| frame.pointer("/choices/0/message/content"))
742        .or_else(|| frame.get("response"))
743        .and_then(Value::as_str)
744    {
745        content.push_str(text);
746    }
747    for item in frame
748        .pointer("/message/tool_calls")
749        .or_else(|| frame.pointer("/choices/0/delta/tool_calls"))
750        .or_else(|| frame.pointer("/choices/0/message/tool_calls"))
751        .and_then(Value::as_array)
752        .into_iter()
753        .flatten()
754    {
755        let key = item
756            .get("index")
757            .and_then(Value::as_u64)
758            .map(|index| index.to_string())
759            .or_else(|| item.get("id").and_then(Value::as_str).map(str::to_string))
760            .unwrap_or_else(|| calls.len().to_string());
761        let slot = calls.entry(key).or_default();
762        if let Some(id) = item.get("id").and_then(Value::as_str) {
763            slot.id = Some(id.to_string());
764        }
765        if let Some(name) = item
766            .pointer("/function/name")
767            .or_else(|| item.get("name"))
768            .and_then(Value::as_str)
769        {
770            slot.name = Some(name.to_string());
771        }
772        if let Some(arguments) = item
773            .pointer("/function/arguments")
774            .or_else(|| item.get("arguments"))
775        {
776            match arguments {
777                Value::String(delta) => slot.arguments.push_str(delta),
778                Value::Object(_) => slot.arguments = arguments.to_string(),
779                _ => {}
780            }
781        }
782    }
783}
784
785fn probe_tool_registry() -> VmValue {
786    let mut value_param = BTreeMap::new();
787    value_param.insert("type".to_string(), vm_str("string"));
788    value_param.insert(
789        "description".to_string(),
790        vm_str("The marker value to echo."),
791    );
792    let mut params = BTreeMap::new();
793    params.insert("value".to_string(), VmValue::Dict(Rc::new(value_param)));
794    let tool = vm_dict(&[
795        ("name", vm_str(TOOL_PROBE_TOOL_NAME)),
796        ("description", vm_str("Echo the probe marker exactly.")),
797        ("parameters", VmValue::Dict(Rc::new(params))),
798    ]);
799    vm_dict(&[("tools", VmValue::List(Rc::new(vec![tool])))])
800}
801
802fn vm_str(value: &str) -> VmValue {
803    VmValue::String(Rc::from(value))
804}
805
806fn vm_dict(pairs: &[(&str, VmValue)]) -> VmValue {
807    let mut map = BTreeMap::new();
808    for (key, value) in pairs {
809        map.insert((*key).to_string(), value.clone());
810    }
811    VmValue::Dict(Rc::new(map))
812}
813
/// Reports whether `content` contains any raw tool-call tag or prefix, matched
/// ASCII case-insensitively.
fn has_raw_model_tool_tag(content: &str) -> bool {
    const RAW_TAG_NEEDLES: [&str; 6] = [
        "<tool_call",
        "<toolcall",
        "tool_code:",
        "tool_call:",
        "call:",
        "<function",
    ];
    let lowered = content.to_ascii_lowercase();
    RAW_TAG_NEEDLES
        .iter()
        .any(|needle| lowered.contains(*needle))
}
823
824fn content_sample(response: &Value) -> Option<String> {
825    sample_content(&extract_content(response))
826}
827
/// Returns the first 240 characters of the trimmed content, or `None` when the
/// content is blank.
fn sample_content(content: &str) -> Option<String> {
    let trimmed = content.trim();
    (!trimmed.is_empty()).then(|| trimmed.chars().take(240).collect())
}
836
/// Formats a failure message: `fallback` alone when `text` is blank, otherwise
/// `fallback` followed by the first 240 characters of the trimmed text.
fn sample_failure(text: &str, fallback: &str) -> String {
    match text.trim() {
        "" => fallback.to_string(),
        body => {
            let excerpt: String = body.chars().take(240).collect();
            format!("{fallback}: {excerpt}")
        }
    }
}
848
/// Returns `value` when it holds non-blank text, otherwise `fallback`.
fn first_non_empty(value: Option<String>, fallback: &str) -> String {
    match value {
        Some(text) if !text.trim().is_empty() => text,
        _ => fallback.to_string(),
    }
}
854
855fn elapsed_ms(clock: &dyn harn_clock::Clock, started_ms: i64) -> u64 {
856    clock.monotonic_ms().saturating_sub(started_ms).max(0) as u64
857}
858
#[cfg(test)]
mod tests {
    use super::*;

    /// Classifies `raw` as a non-streaming fixture with the default marker.
    fn non_streaming_report(provider: &str, model: &str, raw: &str) -> ToolConformanceReport {
        classify_tool_conformance_fixture(
            provider,
            model,
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            raw,
        )
    }

    #[test]
    fn classify_openai_native_tool_call_as_pass() {
        let report = non_streaming_report(
            "local",
            "model",
            r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker","arguments":"{\"value\":\"harn_tool_probe_marker\"}"}}]}}]}"#,
        );
        let summary = &report.tool_calling;
        assert_eq!(summary.native, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Native);
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    #[test]
    fn classify_gemma_raw_json_tool_call_content_as_text_fallback() {
        let report = non_streaming_report(
            "ollama",
            "gemma4:26b",
            r#"{"message":{"content":"<tool_call>{\"name\":\"echo_marker\",\"arguments\":{\"value\":\"harn_tool_probe_marker\"}}</tool_call>"}}"#,
        );
        let summary = &report.tool_calling;
        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Text);
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::ParseableHarnTextToolCall
        );
    }

    #[test]
    fn classify_qwen_call_colon_marker_as_text_fallback() {
        let report = non_streaming_report(
            "llamacpp",
            "qwen",
            r#"{"content":"call:echo_marker{ value: \"harn_tool_probe_marker\" }"}"#,
        );
        let summary = &report.tool_calling;
        assert_eq!(summary.text, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Text);
    }

    #[test]
    fn classify_prose_only_as_disabled() {
        let report = non_streaming_report(
            "ollama",
            "gemma4:26b",
            r#"{"message":{"content":"The comment has been added. I will now verify it."}}"#,
        );
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Disabled
        );
        let case = &report.cases[0];
        assert_eq!(
            case.classification,
            ToolProbeClassification::ProseOnlyNonTool
        );
        assert_eq!(
            case.failure_reason.as_deref(),
            Some("no_executable_tool_call")
        );
    }

    #[test]
    fn aggregates_openai_streaming_tool_call_deltas() {
        // Two deltas for call index 0 whose argument fragments only form valid
        // JSON once concatenated.
        let raw = "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"call_1\",\"function\":{\"name\":\"echo_marker\",\"arguments\":\"{\\\"value\\\":\"}}]}}]}\n\
                   data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"function\":{\"arguments\":\"\\\"harn_tool_probe_marker\\\"}\"}}]}}]}\n\
                   data: [DONE]\n";
        let response = aggregate_stream_text(raw, "local");
        let case = classify_tool_probe_response(
            ToolProbeMode::Streaming,
            &response,
            DEFAULT_TOOL_PROBE_MARKER,
            None,
            None,
        );
        assert!(case.ok, "{case:?}");
        assert_eq!(
            case.classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    #[test]
    fn report_satisfies_tool_probe_when_text_fallback_passes() {
        let report = non_streaming_report(
            "llamacpp",
            "qwen",
            r#"{"content":"echo_marker({ value: \"harn_tool_probe_marker\" })"}"#,
        );
        let satisfies = |requirement: &str| report_satisfies_required_probe(&report, requirement);
        assert!(satisfies("tool_probe"));
        assert!(!satisfies("native_tool_probe"));
    }
}