Skip to main content

harn_vm/llm/
tool_conformance.rs

1//! One-tool provider conformance probe for local/runtime tool calling.
2//!
3//! The probe is deliberately tiny: define one harmless `echo_marker` tool,
4//! ask the model to call it with a fixed marker, and classify what came back.
5//! The classification is the stable contract eval harnesses consume; the live
6//! HTTP runner is a convenience around that classifier.
7
8use std::collections::BTreeMap;
9
10use serde::{Deserialize, Serialize};
11use serde_json::{json, Value};
12
13use crate::llm_config::{self, ProviderDef};
14use crate::value::VmValue;
15
/// Schema version stamped into every [`ToolConformanceReport`].
pub const TOOL_CONFORMANCE_SCHEMA_VERSION: u32 = 1;
/// Name of the single tool the probe declares and asks the model to call.
pub const TOOL_PROBE_TOOL_NAME: &str = "echo_marker";
/// Marker value used by [`ToolConformanceProbeOptions::new`] when the caller
/// does not override it.
pub const DEFAULT_TOOL_PROBE_MARKER: &str = "harn_tool_probe_marker";
19
20#[derive(Debug, Clone)]
21pub struct ToolConformanceProbeOptions {
22    pub provider: String,
23    pub model: String,
24    pub base_url: Option<String>,
25    pub modes: Vec<ToolProbeMode>,
26    pub marker: String,
27    pub repeat: usize,
28    pub timeout_secs: u64,
29}
30
31impl ToolConformanceProbeOptions {
32    pub fn new(provider: impl Into<String>, model: impl Into<String>) -> Self {
33        Self {
34            provider: provider.into(),
35            model: model.into(),
36            base_url: None,
37            modes: vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming],
38            marker: DEFAULT_TOOL_PROBE_MARKER.to_string(),
39            repeat: 1,
40            timeout_secs: 120,
41        }
42    }
43}
44
45#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
46#[serde(rename_all = "snake_case")]
47pub enum ToolProbeMode {
48    NonStreaming,
49    Streaming,
50}
51
52impl ToolProbeMode {
53    pub fn as_str(self) -> &'static str {
54        match self {
55            Self::NonStreaming => "non_streaming",
56            Self::Streaming => "streaming",
57        }
58    }
59}
60
/// Outcome category assigned to a single probe response.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ToolProbeClassification {
    /// A native tool call to the probe tool whose arguments object carries the marker.
    StructuredNativeToolCall,
    /// The text content parsed to the probe call carrying the marker.
    ParseableHarnTextToolCall,
    /// The content contains a raw model tool tag but no executable call.
    RawModelToolTag,
    /// Non-empty response with no recognizable tool call of any kind.
    ProseOnlyNonTool,
    /// A native probe call had non-object arguments, or the text parser reported errors.
    MalformedJsonArguments,
    /// No content and no native tool calls.
    EmptySilent,
    /// The provider answered with a non-success HTTP status.
    HttpError,
    /// The request could not be built or sent, or the response body was unreadable.
    TransportError,
}
73
74#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
75#[serde(rename_all = "snake_case")]
76pub enum ToolProbeStatus {
77    Pass,
78    Fail,
79    Unknown,
80}
81
82impl ToolProbeStatus {
83    pub fn as_str(&self) -> &'static str {
84        match self {
85            Self::Pass => "pass",
86            Self::Fail => "fail",
87            Self::Unknown => "unknown",
88        }
89    }
90}
91
92#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
93#[serde(rename_all = "snake_case")]
94pub enum ToolProbeFallbackMode {
95    Native,
96    Text,
97    Disabled,
98}
99
100impl ToolProbeFallbackMode {
101    pub fn as_str(&self) -> &'static str {
102        match self {
103            Self::Native => "native",
104            Self::Text => "text",
105            Self::Disabled => "disabled",
106        }
107    }
108}
109
/// Full result of a probe run: the inputs that identify it, every executed
/// case, and the summary derived from those cases.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolConformanceReport {
    /// Value of `TOOL_CONFORMANCE_SCHEMA_VERSION` at the time of the run.
    pub schema_version: u32,
    /// Provider the probe was run against.
    pub provider: String,
    /// Model identifier the probe was run with.
    pub model: String,
    /// Base URL associated with the run; omitted from serialized output when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub base_url: Option<String>,
    /// Name of the probe tool.
    pub tool_name: String,
    /// Marker the model was asked to echo.
    pub marker: String,
    /// One entry per executed (or classified) probe case.
    pub cases: Vec<ToolConformanceCase>,
    /// Summary computed from `cases`.
    pub tool_calling: ToolCallingConformanceSummary,
}
122
/// Per-channel verdicts derived from all cases of a report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolCallingConformanceSummary {
    /// Verdict for native tool calls in non-streaming cases.
    pub native: ToolProbeStatus,
    /// Verdict for text-channel tool calls across both modes.
    pub text: ToolProbeStatus,
    /// Verdict for native tool calls in streaming cases.
    pub streaming_native: ToolProbeStatus,
    /// Channel to use: native if either native verdict passed, else text if
    /// the text verdict passed, else disabled.
    pub fallback_mode: ToolProbeFallbackMode,
    /// First case failure reason, reported only when `fallback_mode` is disabled.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub failure_reason: Option<String>,
}
132
133#[derive(Debug, Clone, Serialize, Deserialize)]
134pub struct ToolConformanceCase {
135    pub mode: ToolProbeMode,
136    pub ok: bool,
137    pub classification: ToolProbeClassification,
138    pub fallback_mode: ToolProbeFallbackMode,
139    #[serde(skip_serializing_if = "Option::is_none")]
140    pub failure_reason: Option<String>,
141    #[serde(skip_serializing_if = "Option::is_none")]
142    pub http_status: Option<u16>,
143    #[serde(skip_serializing_if = "Option::is_none")]
144    pub elapsed_ms: Option<u64>,
145    pub native_tool_call_count: usize,
146    pub text_tool_call_count: usize,
147    #[serde(skip_serializing_if = "Vec::is_empty")]
148    pub parser_errors: Vec<String>,
149    #[serde(skip_serializing_if = "Vec::is_empty")]
150    pub protocol_violations: Vec<String>,
151    #[serde(skip_serializing_if = "Option::is_none")]
152    pub content_sample: Option<String>,
153}
154
155impl ToolConformanceCase {
156    fn transport_error(mode: ToolProbeMode, message: String, elapsed_ms: Option<u64>) -> Self {
157        Self {
158            mode,
159            ok: false,
160            classification: ToolProbeClassification::TransportError,
161            fallback_mode: ToolProbeFallbackMode::Disabled,
162            failure_reason: Some(message),
163            http_status: None,
164            elapsed_ms,
165            native_tool_call_count: 0,
166            text_tool_call_count: 0,
167            parser_errors: Vec::new(),
168            protocol_violations: Vec::new(),
169            content_sample: None,
170        }
171    }
172
173    fn http_error(
174        mode: ToolProbeMode,
175        status: u16,
176        message: String,
177        elapsed_ms: Option<u64>,
178    ) -> Self {
179        Self {
180            mode,
181            ok: false,
182            classification: ToolProbeClassification::HttpError,
183            fallback_mode: ToolProbeFallbackMode::Disabled,
184            failure_reason: Some(message),
185            http_status: Some(status),
186            elapsed_ms,
187            native_tool_call_count: 0,
188            text_tool_call_count: 0,
189            parser_errors: Vec::new(),
190            protocol_violations: Vec::new(),
191            content_sample: None,
192        }
193    }
194}
195
196pub async fn run_tool_conformance_probe(
197    options: ToolConformanceProbeOptions,
198) -> ToolConformanceReport {
199    let model = llm_config::resolve_model_info(&options.model);
200    let provider = if options.provider.trim().is_empty() {
201        model.provider.clone()
202    } else {
203        options.provider.clone()
204    };
205    let model_id = resolved_probe_model_id(&model.id);
206    let base_url = options.base_url.clone().or_else(|| {
207        llm_config::provider_config(&provider).map(|def| llm_config::resolve_base_url(&def))
208    });
209    let mut cases = Vec::new();
210    let modes = normalized_modes(&options.modes);
211    for _ in 0..options.repeat.max(1) {
212        for mode in &modes {
213            cases.push(
214                execute_live_probe_case(
215                    &provider,
216                    &model_id,
217                    base_url.as_deref(),
218                    *mode,
219                    &options.marker,
220                    options.timeout_secs,
221                )
222                .await,
223            );
224        }
225    }
226    report_from_cases(provider, model_id, base_url, options.marker, cases)
227}
228
229fn resolved_probe_model_id(selector: &str) -> String {
230    llm_config::wire_model_id(selector)
231}
232
233pub fn classify_tool_conformance_fixture(
234    provider: impl Into<String>,
235    model: impl Into<String>,
236    mode: ToolProbeMode,
237    marker: impl Into<String>,
238    raw: &str,
239) -> ToolConformanceReport {
240    let marker = marker.into();
241    let response = serde_json::from_str::<Value>(raw).unwrap_or_else(|_| json!({ "content": raw }));
242    let case = classify_tool_probe_response(mode, &response, &marker, None, None);
243    report_from_cases(provider.into(), model.into(), None, marker, vec![case])
244}
245
246pub fn report_satisfies_required_probe(report: &ToolConformanceReport, requirement: &str) -> bool {
247    match requirement {
248        "tool_probe" | "tool_call_probe" => {
249            report.tool_calling.fallback_mode != ToolProbeFallbackMode::Disabled
250                && report.cases.iter().any(|case| case.ok)
251        }
252        "native_tool_probe" => report.tool_calling.native == ToolProbeStatus::Pass,
253        "streaming_tool_probe" => report.tool_calling.streaming_native == ToolProbeStatus::Pass,
254        _ => false,
255    }
256}
257
258fn normalized_modes(modes: &[ToolProbeMode]) -> Vec<ToolProbeMode> {
259    if modes.is_empty() {
260        return vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming];
261    }
262    let mut out = Vec::new();
263    for mode in modes {
264        if !out.contains(mode) {
265            out.push(*mode);
266        }
267    }
268    out
269}
270
271fn report_from_cases(
272    provider: String,
273    model: String,
274    base_url: Option<String>,
275    marker: String,
276    cases: Vec<ToolConformanceCase>,
277) -> ToolConformanceReport {
278    let summary = summarize_cases(&cases);
279    ToolConformanceReport {
280        schema_version: TOOL_CONFORMANCE_SCHEMA_VERSION,
281        provider,
282        model,
283        base_url,
284        tool_name: TOOL_PROBE_TOOL_NAME.to_string(),
285        marker,
286        cases,
287        tool_calling: summary,
288    }
289}
290
291fn summarize_cases(cases: &[ToolConformanceCase]) -> ToolCallingConformanceSummary {
292    let native = summarize_native_mode(cases, ToolProbeMode::NonStreaming);
293    let streaming_native = summarize_native_mode(cases, ToolProbeMode::Streaming);
294    let text = summarize_text_mode(cases);
295
296    let fallback_mode =
297        if native == ToolProbeStatus::Pass || streaming_native == ToolProbeStatus::Pass {
298            ToolProbeFallbackMode::Native
299        } else if text == ToolProbeStatus::Pass {
300            ToolProbeFallbackMode::Text
301        } else {
302            ToolProbeFallbackMode::Disabled
303        };
304
305    let failure_reason = if fallback_mode == ToolProbeFallbackMode::Disabled {
306        cases.iter().find_map(|case| case.failure_reason.clone())
307    } else {
308        None
309    };
310
311    ToolCallingConformanceSummary {
312        native,
313        text,
314        streaming_native,
315        fallback_mode,
316        failure_reason,
317    }
318}
319
320fn summarize_native_mode(cases: &[ToolConformanceCase], mode: ToolProbeMode) -> ToolProbeStatus {
321    let mut saw_mode = false;
322    let mut all_passed = true;
323    for case in cases.iter().filter(|case| case.mode == mode) {
324        saw_mode = true;
325        if !(case.ok && case.classification == ToolProbeClassification::StructuredNativeToolCall) {
326            all_passed = false;
327        }
328    }
329    match (saw_mode, all_passed) {
330        (false, _) => ToolProbeStatus::Unknown,
331        (true, true) => ToolProbeStatus::Pass,
332        (true, false) => ToolProbeStatus::Fail,
333    }
334}
335
336fn summarize_text_mode(cases: &[ToolConformanceCase]) -> ToolProbeStatus {
337    let mut saw_text = false;
338    let mut saw_passing_mode = false;
339    for mode in [ToolProbeMode::NonStreaming, ToolProbeMode::Streaming] {
340        let mut saw_mode = false;
341        let mut saw_text_in_mode = false;
342        let mut all_mode_cases_passed = true;
343        for case in cases.iter().filter(|case| case.mode == mode) {
344            saw_mode = true;
345            saw_text_in_mode |= case.classification
346                == ToolProbeClassification::ParseableHarnTextToolCall
347                || case.text_tool_call_count > 0;
348            if !(case.ok
349                && case.classification == ToolProbeClassification::ParseableHarnTextToolCall)
350            {
351                all_mode_cases_passed = false;
352            }
353        }
354        saw_text |= saw_text_in_mode;
355        if saw_mode && saw_text_in_mode && all_mode_cases_passed {
356            saw_passing_mode = true;
357        }
358    }
359    if !saw_text {
360        return ToolProbeStatus::Unknown;
361    }
362    if saw_passing_mode {
363        ToolProbeStatus::Pass
364    } else {
365        ToolProbeStatus::Fail
366    }
367}
368
369async fn execute_live_probe_case(
370    provider: &str,
371    model: &str,
372    base_url: Option<&str>,
373    mode: ToolProbeMode,
374    marker: &str,
375    timeout_secs: u64,
376) -> ToolConformanceCase {
377    let clock = harn_clock::RealClock::arc();
378    let started_ms = clock.monotonic_ms();
379    let Some(def) = llm_config::provider_config(provider) else {
380        return ToolConformanceCase::transport_error(
381            mode,
382            format!("unknown provider: {provider}"),
383            Some(elapsed_ms(&*clock, started_ms)),
384        );
385    };
386    let base_url = base_url
387        .filter(|value| !value.trim().is_empty())
388        .map(str::to_string)
389        .unwrap_or_else(|| llm_config::resolve_base_url(&def));
390    let url = match chat_url(&def, &base_url) {
391        Ok(url) => url,
392        Err(message) => {
393            return ToolConformanceCase::transport_error(
394                mode,
395                message,
396                Some(elapsed_ms(&*clock, started_ms)),
397            );
398        }
399    };
400    let body = probe_request_body(provider, model, mode, marker);
401    let client = if mode == ToolProbeMode::Streaming {
402        crate::llm::shared_streaming_client().clone()
403    } else {
404        crate::llm::shared_blocking_client().clone()
405    };
406    let api_key = crate::llm::helpers::resolve_api_key(provider).unwrap_or_default();
407    let request = client
408        .post(&url)
409        .header("Content-Type", "application/json")
410        .timeout(std::time::Duration::from_secs(timeout_secs))
411        .json(&body);
412    let mut request = crate::llm::api::apply_auth_headers(request, &api_key, Some(&def));
413    for (name, value) in &def.extra_headers {
414        request = request.header(name.as_str(), value.as_str());
415    }
416
417    let response = match request.send().await {
418        Ok(response) => response,
419        Err(error) => {
420            return ToolConformanceCase::transport_error(
421                mode,
422                format!("provider request failed: {error}"),
423                Some(elapsed_ms(&*clock, started_ms)),
424            );
425        }
426    };
427    let status = response.status();
428    let text = match response.text().await {
429        Ok(text) => text,
430        Err(error) => {
431            return ToolConformanceCase::transport_error(
432                mode,
433                format!("provider response was unreadable: {error}"),
434                Some(elapsed_ms(&*clock, started_ms)),
435            );
436        }
437    };
438    let elapsed = Some(elapsed_ms(&*clock, started_ms));
439    if !status.is_success() {
440        return ToolConformanceCase::http_error(
441            mode,
442            status.as_u16(),
443            sample_failure(&text, "provider returned non-success HTTP status"),
444            elapsed,
445        );
446    }
447    let response_value = if mode == ToolProbeMode::Streaming {
448        aggregate_stream_text(&text, provider)
449    } else {
450        serde_json::from_str::<Value>(&text).unwrap_or_else(|_| json!({ "content": text }))
451    };
452    classify_tool_probe_response(
453        mode,
454        &response_value,
455        marker,
456        Some(status.as_u16()),
457        elapsed,
458    )
459}
460
461/// True when `calls` contains the probe's echo_marker call (the
462/// `TOOL_PROBE_TOOL_NAME` tool with `args.value == marker`). Shared by the
463/// tagged and fenced-JSON text-channel parse attempts.
464fn probe_marker_present(calls: &[Value], marker: &str) -> bool {
465    calls.iter().any(|call| {
466        call.get("name").and_then(Value::as_str) == Some(TOOL_PROBE_TOOL_NAME)
467            && call
468                .get("arguments")
469                .and_then(|args| args.get("value"))
470                .and_then(Value::as_str)
471                == Some(marker)
472    })
473}
474
475fn classify_tool_probe_response(
476    mode: ToolProbeMode,
477    response: &Value,
478    marker: &str,
479    http_status: Option<u16>,
480    elapsed_ms: Option<u64>,
481) -> ToolConformanceCase {
482    let native = extract_native_tool_calls(response);
483    let native_count = native.len();
484    let mut malformed_native = false;
485    for call in &native {
486        if call.name == TOOL_PROBE_TOOL_NAME {
487            match &call.arguments {
488                Some(Value::Object(map))
489                    if map.get("value").and_then(Value::as_str) == Some(marker) =>
490                {
491                    return ToolConformanceCase {
492                        mode,
493                        ok: true,
494                        classification: ToolProbeClassification::StructuredNativeToolCall,
495                        fallback_mode: ToolProbeFallbackMode::Native,
496                        failure_reason: None,
497                        http_status,
498                        elapsed_ms,
499                        native_tool_call_count: native_count,
500                        text_tool_call_count: 0,
501                        parser_errors: Vec::new(),
502                        protocol_violations: Vec::new(),
503                        content_sample: content_sample(response),
504                    };
505                }
506                Some(Value::Object(_)) => {}
507                _ => malformed_native = true,
508            }
509        }
510    }
511
512    let content = extract_content(response);
513    let tools = probe_tool_registry();
514    // Try the canonical tagged/heredoc grammar first; if it does not yield the
515    // echo_marker call, also try the fenced-JSON grammar. A fenced-JSON
516    // emission that parses to the probe call still classifies as
517    // ParseableHarnTextToolCall (it is a text-channel format) — the taxonomy is
518    // unchanged, only the body grammar the text path accepts is extended.
519    let tagged = crate::llm::tools::parse_text_tool_calls_with_tools(&content, Some(&tools));
520    let parsed = if probe_marker_present(&tagged.calls, marker) {
521        tagged
522    } else {
523        let fenced = crate::llm::tools::parse_fenced_json_tool_calls(&content);
524        if probe_marker_present(&fenced.calls, marker) {
525            fenced
526        } else {
527            tagged
528        }
529    };
530    let text_count = parsed.calls.len();
531    let text_pass = probe_marker_present(&parsed.calls, marker);
532    if text_pass {
533        return ToolConformanceCase {
534            mode,
535            ok: true,
536            classification: ToolProbeClassification::ParseableHarnTextToolCall,
537            fallback_mode: ToolProbeFallbackMode::Text,
538            failure_reason: None,
539            http_status,
540            elapsed_ms,
541            native_tool_call_count: native_count,
542            text_tool_call_count: text_count,
543            parser_errors: parsed.errors,
544            protocol_violations: parsed.violations,
545            content_sample: sample_content(&content),
546        };
547    }
548
549    let (classification, failure_reason) = if malformed_native || !parsed.errors.is_empty() {
550        (
551            ToolProbeClassification::MalformedJsonArguments,
552            Some(first_non_empty(
553                parsed.errors.first().cloned(),
554                "malformed_tool_arguments",
555            )),
556        )
557    } else if content.trim().is_empty() && native_count == 0 {
558        (
559            ToolProbeClassification::EmptySilent,
560            Some("empty_silent_response".to_string()),
561        )
562    } else if has_raw_model_tool_tag(&content) {
563        (
564            ToolProbeClassification::RawModelToolTag,
565            Some("raw_tool_tag_no_structured_calls".to_string()),
566        )
567    } else {
568        (
569            ToolProbeClassification::ProseOnlyNonTool,
570            Some("no_executable_tool_call".to_string()),
571        )
572    };
573
574    ToolConformanceCase {
575        mode,
576        ok: false,
577        classification,
578        fallback_mode: ToolProbeFallbackMode::Disabled,
579        failure_reason,
580        http_status,
581        elapsed_ms,
582        native_tool_call_count: native_count,
583        text_tool_call_count: text_count,
584        parser_errors: parsed.errors,
585        protocol_violations: parsed.violations,
586        content_sample: sample_content(&content),
587    }
588}
589
590fn chat_url(def: &ProviderDef, base_url: &str) -> Result<String, String> {
591    let endpoint = if def.chat_endpoint.trim().is_empty() {
592        "/v1/chat/completions"
593    } else {
594        def.chat_endpoint.as_str()
595    };
596    let url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
597        endpoint.to_string()
598    } else if endpoint.starts_with('/') {
599        format!("{}{}", base_url.trim_end_matches('/'), endpoint)
600    } else {
601        format!("{}/{}", base_url.trim_end_matches('/'), endpoint)
602    };
603    reqwest::Url::parse(&url)
604        .map(|_| url.clone())
605        .map_err(|error| format!("invalid provider chat URL '{url}': {error}"))
606}
607
608fn probe_request_body(provider: &str, model: &str, mode: ToolProbeMode, marker: &str) -> Value {
609    let prompt = format!(
610        "Call the {TOOL_PROBE_TOOL_NAME} tool exactly once with value {marker:?}. Do not answer in prose."
611    );
612    let tool = json!({
613        "type": "function",
614        "function": {
615            "name": TOOL_PROBE_TOOL_NAME,
616            "description": "Echo the probe marker exactly.",
617            "parameters": {
618                "type": "object",
619                "properties": {
620                    "value": {
621                        "type": "string",
622                        "description": "The marker value to echo."
623                    }
624                },
625                "required": ["value"],
626                "additionalProperties": false
627            }
628        }
629    });
630    let mut body = json!({
631        "model": model,
632        "messages": [{"role": "user", "content": prompt}],
633        "tools": [tool],
634        "stream": mode == ToolProbeMode::Streaming,
635        "temperature": 0,
636    });
637    if !crate::llm::provider::provider_uses_ollama_messages(provider, model) {
638        body["tool_choice"] = json!({
639            "type": "function",
640            "function": {"name": TOOL_PROBE_TOOL_NAME}
641        });
642    }
643    body
644}
645
/// A tool call extracted from a provider's structured `tool_calls` array.
#[derive(Debug)]
struct NativeToolCall {
    /// Tool name as resolved by the native call parser.
    name: String,
    /// Parsed arguments; `None` when they could not be parsed.
    arguments: Option<Value>,
}
651
652fn extract_native_tool_calls(response: &Value) -> Vec<NativeToolCall> {
653    let mut calls = Vec::new();
654    visit_native_tool_call_arrays(response, &mut calls);
655    calls
656}
657
658fn visit_native_tool_call_arrays(value: &Value, calls: &mut Vec<NativeToolCall>) {
659    match value {
660        Value::Object(map) => {
661            if let Some(tool_calls) = map.get("tool_calls").and_then(Value::as_array) {
662                for item in tool_calls {
663                    if let Some(call) = parse_native_tool_call(item) {
664                        calls.push(call);
665                    }
666                }
667            }
668            for child in map.values() {
669                visit_native_tool_call_arrays(child, calls);
670            }
671        }
672        Value::Array(items) => {
673            for item in items {
674                visit_native_tool_call_arrays(item, calls);
675            }
676        }
677        _ => {}
678    }
679}
680
681fn parse_native_tool_call(item: &Value) -> Option<NativeToolCall> {
682    let obj = item.as_object()?;
683    let function = obj.get("function").and_then(Value::as_object);
684    let name = function
685        .and_then(|function| function.get("name"))
686        .or_else(|| obj.get("name"))
687        .and_then(Value::as_str)?
688        .to_string();
689    match crate::llm::tools::parse_text_tool_call_from_native_name(&name) {
690        crate::llm::tools::NativeToolNameTextCall::Parsed { name, arguments } => {
691            return Some(NativeToolCall {
692                name,
693                arguments: Some(arguments),
694            });
695        }
696        crate::llm::tools::NativeToolNameTextCall::Malformed { name, .. } => {
697            return Some(NativeToolCall {
698                name,
699                arguments: None,
700            });
701        }
702        crate::llm::tools::NativeToolNameTextCall::NotCall => {}
703    }
704    let raw_args = function
705        .and_then(|function| function.get("arguments"))
706        .or_else(|| obj.get("arguments"));
707    let arguments = match raw_args {
708        Some(Value::String(raw)) => serde_json::from_str::<Value>(raw).ok(),
709        Some(value @ Value::Object(_)) => Some(value.clone()),
710        Some(_) => None,
711        None => Some(json!({})),
712    };
713    Some(NativeToolCall { name, arguments })
714}
715
716fn extract_content(response: &Value) -> String {
717    let mut parts = Vec::new();
718    visit_content(response, &mut parts);
719    parts
720        .into_iter()
721        .filter(|part| !part.trim().is_empty())
722        .collect::<Vec<_>>()
723        .join("\n")
724}
725
726fn visit_content(value: &Value, parts: &mut Vec<String>) {
727    match value {
728        Value::Object(map) => {
729            for key in ["content", "response", "text"] {
730                if let Some(text) = map.get(key).and_then(Value::as_str) {
731                    parts.push(text.to_string());
732                }
733            }
734            for child in map.values() {
735                visit_content(child, parts);
736            }
737        }
738        Value::Array(items) => {
739            for item in items {
740                visit_content(item, parts);
741            }
742        }
743        _ => {}
744    }
745}
746
747fn aggregate_stream_text(text: &str, _provider: &str) -> Value {
748    let mut content = String::new();
749    let mut calls: BTreeMap<String, PartialStreamCall> = BTreeMap::new();
750    let mut frames = Vec::new();
751    for raw_line in text.lines() {
752        let line = raw_line.trim();
753        if line.is_empty() {
754            continue;
755        }
756        let payload = line.strip_prefix("data:").map(str::trim).unwrap_or(line);
757        if payload == "[DONE]" {
758            continue;
759        }
760        let Ok(frame) = serde_json::from_str::<Value>(payload) else {
761            continue;
762        };
763        collect_stream_content_and_calls(&frame, &mut content, &mut calls);
764        frames.push(frame);
765    }
766    let tool_calls: Vec<Value> = calls
767        .into_values()
768        .map(|call| {
769            json!({
770                "id": call.id.unwrap_or_else(|| "stream_tool".to_string()),
771                "type": "function",
772                "function": {
773                    "name": call.name.unwrap_or_default(),
774                    "arguments": call.arguments,
775                }
776            })
777        })
778        .collect();
779    json!({
780        "content": content,
781        "tool_calls": tool_calls,
782        "frames": frames,
783    })
784}
785
/// Accumulator for one tool call being reassembled from stream frames.
#[derive(Debug, Default)]
struct PartialStreamCall {
    /// Most recent call id seen for this slot, if any.
    id: Option<String>,
    /// Most recent tool name seen for this slot, if any.
    name: Option<String>,
    /// Argument text: string deltas are appended, an object replaces it.
    arguments: String,
}
792
793fn collect_stream_content_and_calls(
794    frame: &Value,
795    content: &mut String,
796    calls: &mut BTreeMap<String, PartialStreamCall>,
797) {
798    if let Some(text) = frame
799        .pointer("/message/content")
800        .or_else(|| frame.pointer("/choices/0/delta/content"))
801        .or_else(|| frame.pointer("/choices/0/message/content"))
802        .or_else(|| frame.get("response"))
803        .and_then(Value::as_str)
804    {
805        content.push_str(text);
806    }
807    for item in frame
808        .pointer("/message/tool_calls")
809        .or_else(|| frame.pointer("/choices/0/delta/tool_calls"))
810        .or_else(|| frame.pointer("/choices/0/message/tool_calls"))
811        .and_then(Value::as_array)
812        .into_iter()
813        .flatten()
814    {
815        let key = item
816            .get("index")
817            .and_then(Value::as_u64)
818            .map(|index| index.to_string())
819            .or_else(|| item.get("id").and_then(Value::as_str).map(str::to_string))
820            .unwrap_or_else(|| calls.len().to_string());
821        let slot = calls.entry(key).or_default();
822        if let Some(id) = item.get("id").and_then(Value::as_str) {
823            slot.id = Some(id.to_string());
824        }
825        if let Some(name) = item
826            .pointer("/function/name")
827            .or_else(|| item.get("name"))
828            .and_then(Value::as_str)
829        {
830            slot.name = Some(name.to_string());
831        }
832        if let Some(arguments) = item
833            .pointer("/function/arguments")
834            .or_else(|| item.get("arguments"))
835        {
836            match arguments {
837                Value::String(delta) => slot.arguments.push_str(delta),
838                Value::Object(_) => slot.arguments = arguments.to_string(),
839                _ => {}
840            }
841        }
842    }
843}
844
845fn probe_tool_registry() -> VmValue {
846    let mut value_param = BTreeMap::new();
847    value_param.insert("type".to_string(), vm_str("string"));
848    value_param.insert(
849        "description".to_string(),
850        vm_str("The marker value to echo."),
851    );
852    let mut params = BTreeMap::new();
853    params.insert("value".to_string(), VmValue::dict(value_param));
854    let tool = vm_dict(&[
855        ("name", vm_str(TOOL_PROBE_TOOL_NAME)),
856        ("description", vm_str("Echo the probe marker exactly.")),
857        ("parameters", VmValue::dict(params)),
858    ]);
859    vm_dict(&[("tools", VmValue::List(std::sync::Arc::new(vec![tool])))])
860}
861
862fn vm_str(value: &str) -> VmValue {
863    VmValue::String(arcstr::ArcStr::from(value))
864}
865
866fn vm_dict(pairs: &[(&str, VmValue)]) -> VmValue {
867    let mut map = BTreeMap::new();
868    for (key, value) in pairs {
869        map.insert((*key).to_string(), value.clone());
870    }
871    VmValue::dict(map)
872}
873
/// Reports whether `content` contains, ignoring ASCII case, any of the raw
/// tool-call markers listed in `RAW_TOOL_MARKERS`.
fn has_raw_model_tool_tag(content: &str) -> bool {
    const RAW_TOOL_MARKERS: [&str; 6] = [
        "<tool_call",
        "<toolcall",
        "tool_code:",
        "tool_call:",
        "call:",
        "<function",
    ];
    let lowered = content.to_ascii_lowercase();
    RAW_TOOL_MARKERS
        .iter()
        .any(|needle| lowered.contains(*needle))
}
883
884fn content_sample(response: &Value) -> Option<String> {
885    sample_content(&extract_content(response))
886}
887
/// Returns the first 240 characters of the trimmed `content`, or `None` when
/// the trimmed content is empty.
fn sample_content(content: &str) -> Option<String> {
    let trimmed = content.trim();
    (!trimmed.is_empty()).then(|| trimmed.chars().take(240).collect())
}
896
/// Formats a failure message: `fallback` alone when the trimmed `text` is
/// empty, otherwise `fallback` followed by the first 240 characters of the
/// trimmed `text`.
fn sample_failure(text: &str, fallback: &str) -> String {
    let excerpt: String = text.trim().chars().take(240).collect();
    if excerpt.is_empty() {
        fallback.to_string()
    } else {
        format!("{fallback}: {excerpt}")
    }
}
908
/// Returns `value` unchanged when it is present and not whitespace-only;
/// otherwise returns an owned copy of `fallback`.
fn first_non_empty(value: Option<String>, fallback: &str) -> String {
    match value {
        Some(text) if !text.trim().is_empty() => text,
        _ => fallback.to_string(),
    }
}
914
915fn elapsed_ms(clock: &dyn harn_clock::Clock, started_ms: i64) -> u64 {
916    clock.monotonic_ms().saturating_sub(started_ms).max(0) as u64
917}
918
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a synthetic case for the `summarize_cases` tests.
    ///
    /// The native/text tool-call counts are set to 1 only for the matching
    /// classification; every other optional field is left empty.
    fn probe_case(
        mode: ToolProbeMode,
        ok: bool,
        classification: ToolProbeClassification,
    ) -> ToolConformanceCase {
        let is_native = matches!(
            classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
        let is_text = matches!(
            classification,
            ToolProbeClassification::ParseableHarnTextToolCall
        );
        ToolConformanceCase {
            mode,
            ok,
            classification,
            fallback_mode: ToolProbeFallbackMode::Disabled,
            failure_reason: None,
            http_status: None,
            elapsed_ms: None,
            native_tool_call_count: usize::from(is_native),
            text_tool_call_count: usize::from(is_text),
            parser_errors: vec![],
            protocol_violations: vec![],
            content_sample: None,
        }
    }

    /// A catalog key resolves to the model id the provider expects on the wire.
    #[test]
    fn probe_resolves_catalog_key_to_provider_wire_model() {
        let resolved = llm_config::resolve_model_info("baseten-glm-5.2");
        let wire_model = resolved_probe_model_id(&resolved.id);
        assert_eq!(wire_model, "zai-org/GLM-5.2");
    }

    /// A well-formed OpenAI-style `tool_calls` entry counts as a native pass.
    #[test]
    fn classify_openai_native_tool_call_as_pass() {
        let fixture = r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker","arguments":"{\"value\":\"harn_tool_probe_marker\"}"}}]}}]}"#;
        let report = classify_tool_conformance_fixture(
            "local",
            "model",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        let tool_calling = &report.tool_calling;
        assert_eq!(tool_calling.native, ToolProbeStatus::Pass);
        assert_eq!(tool_calling.fallback_mode, ToolProbeFallbackMode::Native);

        let first_case = &report.cases[0];
        assert_eq!(
            first_case.classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    /// The `name` field carries a complete text-style call while `arguments`
    /// is `{}`; the classifier is still expected to report a native pass.
    #[test]
    fn classify_native_tool_call_with_text_call_in_name_as_pass() {
        let fixture = r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker({ value: \"harn_tool_probe_marker\" })</arg_value>","arguments":"{}"}}]}}]}"#;
        let report = classify_tool_conformance_fixture(
            "zai",
            "glm-5",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        let tool_calling = &report.tool_calling;
        assert_eq!(tool_calling.native, ToolProbeStatus::Pass);
        assert_eq!(tool_calling.fallback_mode, ToolProbeFallbackMode::Native);

        let first_case = &report.cases[0];
        assert_eq!(
            first_case.classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    /// A truncated text-style call in `name` with unparseable `arguments` is
    /// reported as malformed JSON arguments and fails the native probe.
    #[test]
    fn classify_partial_text_call_in_native_name_as_malformed() {
        let fixture = r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker({ value: <<EOF","arguments":"{"}}]}}]}"#;
        let report = classify_tool_conformance_fixture(
            "zai",
            "glm-5",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        assert_eq!(report.tool_calling.native, ToolProbeStatus::Fail);

        let first_case = &report.cases[0];
        assert_eq!(
            first_case.classification,
            ToolProbeClassification::MalformedJsonArguments
        );
    }

    /// A `<tool_call>` JSON payload emitted as message content fails the
    /// native probe but passes through the text fallback.
    #[test]
    fn classify_gemma_raw_json_tool_call_content_as_text_fallback() {
        let fixture = r#"{"message":{"content":"<tool_call>{\"name\":\"echo_marker\",\"arguments\":{\"value\":\"harn_tool_probe_marker\"}}</tool_call>"}}"#;
        let report = classify_tool_conformance_fixture(
            "ollama",
            "gemma4:26b",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        let tool_calling = &report.tool_calling;
        assert_eq!(tool_calling.native, ToolProbeStatus::Fail);
        assert_eq!(tool_calling.text, ToolProbeStatus::Pass);
        assert_eq!(tool_calling.fallback_mode, ToolProbeFallbackMode::Text);

        let first_case = &report.cases[0];
        assert_eq!(
            first_case.classification,
            ToolProbeClassification::ParseableHarnTextToolCall
        );
    }

    /// A `call:name{ ... }` marker in plain content passes via text fallback.
    #[test]
    fn classify_qwen_call_colon_marker_as_text_fallback() {
        let fixture = r#"{"content":"call:echo_marker{ value: \"harn_tool_probe_marker\" }"}"#;
        let report = classify_tool_conformance_fixture(
            "llamacpp",
            "qwen",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        let tool_calling = &report.tool_calling;
        assert_eq!(tool_calling.text, ToolProbeStatus::Pass);
        assert_eq!(tool_calling.fallback_mode, ToolProbeFallbackMode::Text);
    }

    /// Plain prose with no tool call disables fallback and records the
    /// `no_executable_tool_call` failure reason.
    #[test]
    fn classify_prose_only_as_disabled() {
        let fixture =
            r#"{"message":{"content":"The comment has been added. I will now verify it."}}"#;
        let report = classify_tool_conformance_fixture(
            "ollama",
            "gemma4:26b",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Disabled
        );

        let first_case = &report.cases[0];
        assert_eq!(
            first_case.classification,
            ToolProbeClassification::ProseOnlyNonTool
        );
        assert_eq!(
            first_case.failure_reason.as_deref(),
            Some("no_executable_tool_call")
        );
    }

    /// Tool-call argument fragments split across two streaming deltas are
    /// stitched together and classified as one structured native call.
    #[test]
    fn aggregates_openai_streaming_tool_call_deltas() {
        let raw = concat!(
            "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"call_1\",\"function\":{\"name\":\"echo_marker\",\"arguments\":\"{\\\"value\\\":\"}}]}}]}\n",
            "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"function\":{\"arguments\":\"\\\"harn_tool_probe_marker\\\"}\"}}]}}]}\n",
            "data: [DONE]\n",
        );
        let response = aggregate_stream_text(raw, "local");
        let case = classify_tool_probe_response(
            ToolProbeMode::Streaming,
            &response,
            DEFAULT_TOOL_PROBE_MARKER,
            None,
            None,
        );

        assert!(case.ok, "{case:?}");
        assert_eq!(
            case.classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    /// A text-fallback pass satisfies the `tool_probe` requirement but not
    /// the `native_tool_probe` requirement.
    #[test]
    fn report_satisfies_tool_probe_when_text_fallback_passes() {
        let fixture = r#"{"content":"echo_marker({ value: \"harn_tool_probe_marker\" })"}"#;
        let report = classify_tool_conformance_fixture(
            "llamacpp",
            "qwen",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        let satisfies_any_tool_probe = report_satisfies_required_probe(&report, "tool_probe");
        assert!(satisfies_any_tool_probe);

        let satisfies_native_probe = report_satisfies_required_probe(&report, "native_tool_probe");
        assert!(!satisfies_native_probe);
    }

    /// One failing repeat is enough to fail the native summary.
    #[test]
    fn summary_requires_every_repeated_native_case_to_pass() {
        let cases = [
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::StructuredNativeToolCall,
            ),
            probe_case(
                ToolProbeMode::NonStreaming,
                false,
                ToolProbeClassification::ProseOnlyNonTool,
            ),
        ];
        let summary = summarize_cases(&cases);

        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Disabled);
    }

    /// One failing repeat is enough to fail the text summary as well.
    #[test]
    fn summary_requires_every_repeated_text_case_to_pass() {
        let cases = [
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::ParseableHarnTextToolCall,
            ),
            probe_case(
                ToolProbeMode::NonStreaming,
                false,
                ToolProbeClassification::MalformedJsonArguments,
            ),
        ];
        let summary = summarize_cases(&cases);

        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Fail);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Disabled);
    }

    /// A streaming failure does not revoke a non-streaming text-fallback pass.
    #[test]
    fn summary_preserves_nonstreaming_text_fallback_when_streaming_fails() {
        let cases = [
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::ParseableHarnTextToolCall,
            ),
            probe_case(
                ToolProbeMode::Streaming,
                false,
                ToolProbeClassification::ProseOnlyNonTool,
            ),
        ];
        let summary = summarize_cases(&cases);

        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.streaming_native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Text);
    }
}