Skip to main content

harn_vm/llm/
tool_conformance.rs

1//! One-tool provider conformance probe for local/runtime tool calling.
2//!
3//! The probe is deliberately tiny: define one harmless `echo_marker` tool,
4//! ask the model to call it with a fixed marker, and classify what came back.
5//! The classification is the stable contract eval harnesses consume; the live
6//! HTTP runner is a convenience around that classifier.
7
8use std::collections::BTreeMap;
9
10use serde::{Deserialize, Serialize};
11use serde_json::{json, Value};
12
13use crate::llm_config::{self, ProviderDef};
14use crate::value::VmValue;
15
/// Version number written into the `schema_version` field of every report.
pub const TOOL_CONFORMANCE_SCHEMA_VERSION: u32 = 1;
/// Name of the single tool the probe defines and asks the model to call.
pub const TOOL_PROBE_TOOL_NAME: &str = "echo_marker";
/// Marker value used by `ToolConformanceProbeOptions::new` when the caller
/// does not pick one.
pub const DEFAULT_TOOL_PROBE_MARKER: &str = "harn_tool_probe_marker";
19
20#[derive(Debug, Clone)]
21pub struct ToolConformanceProbeOptions {
22    pub provider: String,
23    pub model: String,
24    pub base_url: Option<String>,
25    pub modes: Vec<ToolProbeMode>,
26    pub marker: String,
27    pub repeat: usize,
28    pub timeout_secs: u64,
29}
30
31impl ToolConformanceProbeOptions {
32    pub fn new(provider: impl Into<String>, model: impl Into<String>) -> Self {
33        Self {
34            provider: provider.into(),
35            model: model.into(),
36            base_url: None,
37            modes: vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming],
38            marker: DEFAULT_TOOL_PROBE_MARKER.to_string(),
39            repeat: 1,
40            timeout_secs: 120,
41        }
42    }
43}
44
45#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
46#[serde(rename_all = "snake_case")]
47pub enum ToolProbeMode {
48    NonStreaming,
49    Streaming,
50}
51
52impl ToolProbeMode {
53    pub fn as_str(self) -> &'static str {
54        match self {
55            Self::NonStreaming => "non_streaming",
56            Self::Streaming => "streaming",
57        }
58    }
59}
60
/// What a single probe response turned out to be.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ToolProbeClassification {
    /// A structured `tool_calls` entry named the probe tool and its arguments
    /// object carried the expected marker.
    StructuredNativeToolCall,
    /// The text content parsed (tagged or fenced-JSON grammar) into the probe
    /// call with the expected marker.
    ParseableHarnTextToolCall,
    /// The text contains a raw tool-call marker (for example `<tool_call`)
    /// but nothing parsed into the probe call.
    RawModelToolTag,
    /// There was some output, but none of the other classes apply.
    ProseOnlyNonTool,
    /// A native probe call had arguments that are not a JSON object, or the
    /// text parser reported errors.
    MalformedJsonArguments,
    /// No text content and no native tool calls at all.
    EmptySilent,
    /// The provider answered with a non-success HTTP status.
    HttpError,
    /// The request could not be prepared or sent, or its body was unreadable.
    TransportError,
}
73
74#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
75#[serde(rename_all = "snake_case")]
76pub enum ToolProbeStatus {
77    Pass,
78    Fail,
79    Unknown,
80}
81
82impl ToolProbeStatus {
83    pub fn as_str(&self) -> &'static str {
84        match self {
85            Self::Pass => "pass",
86            Self::Fail => "fail",
87            Self::Unknown => "unknown",
88        }
89    }
90}
91
92#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
93#[serde(rename_all = "snake_case")]
94pub enum ToolProbeFallbackMode {
95    Native,
96    Text,
97    Disabled,
98}
99
100impl ToolProbeFallbackMode {
101    pub fn as_str(&self) -> &'static str {
102        match self {
103            Self::Native => "native",
104            Self::Text => "text",
105            Self::Disabled => "disabled",
106        }
107    }
108}
109
/// Full result of a conformance probe: the individual cases plus a summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolConformanceReport {
    /// Report schema version (`TOOL_CONFORMANCE_SCHEMA_VERSION`).
    pub schema_version: u32,
    /// Provider the probe targeted.
    pub provider: String,
    /// Model id the probe used.
    pub model: String,
    /// Base URL, when known; left out of the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub base_url: Option<String>,
    /// Name of the probe tool (`TOOL_PROBE_TOOL_NAME`).
    pub tool_name: String,
    /// Marker the model was asked to echo.
    pub marker: String,
    /// One entry per executed or classified probe case.
    pub cases: Vec<ToolConformanceCase>,
    /// Aggregate tool-calling verdict derived from `cases`.
    pub tool_calling: ToolCallingConformanceSummary,
}
122
/// Aggregate verdict over all cases of a report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolCallingConformanceSummary {
    /// Native tool calling over the non-streaming cases.
    pub native: ToolProbeStatus,
    /// Text-channel tool calling over all cases.
    pub text: ToolProbeStatus,
    /// Native tool calling over the streaming cases.
    pub streaming_native: ToolProbeStatus,
    /// Best path that passed: native, then text, otherwise disabled.
    pub fallback_mode: ToolProbeFallbackMode,
    /// First failure reason found among the cases; only set when
    /// `fallback_mode` is disabled, and left out of the serialized form when
    /// `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub failure_reason: Option<String>,
}
132
133#[derive(Debug, Clone, Serialize, Deserialize)]
134pub struct ToolConformanceCase {
135    pub mode: ToolProbeMode,
136    pub ok: bool,
137    pub classification: ToolProbeClassification,
138    pub fallback_mode: ToolProbeFallbackMode,
139    #[serde(skip_serializing_if = "Option::is_none")]
140    pub failure_reason: Option<String>,
141    #[serde(skip_serializing_if = "Option::is_none")]
142    pub http_status: Option<u16>,
143    #[serde(skip_serializing_if = "Option::is_none")]
144    pub elapsed_ms: Option<u64>,
145    pub native_tool_call_count: usize,
146    pub text_tool_call_count: usize,
147    #[serde(skip_serializing_if = "Vec::is_empty")]
148    pub parser_errors: Vec<String>,
149    #[serde(skip_serializing_if = "Vec::is_empty")]
150    pub protocol_violations: Vec<String>,
151    #[serde(skip_serializing_if = "Option::is_none")]
152    pub content_sample: Option<String>,
153}
154
155impl ToolConformanceCase {
156    fn transport_error(mode: ToolProbeMode, message: String, elapsed_ms: Option<u64>) -> Self {
157        Self {
158            mode,
159            ok: false,
160            classification: ToolProbeClassification::TransportError,
161            fallback_mode: ToolProbeFallbackMode::Disabled,
162            failure_reason: Some(message),
163            http_status: None,
164            elapsed_ms,
165            native_tool_call_count: 0,
166            text_tool_call_count: 0,
167            parser_errors: Vec::new(),
168            protocol_violations: Vec::new(),
169            content_sample: None,
170        }
171    }
172
173    fn http_error(
174        mode: ToolProbeMode,
175        status: u16,
176        message: String,
177        elapsed_ms: Option<u64>,
178    ) -> Self {
179        Self {
180            mode,
181            ok: false,
182            classification: ToolProbeClassification::HttpError,
183            fallback_mode: ToolProbeFallbackMode::Disabled,
184            failure_reason: Some(message),
185            http_status: Some(status),
186            elapsed_ms,
187            native_tool_call_count: 0,
188            text_tool_call_count: 0,
189            parser_errors: Vec::new(),
190            protocol_violations: Vec::new(),
191            content_sample: None,
192        }
193    }
194}
195
196pub async fn run_tool_conformance_probe(
197    options: ToolConformanceProbeOptions,
198) -> ToolConformanceReport {
199    let model = llm_config::resolve_model_info(&options.model);
200    let provider = if options.provider.trim().is_empty() {
201        model.provider.clone()
202    } else {
203        options.provider.clone()
204    };
205    let model_id = resolved_probe_model_id(&model.id);
206    let base_url = options.base_url.clone().or_else(|| {
207        llm_config::provider_config(&provider).map(|def| llm_config::resolve_base_url(&def))
208    });
209    let mut cases = Vec::new();
210    let modes = normalized_modes(&options.modes);
211    for _ in 0..options.repeat.max(1) {
212        for mode in &modes {
213            cases.push(
214                execute_live_probe_case(
215                    &provider,
216                    &model_id,
217                    base_url.as_deref(),
218                    *mode,
219                    &options.marker,
220                    options.timeout_secs,
221                )
222                .await,
223            );
224        }
225    }
226    report_from_cases(provider, model_id, base_url, options.marker, cases)
227}
228
/// Returns the model id the probe puts in the request body for `selector`.
///
/// Delegates to `llm_config::wire_model_id` without further processing.
fn resolved_probe_model_id(selector: &str) -> String {
    llm_config::wire_model_id(selector)
}
232
233pub fn classify_tool_conformance_fixture(
234    provider: impl Into<String>,
235    model: impl Into<String>,
236    mode: ToolProbeMode,
237    marker: impl Into<String>,
238    raw: &str,
239) -> ToolConformanceReport {
240    let marker = marker.into();
241    let response = serde_json::from_str::<Value>(raw).unwrap_or_else(|_| json!({ "content": raw }));
242    let case = classify_tool_probe_response(mode, &response, &marker, None, None);
243    report_from_cases(provider.into(), model.into(), None, marker, vec![case])
244}
245
246pub fn report_satisfies_required_probe(report: &ToolConformanceReport, requirement: &str) -> bool {
247    match requirement {
248        "tool_probe" | "tool_call_probe" => {
249            report.tool_calling.fallback_mode != ToolProbeFallbackMode::Disabled
250                && report.cases.iter().any(|case| case.ok)
251        }
252        "native_tool_probe" => report.tool_calling.native == ToolProbeStatus::Pass,
253        "streaming_tool_probe" => report.tool_calling.streaming_native == ToolProbeStatus::Pass,
254        _ => false,
255    }
256}
257
258fn normalized_modes(modes: &[ToolProbeMode]) -> Vec<ToolProbeMode> {
259    if modes.is_empty() {
260        return vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming];
261    }
262    let mut out = Vec::new();
263    for mode in modes {
264        if !out.contains(mode) {
265            out.push(*mode);
266        }
267    }
268    out
269}
270
271fn report_from_cases(
272    provider: String,
273    model: String,
274    base_url: Option<String>,
275    marker: String,
276    cases: Vec<ToolConformanceCase>,
277) -> ToolConformanceReport {
278    let summary = summarize_cases(&cases);
279    ToolConformanceReport {
280        schema_version: TOOL_CONFORMANCE_SCHEMA_VERSION,
281        provider,
282        model,
283        base_url,
284        tool_name: TOOL_PROBE_TOOL_NAME.to_string(),
285        marker,
286        cases,
287        tool_calling: summary,
288    }
289}
290
291fn summarize_cases(cases: &[ToolConformanceCase]) -> ToolCallingConformanceSummary {
292    let native = summarize_native_mode(cases, ToolProbeMode::NonStreaming);
293    let streaming_native = summarize_native_mode(cases, ToolProbeMode::Streaming);
294    let text = summarize_text_mode(cases);
295
296    let fallback_mode =
297        if native == ToolProbeStatus::Pass || streaming_native == ToolProbeStatus::Pass {
298            ToolProbeFallbackMode::Native
299        } else if text == ToolProbeStatus::Pass {
300            ToolProbeFallbackMode::Text
301        } else {
302            ToolProbeFallbackMode::Disabled
303        };
304
305    let failure_reason = if fallback_mode == ToolProbeFallbackMode::Disabled {
306        cases.iter().find_map(|case| case.failure_reason.clone())
307    } else {
308        None
309    };
310
311    ToolCallingConformanceSummary {
312        native,
313        text,
314        streaming_native,
315        fallback_mode,
316        failure_reason,
317    }
318}
319
320fn summarize_native_mode(cases: &[ToolConformanceCase], mode: ToolProbeMode) -> ToolProbeStatus {
321    let mut saw_mode = false;
322    let mut all_passed = true;
323    for case in cases.iter().filter(|case| case.mode == mode) {
324        saw_mode = true;
325        if !(case.ok && case.classification == ToolProbeClassification::StructuredNativeToolCall) {
326            all_passed = false;
327        }
328    }
329    match (saw_mode, all_passed) {
330        (false, _) => ToolProbeStatus::Unknown,
331        (true, true) => ToolProbeStatus::Pass,
332        (true, false) => ToolProbeStatus::Fail,
333    }
334}
335
336fn summarize_text_mode(cases: &[ToolConformanceCase]) -> ToolProbeStatus {
337    let mut saw_text = false;
338    let mut saw_passing_mode = false;
339    for mode in [ToolProbeMode::NonStreaming, ToolProbeMode::Streaming] {
340        let mut saw_mode = false;
341        let mut saw_text_in_mode = false;
342        let mut all_mode_cases_passed = true;
343        for case in cases.iter().filter(|case| case.mode == mode) {
344            saw_mode = true;
345            saw_text_in_mode |= case.classification
346                == ToolProbeClassification::ParseableHarnTextToolCall
347                || case.text_tool_call_count > 0;
348            if !(case.ok
349                && case.classification == ToolProbeClassification::ParseableHarnTextToolCall)
350            {
351                all_mode_cases_passed = false;
352            }
353        }
354        saw_text |= saw_text_in_mode;
355        if saw_mode && saw_text_in_mode && all_mode_cases_passed {
356            saw_passing_mode = true;
357        }
358    }
359    if !saw_text {
360        return ToolProbeStatus::Unknown;
361    }
362    if saw_passing_mode {
363        ToolProbeStatus::Pass
364    } else {
365        ToolProbeStatus::Fail
366    }
367}
368
369async fn execute_live_probe_case(
370    provider: &str,
371    model: &str,
372    base_url: Option<&str>,
373    mode: ToolProbeMode,
374    marker: &str,
375    timeout_secs: u64,
376) -> ToolConformanceCase {
377    let clock = harn_clock::RealClock::arc();
378    let started_ms = clock.monotonic_ms();
379    let Some(def) = llm_config::provider_config(provider) else {
380        return ToolConformanceCase::transport_error(
381            mode,
382            format!("unknown provider: {provider}"),
383            Some(elapsed_ms(&*clock, started_ms)),
384        );
385    };
386    let base_url = base_url
387        .filter(|value| !value.trim().is_empty())
388        .map(str::to_string)
389        .unwrap_or_else(|| llm_config::resolve_base_url(&def));
390    let url = match chat_url(&def, &base_url) {
391        Ok(url) => url,
392        Err(message) => {
393            return ToolConformanceCase::transport_error(
394                mode,
395                message,
396                Some(elapsed_ms(&*clock, started_ms)),
397            );
398        }
399    };
400    let body = probe_request_body(provider, model, mode, marker);
401    let client = if mode == ToolProbeMode::Streaming {
402        crate::llm::shared_streaming_client().clone()
403    } else {
404        crate::llm::shared_blocking_client().clone()
405    };
406    let api_key = crate::llm::helpers::resolve_api_key(provider).unwrap_or_default();
407    let request = client
408        .post(&url)
409        .header("Content-Type", "application/json")
410        .timeout(std::time::Duration::from_secs(timeout_secs))
411        .json(&body);
412    let mut request = crate::llm::api::apply_auth_headers(request, &api_key, Some(&def));
413    for (name, value) in &def.extra_headers {
414        request = request.header(name.as_str(), value.as_str());
415    }
416
417    let response = match request.send().await {
418        Ok(response) => response,
419        Err(error) => {
420            return ToolConformanceCase::transport_error(
421                mode,
422                format!("provider request failed: {error}"),
423                Some(elapsed_ms(&*clock, started_ms)),
424            );
425        }
426    };
427    let status = response.status();
428    let text = match response.text().await {
429        Ok(text) => text,
430        Err(error) => {
431            return ToolConformanceCase::transport_error(
432                mode,
433                format!("provider response was unreadable: {error}"),
434                Some(elapsed_ms(&*clock, started_ms)),
435            );
436        }
437    };
438    let elapsed = Some(elapsed_ms(&*clock, started_ms));
439    if !status.is_success() {
440        return ToolConformanceCase::http_error(
441            mode,
442            status.as_u16(),
443            sample_failure(&text, "provider returned non-success HTTP status"),
444            elapsed,
445        );
446    }
447    let response_value = if mode == ToolProbeMode::Streaming {
448        aggregate_stream_text(&text, provider)
449    } else {
450        serde_json::from_str::<Value>(&text).unwrap_or_else(|_| json!({ "content": text }))
451    };
452    classify_tool_probe_response(
453        mode,
454        &response_value,
455        marker,
456        Some(status.as_u16()),
457        elapsed,
458    )
459}
460
461/// True when `calls` contains the probe's echo_marker call (the
462/// `TOOL_PROBE_TOOL_NAME` tool with `args.value == marker`). Shared by the
463/// tagged and fenced-JSON text-channel parse attempts.
464fn probe_marker_present(calls: &[Value], marker: &str) -> bool {
465    calls.iter().any(|call| {
466        call.get("name").and_then(Value::as_str) == Some(TOOL_PROBE_TOOL_NAME)
467            && call
468                .get("arguments")
469                .and_then(|args| args.get("value"))
470                .and_then(Value::as_str)
471                == Some(marker)
472    })
473}
474
475fn classify_tool_probe_response(
476    mode: ToolProbeMode,
477    response: &Value,
478    marker: &str,
479    http_status: Option<u16>,
480    elapsed_ms: Option<u64>,
481) -> ToolConformanceCase {
482    let native = extract_native_tool_calls(response);
483    let native_count = native.len();
484    let mut malformed_native = false;
485    for call in &native {
486        if call.name == TOOL_PROBE_TOOL_NAME {
487            match &call.arguments {
488                Some(Value::Object(map))
489                    if map.get("value").and_then(Value::as_str) == Some(marker) =>
490                {
491                    return ToolConformanceCase {
492                        mode,
493                        ok: true,
494                        classification: ToolProbeClassification::StructuredNativeToolCall,
495                        fallback_mode: ToolProbeFallbackMode::Native,
496                        failure_reason: None,
497                        http_status,
498                        elapsed_ms,
499                        native_tool_call_count: native_count,
500                        text_tool_call_count: 0,
501                        parser_errors: Vec::new(),
502                        protocol_violations: Vec::new(),
503                        content_sample: content_sample(response),
504                    };
505                }
506                Some(Value::Object(_)) => {}
507                _ => malformed_native = true,
508            }
509        }
510    }
511
512    let content = extract_content(response);
513    let tools = probe_tool_registry();
514    // Try the canonical tagged/heredoc grammar first; if it does not yield the
515    // echo_marker call, also try the fenced-JSON grammar. A fenced-JSON
516    // emission that parses to the probe call still classifies as
517    // ParseableHarnTextToolCall (it is a text-channel format) — the taxonomy is
518    // unchanged, only the body grammar the text path accepts is extended.
519    let tagged = crate::llm::tools::parse_text_tool_calls_with_tools(&content, Some(&tools));
520    let parsed = if probe_marker_present(&tagged.calls, marker) {
521        tagged
522    } else {
523        let fenced = crate::llm::tools::parse_fenced_json_tool_calls(&content);
524        if probe_marker_present(&fenced.calls, marker) {
525            fenced
526        } else {
527            tagged
528        }
529    };
530    let text_count = parsed.calls.len();
531    let text_pass = probe_marker_present(&parsed.calls, marker);
532    if text_pass {
533        return ToolConformanceCase {
534            mode,
535            ok: true,
536            classification: ToolProbeClassification::ParseableHarnTextToolCall,
537            fallback_mode: ToolProbeFallbackMode::Text,
538            failure_reason: None,
539            http_status,
540            elapsed_ms,
541            native_tool_call_count: native_count,
542            text_tool_call_count: text_count,
543            parser_errors: parsed.errors,
544            protocol_violations: parsed.violations,
545            content_sample: sample_content(&content),
546        };
547    }
548
549    let (classification, failure_reason) = if malformed_native || !parsed.errors.is_empty() {
550        (
551            ToolProbeClassification::MalformedJsonArguments,
552            Some(first_non_empty(
553                parsed.errors.first().cloned(),
554                "malformed_tool_arguments",
555            )),
556        )
557    } else if content.trim().is_empty() && native_count == 0 {
558        (
559            ToolProbeClassification::EmptySilent,
560            Some("empty_silent_response".to_string()),
561        )
562    } else if has_raw_model_tool_tag(&content) {
563        (
564            ToolProbeClassification::RawModelToolTag,
565            Some("raw_tool_tag_no_structured_calls".to_string()),
566        )
567    } else {
568        (
569            ToolProbeClassification::ProseOnlyNonTool,
570            Some("no_executable_tool_call".to_string()),
571        )
572    };
573
574    ToolConformanceCase {
575        mode,
576        ok: false,
577        classification,
578        fallback_mode: ToolProbeFallbackMode::Disabled,
579        failure_reason,
580        http_status,
581        elapsed_ms,
582        native_tool_call_count: native_count,
583        text_tool_call_count: text_count,
584        parser_errors: parsed.errors,
585        protocol_violations: parsed.violations,
586        content_sample: sample_content(&content),
587    }
588}
589
590fn chat_url(def: &ProviderDef, base_url: &str) -> Result<String, String> {
591    let endpoint = if def.chat_endpoint.trim().is_empty() {
592        "/v1/chat/completions"
593    } else {
594        def.chat_endpoint.as_str()
595    };
596    let url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
597        endpoint.to_string()
598    } else if endpoint.starts_with('/') {
599        format!("{}{}", base_url.trim_end_matches('/'), endpoint)
600    } else {
601        format!("{}/{}", base_url.trim_end_matches('/'), endpoint)
602    };
603    reqwest::Url::parse(&url)
604        .map(|_| url.clone())
605        .map_err(|error| format!("invalid provider chat URL '{url}': {error}"))
606}
607
608fn probe_request_body(provider: &str, model: &str, mode: ToolProbeMode, marker: &str) -> Value {
609    let prompt = format!(
610        "Call the {TOOL_PROBE_TOOL_NAME} tool exactly once with value {marker:?}. Do not answer in prose."
611    );
612    let tool = json!({
613        "type": "function",
614        "function": {
615            "name": TOOL_PROBE_TOOL_NAME,
616            "description": "Echo the probe marker exactly.",
617            "parameters": {
618                "type": "object",
619                "properties": {
620                    "value": {
621                        "type": "string",
622                        "description": "The marker value to echo."
623                    }
624                },
625                "required": ["value"],
626                "additionalProperties": false
627            }
628        }
629    });
630    let mut body = json!({
631        "model": model,
632        "messages": [{"role": "user", "content": prompt}],
633        "tools": [tool],
634        "stream": mode == ToolProbeMode::Streaming,
635        "temperature": 0,
636    });
637    if !crate::llm::provider::provider_uses_ollama_messages(provider, model) {
638        body["tool_choice"] = json!({
639            "type": "function",
640            "function": {"name": TOOL_PROBE_TOOL_NAME}
641        });
642    }
643    body
644}
645
/// A tool call found in a structured `tool_calls` array.
#[derive(Debug)]
struct NativeToolCall {
    /// Function name, taken from `function.name` or a top-level `name`.
    name: String,
    /// Parsed arguments. `None` when arguments were present but unusable: a
    /// string that is not valid JSON, or a value that is neither a string nor
    /// an object. Absent arguments are stored as an empty object.
    arguments: Option<Value>,
}
651
652fn extract_native_tool_calls(response: &Value) -> Vec<NativeToolCall> {
653    let mut calls = Vec::new();
654    visit_native_tool_call_arrays(response, &mut calls);
655    calls
656}
657
658fn visit_native_tool_call_arrays(value: &Value, calls: &mut Vec<NativeToolCall>) {
659    match value {
660        Value::Object(map) => {
661            if let Some(tool_calls) = map.get("tool_calls").and_then(Value::as_array) {
662                for item in tool_calls {
663                    if let Some(call) = parse_native_tool_call(item) {
664                        calls.push(call);
665                    }
666                }
667            }
668            for child in map.values() {
669                visit_native_tool_call_arrays(child, calls);
670            }
671        }
672        Value::Array(items) => {
673            for item in items {
674                visit_native_tool_call_arrays(item, calls);
675            }
676        }
677        _ => {}
678    }
679}
680
681fn parse_native_tool_call(item: &Value) -> Option<NativeToolCall> {
682    let obj = item.as_object()?;
683    let function = obj.get("function").and_then(Value::as_object);
684    let name = function
685        .and_then(|function| function.get("name"))
686        .or_else(|| obj.get("name"))
687        .and_then(Value::as_str)?
688        .to_string();
689    let raw_args = function
690        .and_then(|function| function.get("arguments"))
691        .or_else(|| obj.get("arguments"));
692    let arguments = match raw_args {
693        Some(Value::String(raw)) => serde_json::from_str::<Value>(raw).ok(),
694        Some(value @ Value::Object(_)) => Some(value.clone()),
695        Some(_) => None,
696        None => Some(json!({})),
697    };
698    Some(NativeToolCall { name, arguments })
699}
700
701fn extract_content(response: &Value) -> String {
702    let mut parts = Vec::new();
703    visit_content(response, &mut parts);
704    parts
705        .into_iter()
706        .filter(|part| !part.trim().is_empty())
707        .collect::<Vec<_>>()
708        .join("\n")
709}
710
711fn visit_content(value: &Value, parts: &mut Vec<String>) {
712    match value {
713        Value::Object(map) => {
714            for key in ["content", "response", "text"] {
715                if let Some(text) = map.get(key).and_then(Value::as_str) {
716                    parts.push(text.to_string());
717                }
718            }
719            for child in map.values() {
720                visit_content(child, parts);
721            }
722        }
723        Value::Array(items) => {
724            for item in items {
725                visit_content(item, parts);
726            }
727        }
728        _ => {}
729    }
730}
731
732fn aggregate_stream_text(text: &str, _provider: &str) -> Value {
733    let mut content = String::new();
734    let mut calls: BTreeMap<String, PartialStreamCall> = BTreeMap::new();
735    let mut frames = Vec::new();
736    for raw_line in text.lines() {
737        let line = raw_line.trim();
738        if line.is_empty() {
739            continue;
740        }
741        let payload = line.strip_prefix("data:").map(str::trim).unwrap_or(line);
742        if payload == "[DONE]" {
743            continue;
744        }
745        let Ok(frame) = serde_json::from_str::<Value>(payload) else {
746            continue;
747        };
748        collect_stream_content_and_calls(&frame, &mut content, &mut calls);
749        frames.push(frame);
750    }
751    let tool_calls: Vec<Value> = calls
752        .into_values()
753        .map(|call| {
754            json!({
755                "id": call.id.unwrap_or_else(|| "stream_tool".to_string()),
756                "type": "function",
757                "function": {
758                    "name": call.name.unwrap_or_default(),
759                    "arguments": call.arguments,
760                }
761            })
762        })
763        .collect();
764    json!({
765        "content": content,
766        "tool_calls": tool_calls,
767        "frames": frames,
768    })
769}
770
/// A tool call being reassembled from streamed frames.
#[derive(Debug, Default)]
struct PartialStreamCall {
    /// Most recent call id seen for this slot, if any.
    id: Option<String>,
    /// Most recent function name seen for this slot, if any.
    name: Option<String>,
    /// Argument text: string deltas are appended, while an object value
    /// replaces the text with its JSON serialization.
    arguments: String,
}
777
778fn collect_stream_content_and_calls(
779    frame: &Value,
780    content: &mut String,
781    calls: &mut BTreeMap<String, PartialStreamCall>,
782) {
783    if let Some(text) = frame
784        .pointer("/message/content")
785        .or_else(|| frame.pointer("/choices/0/delta/content"))
786        .or_else(|| frame.pointer("/choices/0/message/content"))
787        .or_else(|| frame.get("response"))
788        .and_then(Value::as_str)
789    {
790        content.push_str(text);
791    }
792    for item in frame
793        .pointer("/message/tool_calls")
794        .or_else(|| frame.pointer("/choices/0/delta/tool_calls"))
795        .or_else(|| frame.pointer("/choices/0/message/tool_calls"))
796        .and_then(Value::as_array)
797        .into_iter()
798        .flatten()
799    {
800        let key = item
801            .get("index")
802            .and_then(Value::as_u64)
803            .map(|index| index.to_string())
804            .or_else(|| item.get("id").and_then(Value::as_str).map(str::to_string))
805            .unwrap_or_else(|| calls.len().to_string());
806        let slot = calls.entry(key).or_default();
807        if let Some(id) = item.get("id").and_then(Value::as_str) {
808            slot.id = Some(id.to_string());
809        }
810        if let Some(name) = item
811            .pointer("/function/name")
812            .or_else(|| item.get("name"))
813            .and_then(Value::as_str)
814        {
815            slot.name = Some(name.to_string());
816        }
817        if let Some(arguments) = item
818            .pointer("/function/arguments")
819            .or_else(|| item.get("arguments"))
820        {
821            match arguments {
822                Value::String(delta) => slot.arguments.push_str(delta),
823                Value::Object(_) => slot.arguments = arguments.to_string(),
824                _ => {}
825            }
826        }
827    }
828}
829
830fn probe_tool_registry() -> VmValue {
831    let mut value_param = BTreeMap::new();
832    value_param.insert("type".to_string(), vm_str("string"));
833    value_param.insert(
834        "description".to_string(),
835        vm_str("The marker value to echo."),
836    );
837    let mut params = BTreeMap::new();
838    params.insert("value".to_string(), VmValue::dict(value_param));
839    let tool = vm_dict(&[
840        ("name", vm_str(TOOL_PROBE_TOOL_NAME)),
841        ("description", vm_str("Echo the probe marker exactly.")),
842        ("parameters", VmValue::dict(params)),
843    ]);
844    vm_dict(&[("tools", VmValue::List(std::sync::Arc::new(vec![tool])))])
845}
846
847fn vm_str(value: &str) -> VmValue {
848    VmValue::String(arcstr::ArcStr::from(value))
849}
850
851fn vm_dict(pairs: &[(&str, VmValue)]) -> VmValue {
852    let mut map = BTreeMap::new();
853    for (key, value) in pairs {
854        map.insert((*key).to_string(), value.clone());
855    }
856    VmValue::dict(map)
857}
858
/// Reports whether `content` contains any known raw tool-call marker,
/// compared ASCII case-insensitively.
fn has_raw_model_tool_tag(content: &str) -> bool {
    const RAW_MARKERS: [&str; 6] = [
        "<tool_call",
        "<toolcall",
        "tool_code:",
        "tool_call:",
        "call:",
        "<function",
    ];
    let lowered = content.to_ascii_lowercase();
    RAW_MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker))
}
868
869fn content_sample(response: &Value) -> Option<String> {
870    sample_content(&extract_content(response))
871}
872
/// Returns at most the first 240 characters of the trimmed `content`, or
/// `None` when the trimmed content is empty.
fn sample_content(content: &str) -> Option<String> {
    let trimmed = content.trim();
    (!trimmed.is_empty()).then(|| trimmed.chars().take(240).collect())
}
881
/// Builds a failure message: `fallback` alone when `text` is blank, otherwise
/// `fallback` followed by `": "` and at most the first 240 characters of the
/// trimmed `text`.
fn sample_failure(text: &str, fallback: &str) -> String {
    match text.trim() {
        "" => fallback.to_string(),
        trimmed => {
            let excerpt: String = trimmed.chars().take(240).collect();
            format!("{fallback}: {excerpt}")
        }
    }
}
893
/// Returns `value` when it holds non-blank text, otherwise `fallback`.
fn first_non_empty(value: Option<String>, fallback: &str) -> String {
    match value {
        Some(text) if !text.trim().is_empty() => text,
        _ => fallback.to_string(),
    }
}
899
900fn elapsed_ms(clock: &dyn harn_clock::Clock, started_ms: i64) -> u64 {
901    clock.monotonic_ms().saturating_sub(started_ms).max(0) as u64
902}
903
#[cfg(test)]
mod tests {
    use super::*;

    /// The catalog key is translated to the model id the probe sends.
    #[test]
    fn probe_resolves_catalog_key_to_provider_wire_model() {
        let info = llm_config::resolve_model_info("baseten-glm-5.2");
        let wire_model = resolved_probe_model_id(&info.id);
        assert_eq!(wire_model, "zai-org/GLM-5.2");
    }

    /// A structured OpenAI-style `tool_calls` entry counts as a native pass.
    #[test]
    fn classify_openai_native_tool_call_as_pass() {
        let fixture = r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker","arguments":"{\"value\":\"harn_tool_probe_marker\"}"}}]}}]}"#;
        let report = classify_tool_conformance_fixture(
            "local",
            "model",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        let summary = &report.tool_calling;
        assert_eq!(summary.native, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Native);

        let first = &report.cases[0];
        assert_eq!(
            first.classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    /// A `<tool_call>` tag carried in message content passes only as text.
    #[test]
    fn classify_gemma_raw_json_tool_call_content_as_text_fallback() {
        let fixture = r#"{"message":{"content":"<tool_call>{\"name\":\"echo_marker\",\"arguments\":{\"value\":\"harn_tool_probe_marker\"}}</tool_call>"}}"#;
        let report = classify_tool_conformance_fixture(
            "ollama",
            "gemma4:26b",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        let summary = &report.tool_calling;
        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Text);

        let first = &report.cases[0];
        assert_eq!(
            first.classification,
            ToolProbeClassification::ParseableHarnTextToolCall
        );
    }

    /// The `call:name{...}` content form is accepted as a text tool call.
    #[test]
    fn classify_qwen_call_colon_marker_as_text_fallback() {
        let fixture = r#"{"content":"call:echo_marker{ value: \"harn_tool_probe_marker\" }"}"#;
        let report = classify_tool_conformance_fixture(
            "llamacpp",
            "qwen",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        let summary = &report.tool_calling;
        assert_eq!(summary.text, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Text);
    }

    /// Plain prose without any tool call disables tool calling and records
    /// the `no_executable_tool_call` failure reason.
    #[test]
    fn classify_prose_only_as_disabled() {
        let fixture =
            r#"{"message":{"content":"The comment has been added. I will now verify it."}}"#;
        let report = classify_tool_conformance_fixture(
            "ollama",
            "gemma4:26b",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Disabled
        );

        let first = &report.cases[0];
        assert_eq!(
            first.classification,
            ToolProbeClassification::ProseOnlyNonTool
        );
        assert_eq!(
            first.failure_reason.as_deref(),
            Some("no_executable_tool_call")
        );
    }

    /// Tool-call argument fragments split across stream events are stitched
    /// back together before classification.
    #[test]
    fn aggregates_openai_streaming_tool_call_deltas() {
        let stream = concat!(
            "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"call_1\",\"function\":{\"name\":\"echo_marker\",\"arguments\":\"{\\\"value\\\":\"}}]}}]}\n",
            "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"function\":{\"arguments\":\"\\\"harn_tool_probe_marker\\\"}\"}}]}}]}\n",
            "data: [DONE]\n",
        );
        let aggregated = aggregate_stream_text(stream, "local");
        let case = classify_tool_probe_response(
            ToolProbeMode::Streaming,
            &aggregated,
            DEFAULT_TOOL_PROBE_MARKER,
            None,
            None,
        );

        assert!(case.ok, "{case:?}");
        assert_eq!(
            case.classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    /// A text-fallback pass satisfies `tool_probe` but not `native_tool_probe`.
    #[test]
    fn report_satisfies_tool_probe_when_text_fallback_passes() {
        let fixture = r#"{"content":"echo_marker({ value: \"harn_tool_probe_marker\" })"}"#;
        let report = classify_tool_conformance_fixture(
            "llamacpp",
            "qwen",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            fixture,
        );

        assert!(report_satisfies_required_probe(&report, "tool_probe"));
        let native_satisfied = report_satisfies_required_probe(&report, "native_tool_probe");
        assert!(!native_satisfied);
    }

    /// One failing repeat is enough to fail the native summary.
    #[test]
    fn summary_requires_every_repeated_native_case_to_pass() {
        let cases = [
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::StructuredNativeToolCall,
            ),
            probe_case(
                ToolProbeMode::NonStreaming,
                false,
                ToolProbeClassification::ProseOnlyNonTool,
            ),
        ];
        let summary = summarize_cases(&cases);

        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Disabled);
    }

    /// One failing repeat is enough to fail the text summary.
    #[test]
    fn summary_requires_every_repeated_text_case_to_pass() {
        let cases = [
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::ParseableHarnTextToolCall,
            ),
            probe_case(
                ToolProbeMode::NonStreaming,
                false,
                ToolProbeClassification::MalformedJsonArguments,
            ),
        ];
        let summary = summarize_cases(&cases);

        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Fail);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Disabled);
    }

    /// A failing streaming case does not revoke a non-streaming text pass.
    #[test]
    fn summary_preserves_nonstreaming_text_fallback_when_streaming_fails() {
        let cases = [
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::ParseableHarnTextToolCall,
            ),
            probe_case(
                ToolProbeMode::Streaming,
                false,
                ToolProbeClassification::ProseOnlyNonTool,
            ),
        ];
        let summary = summarize_cases(&cases);

        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.streaming_native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Text);
    }

    /// Builds a minimal case for summary tests.
    ///
    /// The native and text tool-call counts are set to 1 only for the
    /// matching classification; every other optional field is left empty.
    fn probe_case(
        mode: ToolProbeMode,
        ok: bool,
        classification: ToolProbeClassification,
    ) -> ToolConformanceCase {
        let is_native = classification == ToolProbeClassification::StructuredNativeToolCall;
        let is_text = classification == ToolProbeClassification::ParseableHarnTextToolCall;
        ToolConformanceCase {
            mode,
            ok,
            classification,
            fallback_mode: ToolProbeFallbackMode::Disabled,
            failure_reason: None,
            http_status: None,
            elapsed_ms: None,
            native_tool_call_count: usize::from(is_native),
            text_tool_call_count: usize::from(is_text),
            parser_errors: Vec::new(),
            protocol_violations: Vec::new(),
            content_sample: None,
        }
    }
}