1use std::collections::BTreeMap;
9
10use serde::{Deserialize, Serialize};
11use serde_json::{json, Value};
12
13use crate::llm_config::{self, ProviderDef};
14use crate::value::VmValue;
15
/// Version number written into the `schema_version` field of every report.
pub const TOOL_CONFORMANCE_SCHEMA_VERSION: u32 = 1;
/// Name of the single tool that is offered to the model during a probe.
pub const TOOL_PROBE_TOOL_NAME: &str = "echo_marker";
/// Marker value used when the caller does not pick one explicitly.
pub const DEFAULT_TOOL_PROBE_MARKER: &str = "harn_tool_probe_marker";
19
20#[derive(Debug, Clone)]
21pub struct ToolConformanceProbeOptions {
22 pub provider: String,
23 pub model: String,
24 pub base_url: Option<String>,
25 pub modes: Vec<ToolProbeMode>,
26 pub marker: String,
27 pub repeat: usize,
28 pub timeout_secs: u64,
29}
30
31impl ToolConformanceProbeOptions {
32 pub fn new(provider: impl Into<String>, model: impl Into<String>) -> Self {
33 Self {
34 provider: provider.into(),
35 model: model.into(),
36 base_url: None,
37 modes: vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming],
38 marker: DEFAULT_TOOL_PROBE_MARKER.to_string(),
39 repeat: 1,
40 timeout_secs: 120,
41 }
42 }
43}
44
45#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
46#[serde(rename_all = "snake_case")]
47pub enum ToolProbeMode {
48 NonStreaming,
49 Streaming,
50}
51
52impl ToolProbeMode {
53 pub fn as_str(self) -> &'static str {
54 match self {
55 Self::NonStreaming => "non_streaming",
56 Self::Streaming => "streaming",
57 }
58 }
59}
60
61#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
62#[serde(rename_all = "snake_case")]
63pub enum ToolProbeClassification {
64 StructuredNativeToolCall,
65 ParseableHarnTextToolCall,
66 RawModelToolTag,
67 ProseOnlyNonTool,
68 MalformedJsonArguments,
69 EmptySilent,
70 HttpError,
71 TransportError,
72}
73
74#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
75#[serde(rename_all = "snake_case")]
76pub enum ToolProbeStatus {
77 Pass,
78 Fail,
79 Unknown,
80}
81
82impl ToolProbeStatus {
83 pub fn as_str(&self) -> &'static str {
84 match self {
85 Self::Pass => "pass",
86 Self::Fail => "fail",
87 Self::Unknown => "unknown",
88 }
89 }
90}
91
92#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
93#[serde(rename_all = "snake_case")]
94pub enum ToolProbeFallbackMode {
95 Native,
96 Text,
97 Disabled,
98}
99
100impl ToolProbeFallbackMode {
101 pub fn as_str(&self) -> &'static str {
102 match self {
103 Self::Native => "native",
104 Self::Text => "text",
105 Self::Disabled => "disabled",
106 }
107 }
108}
109
110#[derive(Debug, Clone, Serialize, Deserialize)]
111pub struct ToolConformanceReport {
112 pub schema_version: u32,
113 pub provider: String,
114 pub model: String,
115 #[serde(skip_serializing_if = "Option::is_none")]
116 pub base_url: Option<String>,
117 pub tool_name: String,
118 pub marker: String,
119 pub cases: Vec<ToolConformanceCase>,
120 pub tool_calling: ToolCallingConformanceSummary,
121}
122
123#[derive(Debug, Clone, Serialize, Deserialize)]
124pub struct ToolCallingConformanceSummary {
125 pub native: ToolProbeStatus,
126 pub text: ToolProbeStatus,
127 pub streaming_native: ToolProbeStatus,
128 pub fallback_mode: ToolProbeFallbackMode,
129 #[serde(skip_serializing_if = "Option::is_none")]
130 pub failure_reason: Option<String>,
131}
132
133#[derive(Debug, Clone, Serialize, Deserialize)]
134pub struct ToolConformanceCase {
135 pub mode: ToolProbeMode,
136 pub ok: bool,
137 pub classification: ToolProbeClassification,
138 pub fallback_mode: ToolProbeFallbackMode,
139 #[serde(skip_serializing_if = "Option::is_none")]
140 pub failure_reason: Option<String>,
141 #[serde(skip_serializing_if = "Option::is_none")]
142 pub http_status: Option<u16>,
143 #[serde(skip_serializing_if = "Option::is_none")]
144 pub elapsed_ms: Option<u64>,
145 pub native_tool_call_count: usize,
146 pub text_tool_call_count: usize,
147 #[serde(skip_serializing_if = "Vec::is_empty")]
148 pub parser_errors: Vec<String>,
149 #[serde(skip_serializing_if = "Vec::is_empty")]
150 pub protocol_violations: Vec<String>,
151 #[serde(skip_serializing_if = "Option::is_none")]
152 pub content_sample: Option<String>,
153}
154
155impl ToolConformanceCase {
156 fn transport_error(mode: ToolProbeMode, message: String, elapsed_ms: Option<u64>) -> Self {
157 Self {
158 mode,
159 ok: false,
160 classification: ToolProbeClassification::TransportError,
161 fallback_mode: ToolProbeFallbackMode::Disabled,
162 failure_reason: Some(message),
163 http_status: None,
164 elapsed_ms,
165 native_tool_call_count: 0,
166 text_tool_call_count: 0,
167 parser_errors: Vec::new(),
168 protocol_violations: Vec::new(),
169 content_sample: None,
170 }
171 }
172
173 fn http_error(
174 mode: ToolProbeMode,
175 status: u16,
176 message: String,
177 elapsed_ms: Option<u64>,
178 ) -> Self {
179 Self {
180 mode,
181 ok: false,
182 classification: ToolProbeClassification::HttpError,
183 fallback_mode: ToolProbeFallbackMode::Disabled,
184 failure_reason: Some(message),
185 http_status: Some(status),
186 elapsed_ms,
187 native_tool_call_count: 0,
188 text_tool_call_count: 0,
189 parser_errors: Vec::new(),
190 protocol_violations: Vec::new(),
191 content_sample: None,
192 }
193 }
194}
195
196pub async fn run_tool_conformance_probe(
197 options: ToolConformanceProbeOptions,
198) -> ToolConformanceReport {
199 let model = llm_config::resolve_model_info(&options.model);
200 let provider = if options.provider.trim().is_empty() {
201 model.provider.clone()
202 } else {
203 options.provider.clone()
204 };
205 let model_id = resolved_probe_model_id(&model.id);
206 let base_url = options.base_url.clone().or_else(|| {
207 llm_config::provider_config(&provider).map(|def| llm_config::resolve_base_url(&def))
208 });
209 let mut cases = Vec::new();
210 let modes = normalized_modes(&options.modes);
211 for _ in 0..options.repeat.max(1) {
212 for mode in &modes {
213 cases.push(
214 execute_live_probe_case(
215 &provider,
216 &model_id,
217 base_url.as_deref(),
218 *mode,
219 &options.marker,
220 options.timeout_secs,
221 )
222 .await,
223 );
224 }
225 }
226 report_from_cases(provider, model_id, base_url, options.marker, cases)
227}
228
229fn resolved_probe_model_id(selector: &str) -> String {
230 llm_config::wire_model_id(selector)
231}
232
233pub fn classify_tool_conformance_fixture(
234 provider: impl Into<String>,
235 model: impl Into<String>,
236 mode: ToolProbeMode,
237 marker: impl Into<String>,
238 raw: &str,
239) -> ToolConformanceReport {
240 let marker = marker.into();
241 let response = serde_json::from_str::<Value>(raw).unwrap_or_else(|_| json!({ "content": raw }));
242 let case = classify_tool_probe_response(mode, &response, &marker, None, None);
243 report_from_cases(provider.into(), model.into(), None, marker, vec![case])
244}
245
246pub fn report_satisfies_required_probe(report: &ToolConformanceReport, requirement: &str) -> bool {
247 match requirement {
248 "tool_probe" | "tool_call_probe" => {
249 report.tool_calling.fallback_mode != ToolProbeFallbackMode::Disabled
250 && report.cases.iter().any(|case| case.ok)
251 }
252 "native_tool_probe" => report.tool_calling.native == ToolProbeStatus::Pass,
253 "streaming_tool_probe" => report.tool_calling.streaming_native == ToolProbeStatus::Pass,
254 _ => false,
255 }
256}
257
258fn normalized_modes(modes: &[ToolProbeMode]) -> Vec<ToolProbeMode> {
259 if modes.is_empty() {
260 return vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming];
261 }
262 let mut out = Vec::new();
263 for mode in modes {
264 if !out.contains(mode) {
265 out.push(*mode);
266 }
267 }
268 out
269}
270
271fn report_from_cases(
272 provider: String,
273 model: String,
274 base_url: Option<String>,
275 marker: String,
276 cases: Vec<ToolConformanceCase>,
277) -> ToolConformanceReport {
278 let summary = summarize_cases(&cases);
279 ToolConformanceReport {
280 schema_version: TOOL_CONFORMANCE_SCHEMA_VERSION,
281 provider,
282 model,
283 base_url,
284 tool_name: TOOL_PROBE_TOOL_NAME.to_string(),
285 marker,
286 cases,
287 tool_calling: summary,
288 }
289}
290
291fn summarize_cases(cases: &[ToolConformanceCase]) -> ToolCallingConformanceSummary {
292 let native = summarize_native_mode(cases, ToolProbeMode::NonStreaming);
293 let streaming_native = summarize_native_mode(cases, ToolProbeMode::Streaming);
294 let text = summarize_text_mode(cases);
295
296 let fallback_mode =
297 if native == ToolProbeStatus::Pass || streaming_native == ToolProbeStatus::Pass {
298 ToolProbeFallbackMode::Native
299 } else if text == ToolProbeStatus::Pass {
300 ToolProbeFallbackMode::Text
301 } else {
302 ToolProbeFallbackMode::Disabled
303 };
304
305 let failure_reason = if fallback_mode == ToolProbeFallbackMode::Disabled {
306 cases.iter().find_map(|case| case.failure_reason.clone())
307 } else {
308 None
309 };
310
311 ToolCallingConformanceSummary {
312 native,
313 text,
314 streaming_native,
315 fallback_mode,
316 failure_reason,
317 }
318}
319
320fn summarize_native_mode(cases: &[ToolConformanceCase], mode: ToolProbeMode) -> ToolProbeStatus {
321 let mut saw_mode = false;
322 let mut all_passed = true;
323 for case in cases.iter().filter(|case| case.mode == mode) {
324 saw_mode = true;
325 if !(case.ok && case.classification == ToolProbeClassification::StructuredNativeToolCall) {
326 all_passed = false;
327 }
328 }
329 match (saw_mode, all_passed) {
330 (false, _) => ToolProbeStatus::Unknown,
331 (true, true) => ToolProbeStatus::Pass,
332 (true, false) => ToolProbeStatus::Fail,
333 }
334}
335
336fn summarize_text_mode(cases: &[ToolConformanceCase]) -> ToolProbeStatus {
337 let mut saw_text = false;
338 let mut saw_passing_mode = false;
339 for mode in [ToolProbeMode::NonStreaming, ToolProbeMode::Streaming] {
340 let mut saw_mode = false;
341 let mut saw_text_in_mode = false;
342 let mut all_mode_cases_passed = true;
343 for case in cases.iter().filter(|case| case.mode == mode) {
344 saw_mode = true;
345 saw_text_in_mode |= case.classification
346 == ToolProbeClassification::ParseableHarnTextToolCall
347 || case.text_tool_call_count > 0;
348 if !(case.ok
349 && case.classification == ToolProbeClassification::ParseableHarnTextToolCall)
350 {
351 all_mode_cases_passed = false;
352 }
353 }
354 saw_text |= saw_text_in_mode;
355 if saw_mode && saw_text_in_mode && all_mode_cases_passed {
356 saw_passing_mode = true;
357 }
358 }
359 if !saw_text {
360 return ToolProbeStatus::Unknown;
361 }
362 if saw_passing_mode {
363 ToolProbeStatus::Pass
364 } else {
365 ToolProbeStatus::Fail
366 }
367}
368
369async fn execute_live_probe_case(
370 provider: &str,
371 model: &str,
372 base_url: Option<&str>,
373 mode: ToolProbeMode,
374 marker: &str,
375 timeout_secs: u64,
376) -> ToolConformanceCase {
377 let clock = harn_clock::RealClock::arc();
378 let started_ms = clock.monotonic_ms();
379 let Some(def) = llm_config::provider_config(provider) else {
380 return ToolConformanceCase::transport_error(
381 mode,
382 format!("unknown provider: {provider}"),
383 Some(elapsed_ms(&*clock, started_ms)),
384 );
385 };
386 let base_url = base_url
387 .filter(|value| !value.trim().is_empty())
388 .map(str::to_string)
389 .unwrap_or_else(|| llm_config::resolve_base_url(&def));
390 let url = match chat_url(&def, &base_url) {
391 Ok(url) => url,
392 Err(message) => {
393 return ToolConformanceCase::transport_error(
394 mode,
395 message,
396 Some(elapsed_ms(&*clock, started_ms)),
397 );
398 }
399 };
400 let body = probe_request_body(provider, model, mode, marker);
401 let client = if mode == ToolProbeMode::Streaming {
402 crate::llm::shared_streaming_client().clone()
403 } else {
404 crate::llm::shared_blocking_client().clone()
405 };
406 let api_key = crate::llm::helpers::resolve_api_key(provider).unwrap_or_default();
407 let request = client
408 .post(&url)
409 .header("Content-Type", "application/json")
410 .timeout(std::time::Duration::from_secs(timeout_secs))
411 .json(&body);
412 let mut request = crate::llm::api::apply_auth_headers(request, &api_key, Some(&def));
413 for (name, value) in &def.extra_headers {
414 request = request.header(name.as_str(), value.as_str());
415 }
416
417 let response = match request.send().await {
418 Ok(response) => response,
419 Err(error) => {
420 return ToolConformanceCase::transport_error(
421 mode,
422 format!("provider request failed: {error}"),
423 Some(elapsed_ms(&*clock, started_ms)),
424 );
425 }
426 };
427 let status = response.status();
428 let text = match response.text().await {
429 Ok(text) => text,
430 Err(error) => {
431 return ToolConformanceCase::transport_error(
432 mode,
433 format!("provider response was unreadable: {error}"),
434 Some(elapsed_ms(&*clock, started_ms)),
435 );
436 }
437 };
438 let elapsed = Some(elapsed_ms(&*clock, started_ms));
439 if !status.is_success() {
440 return ToolConformanceCase::http_error(
441 mode,
442 status.as_u16(),
443 sample_failure(&text, "provider returned non-success HTTP status"),
444 elapsed,
445 );
446 }
447 let response_value = if mode == ToolProbeMode::Streaming {
448 aggregate_stream_text(&text, provider)
449 } else {
450 serde_json::from_str::<Value>(&text).unwrap_or_else(|_| json!({ "content": text }))
451 };
452 classify_tool_probe_response(
453 mode,
454 &response_value,
455 marker,
456 Some(status.as_u16()),
457 elapsed,
458 )
459}
460
461fn probe_marker_present(calls: &[Value], marker: &str) -> bool {
465 calls.iter().any(|call| {
466 call.get("name").and_then(Value::as_str) == Some(TOOL_PROBE_TOOL_NAME)
467 && call
468 .get("arguments")
469 .and_then(|args| args.get("value"))
470 .and_then(Value::as_str)
471 == Some(marker)
472 })
473}
474
475fn classify_tool_probe_response(
476 mode: ToolProbeMode,
477 response: &Value,
478 marker: &str,
479 http_status: Option<u16>,
480 elapsed_ms: Option<u64>,
481) -> ToolConformanceCase {
482 let native = extract_native_tool_calls(response);
483 let native_count = native.len();
484 let mut malformed_native = false;
485 for call in &native {
486 if call.name == TOOL_PROBE_TOOL_NAME {
487 match &call.arguments {
488 Some(Value::Object(map))
489 if map.get("value").and_then(Value::as_str) == Some(marker) =>
490 {
491 return ToolConformanceCase {
492 mode,
493 ok: true,
494 classification: ToolProbeClassification::StructuredNativeToolCall,
495 fallback_mode: ToolProbeFallbackMode::Native,
496 failure_reason: None,
497 http_status,
498 elapsed_ms,
499 native_tool_call_count: native_count,
500 text_tool_call_count: 0,
501 parser_errors: Vec::new(),
502 protocol_violations: Vec::new(),
503 content_sample: content_sample(response),
504 };
505 }
506 Some(Value::Object(_)) => {}
507 _ => malformed_native = true,
508 }
509 }
510 }
511
512 let content = extract_content(response);
513 let tools = probe_tool_registry();
514 let tagged = crate::llm::tools::parse_text_tool_calls_with_tools(&content, Some(&tools));
520 let parsed = if probe_marker_present(&tagged.calls, marker) {
521 tagged
522 } else {
523 let fenced = crate::llm::tools::parse_fenced_json_tool_calls(&content);
524 if probe_marker_present(&fenced.calls, marker) {
525 fenced
526 } else {
527 tagged
528 }
529 };
530 let text_count = parsed.calls.len();
531 let text_pass = probe_marker_present(&parsed.calls, marker);
532 if text_pass {
533 return ToolConformanceCase {
534 mode,
535 ok: true,
536 classification: ToolProbeClassification::ParseableHarnTextToolCall,
537 fallback_mode: ToolProbeFallbackMode::Text,
538 failure_reason: None,
539 http_status,
540 elapsed_ms,
541 native_tool_call_count: native_count,
542 text_tool_call_count: text_count,
543 parser_errors: parsed.errors,
544 protocol_violations: parsed.violations,
545 content_sample: sample_content(&content),
546 };
547 }
548
549 let (classification, failure_reason) = if malformed_native || !parsed.errors.is_empty() {
550 (
551 ToolProbeClassification::MalformedJsonArguments,
552 Some(first_non_empty(
553 parsed.errors.first().cloned(),
554 "malformed_tool_arguments",
555 )),
556 )
557 } else if content.trim().is_empty() && native_count == 0 {
558 (
559 ToolProbeClassification::EmptySilent,
560 Some("empty_silent_response".to_string()),
561 )
562 } else if has_raw_model_tool_tag(&content) {
563 (
564 ToolProbeClassification::RawModelToolTag,
565 Some("raw_tool_tag_no_structured_calls".to_string()),
566 )
567 } else {
568 (
569 ToolProbeClassification::ProseOnlyNonTool,
570 Some("no_executable_tool_call".to_string()),
571 )
572 };
573
574 ToolConformanceCase {
575 mode,
576 ok: false,
577 classification,
578 fallback_mode: ToolProbeFallbackMode::Disabled,
579 failure_reason,
580 http_status,
581 elapsed_ms,
582 native_tool_call_count: native_count,
583 text_tool_call_count: text_count,
584 parser_errors: parsed.errors,
585 protocol_violations: parsed.violations,
586 content_sample: sample_content(&content),
587 }
588}
589
590fn chat_url(def: &ProviderDef, base_url: &str) -> Result<String, String> {
591 let endpoint = if def.chat_endpoint.trim().is_empty() {
592 "/v1/chat/completions"
593 } else {
594 def.chat_endpoint.as_str()
595 };
596 let url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
597 endpoint.to_string()
598 } else if endpoint.starts_with('/') {
599 format!("{}{}", base_url.trim_end_matches('/'), endpoint)
600 } else {
601 format!("{}/{}", base_url.trim_end_matches('/'), endpoint)
602 };
603 reqwest::Url::parse(&url)
604 .map(|_| url.clone())
605 .map_err(|error| format!("invalid provider chat URL '{url}': {error}"))
606}
607
608fn probe_request_body(provider: &str, model: &str, mode: ToolProbeMode, marker: &str) -> Value {
609 let prompt = format!(
610 "Call the {TOOL_PROBE_TOOL_NAME} tool exactly once with value {marker:?}. Do not answer in prose."
611 );
612 let tool = json!({
613 "type": "function",
614 "function": {
615 "name": TOOL_PROBE_TOOL_NAME,
616 "description": "Echo the probe marker exactly.",
617 "parameters": {
618 "type": "object",
619 "properties": {
620 "value": {
621 "type": "string",
622 "description": "The marker value to echo."
623 }
624 },
625 "required": ["value"],
626 "additionalProperties": false
627 }
628 }
629 });
630 let mut body = json!({
631 "model": model,
632 "messages": [{"role": "user", "content": prompt}],
633 "tools": [tool],
634 "stream": mode == ToolProbeMode::Streaming,
635 "temperature": 0,
636 });
637 if !crate::llm::provider::provider_uses_ollama_messages(provider, model) {
638 body["tool_choice"] = json!({
639 "type": "function",
640 "function": {"name": TOOL_PROBE_TOOL_NAME}
641 });
642 }
643 body
644}
645
646#[derive(Debug)]
647struct NativeToolCall {
648 name: String,
649 arguments: Option<Value>,
650}
651
652fn extract_native_tool_calls(response: &Value) -> Vec<NativeToolCall> {
653 let mut calls = Vec::new();
654 visit_native_tool_call_arrays(response, &mut calls);
655 calls
656}
657
658fn visit_native_tool_call_arrays(value: &Value, calls: &mut Vec<NativeToolCall>) {
659 match value {
660 Value::Object(map) => {
661 if let Some(tool_calls) = map.get("tool_calls").and_then(Value::as_array) {
662 for item in tool_calls {
663 if let Some(call) = parse_native_tool_call(item) {
664 calls.push(call);
665 }
666 }
667 }
668 for child in map.values() {
669 visit_native_tool_call_arrays(child, calls);
670 }
671 }
672 Value::Array(items) => {
673 for item in items {
674 visit_native_tool_call_arrays(item, calls);
675 }
676 }
677 _ => {}
678 }
679}
680
681fn parse_native_tool_call(item: &Value) -> Option<NativeToolCall> {
682 let obj = item.as_object()?;
683 let function = obj.get("function").and_then(Value::as_object);
684 let name = function
685 .and_then(|function| function.get("name"))
686 .or_else(|| obj.get("name"))
687 .and_then(Value::as_str)?
688 .to_string();
689 match crate::llm::tools::parse_text_tool_call_from_native_name(&name) {
690 crate::llm::tools::NativeToolNameTextCall::Parsed { name, arguments } => {
691 return Some(NativeToolCall {
692 name,
693 arguments: Some(arguments),
694 });
695 }
696 crate::llm::tools::NativeToolNameTextCall::Malformed { name, .. } => {
697 return Some(NativeToolCall {
698 name,
699 arguments: None,
700 });
701 }
702 crate::llm::tools::NativeToolNameTextCall::NotCall => {}
703 }
704 let raw_args = function
705 .and_then(|function| function.get("arguments"))
706 .or_else(|| obj.get("arguments"));
707 let arguments = match raw_args {
708 Some(Value::String(raw)) => serde_json::from_str::<Value>(raw).ok(),
709 Some(value @ Value::Object(_)) => Some(value.clone()),
710 Some(_) => None,
711 None => Some(json!({})),
712 };
713 Some(NativeToolCall { name, arguments })
714}
715
716fn extract_content(response: &Value) -> String {
717 let mut parts = Vec::new();
718 visit_content(response, &mut parts);
719 parts
720 .into_iter()
721 .filter(|part| !part.trim().is_empty())
722 .collect::<Vec<_>>()
723 .join("\n")
724}
725
726fn visit_content(value: &Value, parts: &mut Vec<String>) {
727 match value {
728 Value::Object(map) => {
729 for key in ["content", "response", "text"] {
730 if let Some(text) = map.get(key).and_then(Value::as_str) {
731 parts.push(text.to_string());
732 }
733 }
734 for child in map.values() {
735 visit_content(child, parts);
736 }
737 }
738 Value::Array(items) => {
739 for item in items {
740 visit_content(item, parts);
741 }
742 }
743 _ => {}
744 }
745}
746
747fn aggregate_stream_text(text: &str, _provider: &str) -> Value {
748 let mut content = String::new();
749 let mut calls: BTreeMap<String, PartialStreamCall> = BTreeMap::new();
750 let mut frames = Vec::new();
751 for raw_line in text.lines() {
752 let line = raw_line.trim();
753 if line.is_empty() {
754 continue;
755 }
756 let payload = line.strip_prefix("data:").map(str::trim).unwrap_or(line);
757 if payload == "[DONE]" {
758 continue;
759 }
760 let Ok(frame) = serde_json::from_str::<Value>(payload) else {
761 continue;
762 };
763 collect_stream_content_and_calls(&frame, &mut content, &mut calls);
764 frames.push(frame);
765 }
766 let tool_calls: Vec<Value> = calls
767 .into_values()
768 .map(|call| {
769 json!({
770 "id": call.id.unwrap_or_else(|| "stream_tool".to_string()),
771 "type": "function",
772 "function": {
773 "name": call.name.unwrap_or_default(),
774 "arguments": call.arguments,
775 }
776 })
777 })
778 .collect();
779 json!({
780 "content": content,
781 "tool_calls": tool_calls,
782 "frames": frames,
783 })
784}
785
/// A tool call being assembled from streamed fragments.
#[derive(Debug, Default)]
struct PartialStreamCall {
    /// Call id, once a fragment supplied one.
    id: Option<String>,
    /// Tool name, once a fragment supplied one.
    name: Option<String>,
    /// Argument text accumulated so far.
    arguments: String,
}
792
793fn collect_stream_content_and_calls(
794 frame: &Value,
795 content: &mut String,
796 calls: &mut BTreeMap<String, PartialStreamCall>,
797) {
798 if let Some(text) = frame
799 .pointer("/message/content")
800 .or_else(|| frame.pointer("/choices/0/delta/content"))
801 .or_else(|| frame.pointer("/choices/0/message/content"))
802 .or_else(|| frame.get("response"))
803 .and_then(Value::as_str)
804 {
805 content.push_str(text);
806 }
807 for item in frame
808 .pointer("/message/tool_calls")
809 .or_else(|| frame.pointer("/choices/0/delta/tool_calls"))
810 .or_else(|| frame.pointer("/choices/0/message/tool_calls"))
811 .and_then(Value::as_array)
812 .into_iter()
813 .flatten()
814 {
815 let key = item
816 .get("index")
817 .and_then(Value::as_u64)
818 .map(|index| index.to_string())
819 .or_else(|| item.get("id").and_then(Value::as_str).map(str::to_string))
820 .unwrap_or_else(|| calls.len().to_string());
821 let slot = calls.entry(key).or_default();
822 if let Some(id) = item.get("id").and_then(Value::as_str) {
823 slot.id = Some(id.to_string());
824 }
825 if let Some(name) = item
826 .pointer("/function/name")
827 .or_else(|| item.get("name"))
828 .and_then(Value::as_str)
829 {
830 slot.name = Some(name.to_string());
831 }
832 if let Some(arguments) = item
833 .pointer("/function/arguments")
834 .or_else(|| item.get("arguments"))
835 {
836 match arguments {
837 Value::String(delta) => slot.arguments.push_str(delta),
838 Value::Object(_) => slot.arguments = arguments.to_string(),
839 _ => {}
840 }
841 }
842 }
843}
844
845fn probe_tool_registry() -> VmValue {
846 let mut value_param = BTreeMap::new();
847 value_param.insert("type".to_string(), vm_str("string"));
848 value_param.insert(
849 "description".to_string(),
850 vm_str("The marker value to echo."),
851 );
852 let mut params = BTreeMap::new();
853 params.insert("value".to_string(), VmValue::dict(value_param));
854 let tool = vm_dict(&[
855 ("name", vm_str(TOOL_PROBE_TOOL_NAME)),
856 ("description", vm_str("Echo the probe marker exactly.")),
857 ("parameters", VmValue::dict(params)),
858 ]);
859 vm_dict(&[("tools", VmValue::List(std::sync::Arc::new(vec![tool])))])
860}
861
862fn vm_str(value: &str) -> VmValue {
863 VmValue::String(arcstr::ArcStr::from(value))
864}
865
866fn vm_dict(pairs: &[(&str, VmValue)]) -> VmValue {
867 let mut map = BTreeMap::new();
868 for (key, value) in pairs {
869 map.insert((*key).to_string(), value.clone());
870 }
871 VmValue::dict(map)
872}
873
/// Reports whether `content` contains, ignoring ASCII case, any of the raw
/// tool-call markers listed below.
fn has_raw_model_tool_tag(content: &str) -> bool {
    const RAW_TOOL_MARKERS: [&str; 6] = [
        "<tool_call",
        "<toolcall",
        "tool_code:",
        "tool_call:",
        "call:",
        "<function",
    ];
    let lowered = content.to_ascii_lowercase();
    RAW_TOOL_MARKERS
        .iter()
        .any(|needle| lowered.contains(*needle))
}
883
884fn content_sample(response: &Value) -> Option<String> {
885 sample_content(&extract_content(response))
886}
887
/// Returns the first 240 characters of the trimmed `content`, or `None` when
/// the trimmed content is empty.
fn sample_content(content: &str) -> Option<String> {
    match content.trim() {
        "" => None,
        trimmed => Some(trimmed.chars().take(240).collect()),
    }
}
896
/// Builds a failure message: `fallback` alone when `text` is blank,
/// otherwise `fallback` followed by the first 240 characters of the trimmed
/// text.
fn sample_failure(text: &str, fallback: &str) -> String {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return fallback.to_string();
    }
    let excerpt: String = trimmed.chars().take(240).collect();
    format!("{fallback}: {excerpt}")
}
908
/// Returns `value` unchanged when it holds non-blank text, otherwise
/// `fallback`.
fn first_non_empty(value: Option<String>, fallback: &str) -> String {
    match value {
        Some(text) if !text.trim().is_empty() => text,
        _ => fallback.to_string(),
    }
}
914
915fn elapsed_ms(clock: &dyn harn_clock::Clock, started_ms: i64) -> u64 {
916 clock.monotonic_ms().saturating_sub(started_ms).max(0) as u64
917}
918
#[cfg(test)]
mod tests {
    use super::*;

    /// Classifies a non-streaming fixture against the default marker.
    fn classify_fixture(provider: &str, model: &str, raw: &str) -> ToolConformanceReport {
        classify_tool_conformance_fixture(
            provider,
            model,
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            raw,
        )
    }

    /// Builds a synthetic case whose call counts match its classification.
    fn probe_case(
        mode: ToolProbeMode,
        ok: bool,
        classification: ToolProbeClassification,
    ) -> ToolConformanceCase {
        let native_tool_call_count = usize::from(matches!(
            classification,
            ToolProbeClassification::StructuredNativeToolCall
        ));
        let text_tool_call_count = usize::from(matches!(
            classification,
            ToolProbeClassification::ParseableHarnTextToolCall
        ));
        ToolConformanceCase {
            mode,
            ok,
            classification,
            fallback_mode: ToolProbeFallbackMode::Disabled,
            failure_reason: None,
            http_status: None,
            elapsed_ms: None,
            native_tool_call_count,
            text_tool_call_count,
            parser_errors: Vec::new(),
            protocol_violations: Vec::new(),
            content_sample: None,
        }
    }

    #[test]
    fn probe_resolves_catalog_key_to_provider_wire_model() {
        let resolved = llm_config::resolve_model_info("baseten-glm-5.2");
        let wire_id = resolved_probe_model_id(&resolved.id);
        assert_eq!(wire_id, "zai-org/GLM-5.2");
    }

    #[test]
    fn classify_openai_native_tool_call_as_pass() {
        let raw = r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker","arguments":"{\"value\":\"harn_tool_probe_marker\"}"}}]}}]}"#;
        let report = classify_fixture("local", "model", raw);
        assert_eq!(report.tool_calling.native, ToolProbeStatus::Pass);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Native
        );
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    #[test]
    fn classify_native_tool_call_with_text_call_in_name_as_pass() {
        let raw = r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker({ value: \"harn_tool_probe_marker\" })</arg_value>","arguments":"{}"}}]}}]}"#;
        let report = classify_fixture("zai", "glm-5", raw);

        assert_eq!(report.tool_calling.native, ToolProbeStatus::Pass);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Native
        );
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    #[test]
    fn classify_partial_text_call_in_native_name_as_malformed() {
        let raw = r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker({ value: <<EOF","arguments":"{"}}]}}]}"#;
        let report = classify_fixture("zai", "glm-5", raw);

        assert_eq!(report.tool_calling.native, ToolProbeStatus::Fail);
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::MalformedJsonArguments
        );
    }

    #[test]
    fn classify_gemma_raw_json_tool_call_content_as_text_fallback() {
        let raw = r#"{"message":{"content":"<tool_call>{\"name\":\"echo_marker\",\"arguments\":{\"value\":\"harn_tool_probe_marker\"}}</tool_call>"}}"#;
        let report = classify_fixture("ollama", "gemma4:26b", raw);
        assert_eq!(report.tool_calling.native, ToolProbeStatus::Fail);
        assert_eq!(report.tool_calling.text, ToolProbeStatus::Pass);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Text
        );
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::ParseableHarnTextToolCall
        );
    }

    #[test]
    fn classify_qwen_call_colon_marker_as_text_fallback() {
        let raw = r#"{"content":"call:echo_marker{ value: \"harn_tool_probe_marker\" }"}"#;
        let report = classify_fixture("llamacpp", "qwen", raw);
        assert_eq!(report.tool_calling.text, ToolProbeStatus::Pass);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Text
        );
    }

    #[test]
    fn classify_prose_only_as_disabled() {
        let raw = r#"{"message":{"content":"The comment has been added. I will now verify it."}}"#;
        let report = classify_fixture("ollama", "gemma4:26b", raw);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Disabled
        );
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::ProseOnlyNonTool
        );
        assert_eq!(
            report.cases[0].failure_reason.as_deref(),
            Some("no_executable_tool_call")
        );
    }

    #[test]
    fn aggregates_openai_streaming_tool_call_deltas() {
        // Two argument deltas for the same call index, then the end marker.
        let raw = concat!(
            "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"call_1\",\"function\":{\"name\":\"echo_marker\",\"arguments\":\"{\\\"value\\\":\"}}]}}]}\n",
            "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"function\":{\"arguments\":\"\\\"harn_tool_probe_marker\\\"}\"}}]}}]}\n",
            "data: [DONE]\n",
        );
        let response = aggregate_stream_text(raw, "local");
        let case = classify_tool_probe_response(
            ToolProbeMode::Streaming,
            &response,
            DEFAULT_TOOL_PROBE_MARKER,
            None,
            None,
        );
        assert!(case.ok, "{case:?}");
        assert_eq!(
            case.classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    #[test]
    fn report_satisfies_tool_probe_when_text_fallback_passes() {
        let raw = r#"{"content":"echo_marker({ value: \"harn_tool_probe_marker\" })"}"#;
        let report = classify_fixture("llamacpp", "qwen", raw);
        assert!(report_satisfies_required_probe(&report, "tool_probe"));
        assert!(!report_satisfies_required_probe(
            &report,
            "native_tool_probe"
        ));
    }

    #[test]
    fn summary_requires_every_repeated_native_case_to_pass() {
        let cases = [
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::StructuredNativeToolCall,
            ),
            probe_case(
                ToolProbeMode::NonStreaming,
                false,
                ToolProbeClassification::ProseOnlyNonTool,
            ),
        ];
        let summary = summarize_cases(&cases);
        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Disabled);
    }

    #[test]
    fn summary_requires_every_repeated_text_case_to_pass() {
        let cases = [
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::ParseableHarnTextToolCall,
            ),
            probe_case(
                ToolProbeMode::NonStreaming,
                false,
                ToolProbeClassification::MalformedJsonArguments,
            ),
        ];
        let summary = summarize_cases(&cases);
        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Fail);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Disabled);
    }

    #[test]
    fn summary_preserves_nonstreaming_text_fallback_when_streaming_fails() {
        let cases = [
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::ParseableHarnTextToolCall,
            ),
            probe_case(
                ToolProbeMode::Streaming,
                false,
                ToolProbeClassification::ProseOnlyNonTool,
            ),
        ];
        let summary = summarize_cases(&cases);
        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.streaming_native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Text);
    }
}