1use std::collections::BTreeMap;
9
10use serde::{Deserialize, Serialize};
11use serde_json::{json, Value};
12
13use crate::llm_config::{self, ProviderDef};
14use crate::value::VmValue;
15
/// Version number written into [`ToolConformanceReport::schema_version`].
pub const TOOL_CONFORMANCE_SCHEMA_VERSION: u32 = 1;
/// Name of the single function tool the probe advertises in its request and
/// looks for in the provider's response.
pub const TOOL_PROBE_TOOL_NAME: &str = "echo_marker";
/// Marker value used by [`ToolConformanceProbeOptions::new`].
pub const DEFAULT_TOOL_PROBE_MARKER: &str = "harn_tool_probe_marker";
19
/// Inputs for [`run_tool_conformance_probe`].
#[derive(Debug, Clone)]
pub struct ToolConformanceProbeOptions {
    /// Provider name. When blank, the provider resolved from `model` is used.
    pub provider: String,
    /// Model selector; resolved through `llm_config::resolve_model_info`.
    pub model: String,
    /// Optional base URL override. When absent, the provider's configured
    /// base URL is resolved instead.
    pub base_url: Option<String>,
    /// Modes to probe. An empty list means both modes; duplicates are dropped.
    pub modes: Vec<ToolProbeMode>,
    /// Value the model is asked to pass back through the probe tool.
    pub marker: String,
    /// Number of passes over `modes`; `0` is treated as `1`.
    pub repeat: usize,
    /// Per-request timeout, in seconds.
    pub timeout_secs: u64,
}
30
31impl ToolConformanceProbeOptions {
32 pub fn new(provider: impl Into<String>, model: impl Into<String>) -> Self {
33 Self {
34 provider: provider.into(),
35 model: model.into(),
36 base_url: None,
37 modes: vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming],
38 marker: DEFAULT_TOOL_PROBE_MARKER.to_string(),
39 repeat: 1,
40 timeout_secs: 120,
41 }
42 }
43}
44
/// How a probe request is sent; serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ToolProbeMode {
    /// Request sent with `"stream": false`; the body is parsed as one JSON document.
    NonStreaming,
    /// Request sent with `"stream": true`; the body is aggregated frame by frame.
    Streaming,
}
51
impl ToolProbeMode {
    /// Returns the `snake_case` label for this mode.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::NonStreaming => "non_streaming",
            Self::Streaming => "streaming",
        }
    }
}
60
/// Outcome category assigned to a single probe response; serialized in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ToolProbeClassification {
    /// A native `tool_calls` entry named the probe tool and carried the marker
    /// in an arguments object.
    StructuredNativeToolCall,
    /// The text parsers found a probe-tool call carrying the marker in the content.
    ParseableHarnTextToolCall,
    /// The content contains a raw tool-call tag but no executable call was recovered.
    RawModelToolTag,
    /// Non-empty output with no recognizable tool call.
    ProseOnlyNonTool,
    /// A native probe-tool call had arguments that were not a JSON object, or
    /// the text parser reported errors.
    MalformedJsonArguments,
    /// No content and no native tool calls.
    EmptySilent,
    /// The provider answered with a non-success HTTP status.
    HttpError,
    /// The request could not be built, sent, or its body could not be read.
    TransportError,
}
73
/// Aggregated verdict for one tool-calling channel; serialized in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ToolProbeStatus {
    /// The observed cases met the channel's pass criteria.
    Pass,
    /// Relevant cases were observed but did not meet the pass criteria.
    Fail,
    /// No relevant case was observed, so nothing can be concluded.
    Unknown,
}
81
impl ToolProbeStatus {
    /// Returns the `snake_case` label for this status.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Pass => "pass",
            Self::Fail => "fail",
            Self::Unknown => "unknown",
        }
    }
}
91
/// Tool-calling channel a probe result supports; serialized in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ToolProbeFallbackMode {
    /// Structured native tool calls were observed.
    Native,
    /// Only text-parsed tool calls were observed.
    Text,
    /// Neither channel produced the expected call.
    Disabled,
}
99
impl ToolProbeFallbackMode {
    /// Returns the `snake_case` label for this fallback mode.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Native => "native",
            Self::Text => "text",
            Self::Disabled => "disabled",
        }
    }
}
109
/// Full result of a conformance probe: the individual cases plus a summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolConformanceReport {
    /// Set to [`TOOL_CONFORMANCE_SCHEMA_VERSION`] when the report is built.
    pub schema_version: u32,
    /// Provider the probe was run (or classified) for.
    pub provider: String,
    /// Model identifier recorded for the probe.
    pub model: String,
    /// Base URL recorded for the probe; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub base_url: Option<String>,
    /// Set to [`TOOL_PROBE_TOOL_NAME`] when the report is built.
    pub tool_name: String,
    /// Marker value the model was expected to echo.
    pub marker: String,
    /// One entry per executed or classified probe response.
    pub cases: Vec<ToolConformanceCase>,
    /// Summary derived from `cases`.
    pub tool_calling: ToolCallingConformanceSummary,
}
122
/// Per-channel verdicts derived from all cases of a report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolCallingConformanceSummary {
    /// Verdict for native tool calls in non-streaming cases.
    pub native: ToolProbeStatus,
    /// Verdict for text-parsed tool calls across both modes.
    pub text: ToolProbeStatus,
    /// Verdict for native tool calls in streaming cases.
    pub streaming_native: ToolProbeStatus,
    /// Best channel supported by the verdicts above.
    pub fallback_mode: ToolProbeFallbackMode,
    /// First recorded case failure, present only when `fallback_mode` is
    /// `Disabled`; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub failure_reason: Option<String>,
}
132
/// Result of classifying one probe response.
///
/// Optional fields and empty lists are omitted from serialized output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolConformanceCase {
    /// Mode the response was produced in.
    pub mode: ToolProbeMode,
    /// `true` when the expected tool call (native or text) was found.
    pub ok: bool,
    /// Category assigned to the response.
    pub classification: ToolProbeClassification,
    /// Channel this single case supports.
    pub fallback_mode: ToolProbeFallbackMode,
    /// Short reason code or message for a failed case.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub failure_reason: Option<String>,
    /// HTTP status of the response, when one was received.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub http_status: Option<u16>,
    /// Elapsed time of the live request in milliseconds, when measured.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub elapsed_ms: Option<u64>,
    /// Number of native tool-call entries found in the response.
    pub native_tool_call_count: usize,
    /// Number of tool calls recovered by the text parser.
    pub text_tool_call_count: usize,
    /// Errors reported by the text parser.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub parser_errors: Vec<String>,
    /// Violations reported by the text parser.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub protocol_violations: Vec<String>,
    /// Trimmed prefix of the response content (at most 240 characters).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_sample: Option<String>,
}
154
155impl ToolConformanceCase {
156 fn transport_error(mode: ToolProbeMode, message: String, elapsed_ms: Option<u64>) -> Self {
157 Self {
158 mode,
159 ok: false,
160 classification: ToolProbeClassification::TransportError,
161 fallback_mode: ToolProbeFallbackMode::Disabled,
162 failure_reason: Some(message),
163 http_status: None,
164 elapsed_ms,
165 native_tool_call_count: 0,
166 text_tool_call_count: 0,
167 parser_errors: Vec::new(),
168 protocol_violations: Vec::new(),
169 content_sample: None,
170 }
171 }
172
173 fn http_error(
174 mode: ToolProbeMode,
175 status: u16,
176 message: String,
177 elapsed_ms: Option<u64>,
178 ) -> Self {
179 Self {
180 mode,
181 ok: false,
182 classification: ToolProbeClassification::HttpError,
183 fallback_mode: ToolProbeFallbackMode::Disabled,
184 failure_reason: Some(message),
185 http_status: Some(status),
186 elapsed_ms,
187 native_tool_call_count: 0,
188 text_tool_call_count: 0,
189 parser_errors: Vec::new(),
190 protocol_violations: Vec::new(),
191 content_sample: None,
192 }
193 }
194}
195
196pub async fn run_tool_conformance_probe(
197 options: ToolConformanceProbeOptions,
198) -> ToolConformanceReport {
199 let model = llm_config::resolve_model_info(&options.model);
200 let provider = if options.provider.trim().is_empty() {
201 model.provider.clone()
202 } else {
203 options.provider.clone()
204 };
205 let model_id = resolved_probe_model_id(&model.id);
206 let base_url = options.base_url.clone().or_else(|| {
207 llm_config::provider_config(&provider).map(|def| llm_config::resolve_base_url(&def))
208 });
209 let mut cases = Vec::new();
210 let modes = normalized_modes(&options.modes);
211 for _ in 0..options.repeat.max(1) {
212 for mode in &modes {
213 cases.push(
214 execute_live_probe_case(
215 &provider,
216 &model_id,
217 base_url.as_deref(),
218 *mode,
219 &options.marker,
220 options.timeout_secs,
221 )
222 .await,
223 );
224 }
225 }
226 report_from_cases(provider, model_id, base_url, options.marker, cases)
227}
228
/// Maps a model selector to the identifier sent to the provider, by
/// delegating to `llm_config::wire_model_id`.
fn resolved_probe_model_id(selector: &str) -> String {
    llm_config::wire_model_id(selector)
}
232
233pub fn classify_tool_conformance_fixture(
234 provider: impl Into<String>,
235 model: impl Into<String>,
236 mode: ToolProbeMode,
237 marker: impl Into<String>,
238 raw: &str,
239) -> ToolConformanceReport {
240 let marker = marker.into();
241 let response = serde_json::from_str::<Value>(raw).unwrap_or_else(|_| json!({ "content": raw }));
242 let case = classify_tool_probe_response(mode, &response, &marker, None, None);
243 report_from_cases(provider.into(), model.into(), None, marker, vec![case])
244}
245
246pub fn report_satisfies_required_probe(report: &ToolConformanceReport, requirement: &str) -> bool {
247 match requirement {
248 "tool_probe" | "tool_call_probe" => {
249 report.tool_calling.fallback_mode != ToolProbeFallbackMode::Disabled
250 && report.cases.iter().any(|case| case.ok)
251 }
252 "native_tool_probe" => report.tool_calling.native == ToolProbeStatus::Pass,
253 "streaming_tool_probe" => report.tool_calling.streaming_native == ToolProbeStatus::Pass,
254 _ => false,
255 }
256}
257
258fn normalized_modes(modes: &[ToolProbeMode]) -> Vec<ToolProbeMode> {
259 if modes.is_empty() {
260 return vec![ToolProbeMode::NonStreaming, ToolProbeMode::Streaming];
261 }
262 let mut out = Vec::new();
263 for mode in modes {
264 if !out.contains(mode) {
265 out.push(*mode);
266 }
267 }
268 out
269}
270
271fn report_from_cases(
272 provider: String,
273 model: String,
274 base_url: Option<String>,
275 marker: String,
276 cases: Vec<ToolConformanceCase>,
277) -> ToolConformanceReport {
278 let summary = summarize_cases(&cases);
279 ToolConformanceReport {
280 schema_version: TOOL_CONFORMANCE_SCHEMA_VERSION,
281 provider,
282 model,
283 base_url,
284 tool_name: TOOL_PROBE_TOOL_NAME.to_string(),
285 marker,
286 cases,
287 tool_calling: summary,
288 }
289}
290
291fn summarize_cases(cases: &[ToolConformanceCase]) -> ToolCallingConformanceSummary {
292 let native = summarize_native_mode(cases, ToolProbeMode::NonStreaming);
293 let streaming_native = summarize_native_mode(cases, ToolProbeMode::Streaming);
294 let text = summarize_text_mode(cases);
295
296 let fallback_mode =
297 if native == ToolProbeStatus::Pass || streaming_native == ToolProbeStatus::Pass {
298 ToolProbeFallbackMode::Native
299 } else if text == ToolProbeStatus::Pass {
300 ToolProbeFallbackMode::Text
301 } else {
302 ToolProbeFallbackMode::Disabled
303 };
304
305 let failure_reason = if fallback_mode == ToolProbeFallbackMode::Disabled {
306 cases.iter().find_map(|case| case.failure_reason.clone())
307 } else {
308 None
309 };
310
311 ToolCallingConformanceSummary {
312 native,
313 text,
314 streaming_native,
315 fallback_mode,
316 failure_reason,
317 }
318}
319
320fn summarize_native_mode(cases: &[ToolConformanceCase], mode: ToolProbeMode) -> ToolProbeStatus {
321 let mut saw_mode = false;
322 let mut all_passed = true;
323 for case in cases.iter().filter(|case| case.mode == mode) {
324 saw_mode = true;
325 if !(case.ok && case.classification == ToolProbeClassification::StructuredNativeToolCall) {
326 all_passed = false;
327 }
328 }
329 match (saw_mode, all_passed) {
330 (false, _) => ToolProbeStatus::Unknown,
331 (true, true) => ToolProbeStatus::Pass,
332 (true, false) => ToolProbeStatus::Fail,
333 }
334}
335
336fn summarize_text_mode(cases: &[ToolConformanceCase]) -> ToolProbeStatus {
337 let mut saw_text = false;
338 let mut saw_passing_mode = false;
339 for mode in [ToolProbeMode::NonStreaming, ToolProbeMode::Streaming] {
340 let mut saw_mode = false;
341 let mut saw_text_in_mode = false;
342 let mut all_mode_cases_passed = true;
343 for case in cases.iter().filter(|case| case.mode == mode) {
344 saw_mode = true;
345 saw_text_in_mode |= case.classification
346 == ToolProbeClassification::ParseableHarnTextToolCall
347 || case.text_tool_call_count > 0;
348 if !(case.ok
349 && case.classification == ToolProbeClassification::ParseableHarnTextToolCall)
350 {
351 all_mode_cases_passed = false;
352 }
353 }
354 saw_text |= saw_text_in_mode;
355 if saw_mode && saw_text_in_mode && all_mode_cases_passed {
356 saw_passing_mode = true;
357 }
358 }
359 if !saw_text {
360 return ToolProbeStatus::Unknown;
361 }
362 if saw_passing_mode {
363 ToolProbeStatus::Pass
364 } else {
365 ToolProbeStatus::Fail
366 }
367}
368
/// Sends one probe request to the provider and classifies the response.
///
/// Never returns an error: an unknown provider, an invalid URL, a send
/// failure or an unreadable body become `transport_error` cases, and a
/// non-success status becomes an `http_error` case. Elapsed time is measured
/// from function entry.
async fn execute_live_probe_case(
    provider: &str,
    model: &str,
    base_url: Option<&str>,
    mode: ToolProbeMode,
    marker: &str,
    timeout_secs: u64,
) -> ToolConformanceCase {
    let clock = harn_clock::RealClock::arc();
    let started_ms = clock.monotonic_ms();
    let Some(def) = llm_config::provider_config(provider) else {
        return ToolConformanceCase::transport_error(
            mode,
            format!("unknown provider: {provider}"),
            Some(elapsed_ms(&*clock, started_ms)),
        );
    };
    // A missing or blank override falls back to the provider's configured base URL.
    let base_url = base_url
        .filter(|value| !value.trim().is_empty())
        .map(str::to_string)
        .unwrap_or_else(|| llm_config::resolve_base_url(&def));
    let url = match chat_url(&def, &base_url) {
        Ok(url) => url,
        Err(message) => {
            return ToolConformanceCase::transport_error(
                mode,
                message,
                Some(elapsed_ms(&*clock, started_ms)),
            );
        }
    };
    let body = probe_request_body(provider, model, mode, marker);
    // Streaming and non-streaming requests use different shared HTTP clients.
    let client = if mode == ToolProbeMode::Streaming {
        crate::llm::shared_streaming_client().clone()
    } else {
        crate::llm::shared_blocking_client().clone()
    };
    // A missing API key is sent as an empty string rather than treated as an error.
    let api_key = crate::llm::helpers::resolve_api_key(provider).unwrap_or_default();
    let request = client
        .post(&url)
        .header("Content-Type", "application/json")
        .timeout(std::time::Duration::from_secs(timeout_secs))
        .json(&body);
    let mut request = crate::llm::api::apply_auth_headers(request, &api_key, Some(&def));
    for (name, value) in &def.extra_headers {
        request = request.header(name.as_str(), value.as_str());
    }

    let response = match request.send().await {
        Ok(response) => response,
        Err(error) => {
            return ToolConformanceCase::transport_error(
                mode,
                format!("provider request failed: {error}"),
                Some(elapsed_ms(&*clock, started_ms)),
            );
        }
    };
    let status = response.status();
    // The whole body is read before classification, including for streaming mode.
    let text = match response.text().await {
        Ok(text) => text,
        Err(error) => {
            return ToolConformanceCase::transport_error(
                mode,
                format!("provider response was unreadable: {error}"),
                Some(elapsed_ms(&*clock, started_ms)),
            );
        }
    };
    let elapsed = Some(elapsed_ms(&*clock, started_ms));
    if !status.is_success() {
        return ToolConformanceCase::http_error(
            mode,
            status.as_u16(),
            sample_failure(&text, "provider returned non-success HTTP status"),
            elapsed,
        );
    }
    // Streaming bodies are aggregated from frames; non-streaming bodies are
    // parsed as JSON, falling back to treating the raw text as content.
    let response_value = if mode == ToolProbeMode::Streaming {
        aggregate_stream_text(&text, provider)
    } else {
        serde_json::from_str::<Value>(&text).unwrap_or_else(|_| json!({ "content": text }))
    };
    classify_tool_probe_response(
        mode,
        &response_value,
        marker,
        Some(status.as_u16()),
        elapsed,
    )
}
460
461fn probe_marker_present(calls: &[Value], marker: &str) -> bool {
465 calls.iter().any(|call| {
466 call.get("name").and_then(Value::as_str) == Some(TOOL_PROBE_TOOL_NAME)
467 && call
468 .get("arguments")
469 .and_then(|args| args.get("value"))
470 .and_then(Value::as_str)
471 == Some(marker)
472 })
473}
474
/// Classifies one provider response into a [`ToolConformanceCase`].
///
/// Checks run in this order:
/// 1. A native tool call naming the probe tool with `arguments.value == marker`
///    passes immediately as a structured native call.
/// 2. Otherwise the textual content is parsed for tool calls; a parsed call
///    carrying the marker passes as a text tool call.
/// 3. Otherwise the case fails and is classified as malformed arguments,
///    empty/silent, raw tool tag, or prose only — checked in that order.
///
/// `http_status` and `elapsed_ms` are copied into the case unchanged.
fn classify_tool_probe_response(
    mode: ToolProbeMode,
    response: &Value,
    marker: &str,
    http_status: Option<u16>,
    elapsed_ms: Option<u64>,
) -> ToolConformanceCase {
    let native = extract_native_tool_calls(response);
    let native_count = native.len();
    let mut malformed_native = false;
    for call in &native {
        if call.name == TOOL_PROBE_TOOL_NAME {
            match &call.arguments {
                Some(Value::Object(map))
                    if map.get("value").and_then(Value::as_str) == Some(marker) =>
                {
                    return ToolConformanceCase {
                        mode,
                        ok: true,
                        classification: ToolProbeClassification::StructuredNativeToolCall,
                        fallback_mode: ToolProbeFallbackMode::Native,
                        failure_reason: None,
                        http_status,
                        elapsed_ms,
                        native_tool_call_count: native_count,
                        text_tool_call_count: 0,
                        parser_errors: Vec::new(),
                        protocol_violations: Vec::new(),
                        content_sample: content_sample(response),
                    };
                }
                // A well-formed arguments object without the expected marker is
                // not a pass, but it is not counted as malformed either.
                Some(Value::Object(_)) => {}
                // Missing/unparseable arguments, or arguments that are not an object.
                _ => malformed_native = true,
            }
        }
    }

    let content = extract_content(response);
    let tools = probe_tool_registry();
    let tagged = crate::llm::tools::parse_text_tool_calls_with_tools(&content, Some(&tools));
    // Prefer the tagged parse when it found the marker; otherwise try the
    // fenced-JSON parser, keeping the tagged result (and its diagnostics)
    // when neither parser found the marker.
    let parsed = if probe_marker_present(&tagged.calls, marker) {
        tagged
    } else {
        let fenced = crate::llm::tools::parse_fenced_json_tool_calls(&content);
        if probe_marker_present(&fenced.calls, marker) {
            fenced
        } else {
            tagged
        }
    };
    let text_count = parsed.calls.len();
    let text_pass = probe_marker_present(&parsed.calls, marker);
    if text_pass {
        return ToolConformanceCase {
            mode,
            ok: true,
            classification: ToolProbeClassification::ParseableHarnTextToolCall,
            fallback_mode: ToolProbeFallbackMode::Text,
            failure_reason: None,
            http_status,
            elapsed_ms,
            native_tool_call_count: native_count,
            text_tool_call_count: text_count,
            parser_errors: parsed.errors,
            protocol_violations: parsed.violations,
            content_sample: sample_content(&content),
        };
    }

    // Failure classification; the first matching condition wins.
    let (classification, failure_reason) = if malformed_native || !parsed.errors.is_empty() {
        (
            ToolProbeClassification::MalformedJsonArguments,
            // Use the first parser error when it is non-blank, else a fixed code.
            Some(first_non_empty(
                parsed.errors.first().cloned(),
                "malformed_tool_arguments",
            )),
        )
    } else if content.trim().is_empty() && native_count == 0 {
        (
            ToolProbeClassification::EmptySilent,
            Some("empty_silent_response".to_string()),
        )
    } else if has_raw_model_tool_tag(&content) {
        (
            ToolProbeClassification::RawModelToolTag,
            Some("raw_tool_tag_no_structured_calls".to_string()),
        )
    } else {
        (
            ToolProbeClassification::ProseOnlyNonTool,
            Some("no_executable_tool_call".to_string()),
        )
    };

    ToolConformanceCase {
        mode,
        ok: false,
        classification,
        fallback_mode: ToolProbeFallbackMode::Disabled,
        failure_reason,
        http_status,
        elapsed_ms,
        native_tool_call_count: native_count,
        text_tool_call_count: text_count,
        parser_errors: parsed.errors,
        protocol_violations: parsed.violations,
        content_sample: sample_content(&content),
    }
}
589
590fn chat_url(def: &ProviderDef, base_url: &str) -> Result<String, String> {
591 let endpoint = if def.chat_endpoint.trim().is_empty() {
592 "/v1/chat/completions"
593 } else {
594 def.chat_endpoint.as_str()
595 };
596 let url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
597 endpoint.to_string()
598 } else if endpoint.starts_with('/') {
599 format!("{}{}", base_url.trim_end_matches('/'), endpoint)
600 } else {
601 format!("{}/{}", base_url.trim_end_matches('/'), endpoint)
602 };
603 reqwest::Url::parse(&url)
604 .map(|_| url.clone())
605 .map_err(|error| format!("invalid provider chat URL '{url}': {error}"))
606}
607
608fn probe_request_body(provider: &str, model: &str, mode: ToolProbeMode, marker: &str) -> Value {
609 let prompt = format!(
610 "Call the {TOOL_PROBE_TOOL_NAME} tool exactly once with value {marker:?}. Do not answer in prose."
611 );
612 let tool = json!({
613 "type": "function",
614 "function": {
615 "name": TOOL_PROBE_TOOL_NAME,
616 "description": "Echo the probe marker exactly.",
617 "parameters": {
618 "type": "object",
619 "properties": {
620 "value": {
621 "type": "string",
622 "description": "The marker value to echo."
623 }
624 },
625 "required": ["value"],
626 "additionalProperties": false
627 }
628 }
629 });
630 let mut body = json!({
631 "model": model,
632 "messages": [{"role": "user", "content": prompt}],
633 "tools": [tool],
634 "stream": mode == ToolProbeMode::Streaming,
635 "temperature": 0,
636 });
637 if !crate::llm::provider::provider_uses_ollama_messages(provider, model) {
638 body["tool_choice"] = json!({
639 "type": "function",
640 "function": {"name": TOOL_PROBE_TOOL_NAME}
641 });
642 }
643 body
644}
645
/// A tool call taken from a `tool_calls` array in a provider response.
#[derive(Debug)]
struct NativeToolCall {
    /// Function name of the call.
    name: String,
    /// Decoded arguments; `None` when they were present but could not be
    /// decoded (invalid JSON string, or neither a string nor an object).
    arguments: Option<Value>,
}
651
652fn extract_native_tool_calls(response: &Value) -> Vec<NativeToolCall> {
653 let mut calls = Vec::new();
654 visit_native_tool_call_arrays(response, &mut calls);
655 calls
656}
657
658fn visit_native_tool_call_arrays(value: &Value, calls: &mut Vec<NativeToolCall>) {
659 match value {
660 Value::Object(map) => {
661 if let Some(tool_calls) = map.get("tool_calls").and_then(Value::as_array) {
662 for item in tool_calls {
663 if let Some(call) = parse_native_tool_call(item) {
664 calls.push(call);
665 }
666 }
667 }
668 for child in map.values() {
669 visit_native_tool_call_arrays(child, calls);
670 }
671 }
672 Value::Array(items) => {
673 for item in items {
674 visit_native_tool_call_arrays(item, calls);
675 }
676 }
677 _ => {}
678 }
679}
680
681fn parse_native_tool_call(item: &Value) -> Option<NativeToolCall> {
682 let obj = item.as_object()?;
683 let function = obj.get("function").and_then(Value::as_object);
684 let name = function
685 .and_then(|function| function.get("name"))
686 .or_else(|| obj.get("name"))
687 .and_then(Value::as_str)?
688 .to_string();
689 let raw_args = function
690 .and_then(|function| function.get("arguments"))
691 .or_else(|| obj.get("arguments"));
692 let arguments = match raw_args {
693 Some(Value::String(raw)) => serde_json::from_str::<Value>(raw).ok(),
694 Some(value @ Value::Object(_)) => Some(value.clone()),
695 Some(_) => None,
696 None => Some(json!({})),
697 };
698 Some(NativeToolCall { name, arguments })
699}
700
701fn extract_content(response: &Value) -> String {
702 let mut parts = Vec::new();
703 visit_content(response, &mut parts);
704 parts
705 .into_iter()
706 .filter(|part| !part.trim().is_empty())
707 .collect::<Vec<_>>()
708 .join("\n")
709}
710
711fn visit_content(value: &Value, parts: &mut Vec<String>) {
712 match value {
713 Value::Object(map) => {
714 for key in ["content", "response", "text"] {
715 if let Some(text) = map.get(key).and_then(Value::as_str) {
716 parts.push(text.to_string());
717 }
718 }
719 for child in map.values() {
720 visit_content(child, parts);
721 }
722 }
723 Value::Array(items) => {
724 for item in items {
725 visit_content(item, parts);
726 }
727 }
728 _ => {}
729 }
730}
731
732fn aggregate_stream_text(text: &str, _provider: &str) -> Value {
733 let mut content = String::new();
734 let mut calls: BTreeMap<String, PartialStreamCall> = BTreeMap::new();
735 let mut frames = Vec::new();
736 for raw_line in text.lines() {
737 let line = raw_line.trim();
738 if line.is_empty() {
739 continue;
740 }
741 let payload = line.strip_prefix("data:").map(str::trim).unwrap_or(line);
742 if payload == "[DONE]" {
743 continue;
744 }
745 let Ok(frame) = serde_json::from_str::<Value>(payload) else {
746 continue;
747 };
748 collect_stream_content_and_calls(&frame, &mut content, &mut calls);
749 frames.push(frame);
750 }
751 let tool_calls: Vec<Value> = calls
752 .into_values()
753 .map(|call| {
754 json!({
755 "id": call.id.unwrap_or_else(|| "stream_tool".to_string()),
756 "type": "function",
757 "function": {
758 "name": call.name.unwrap_or_default(),
759 "arguments": call.arguments,
760 }
761 })
762 })
763 .collect();
764 json!({
765 "content": content,
766 "tool_calls": tool_calls,
767 "frames": frames,
768 })
769}
770
/// Accumulator for one tool call whose pieces arrive across stream frames.
#[derive(Debug, Default)]
struct PartialStreamCall {
    /// Most recent call id seen, if any.
    id: Option<String>,
    /// Most recent function name seen, if any.
    name: Option<String>,
    /// Argument text: string deltas concatenated in arrival order, or the
    /// serialized form of an arguments object.
    arguments: String,
}
777
/// Merges one stream frame into the running `content` and `calls` state.
///
/// Content is taken from the first of `/message/content`,
/// `/choices/0/delta/content`, `/choices/0/message/content` or `response`
/// that exists, and appended when it is a string. Tool-call items are taken
/// from the first of `/message/tool_calls`, `/choices/0/delta/tool_calls` or
/// `/choices/0/message/tool_calls` that exists.
fn collect_stream_content_and_calls(
    frame: &Value,
    content: &mut String,
    calls: &mut BTreeMap<String, PartialStreamCall>,
) {
    if let Some(text) = frame
        .pointer("/message/content")
        .or_else(|| frame.pointer("/choices/0/delta/content"))
        .or_else(|| frame.pointer("/choices/0/message/content"))
        .or_else(|| frame.get("response"))
        .and_then(Value::as_str)
    {
        content.push_str(text);
    }
    for item in frame
        .pointer("/message/tool_calls")
        .or_else(|| frame.pointer("/choices/0/delta/tool_calls"))
        .or_else(|| frame.pointer("/choices/0/message/tool_calls"))
        .and_then(Value::as_array)
        .into_iter()
        .flatten()
    {
        // Slot key: the item's numeric `index`, else its `id`, else the number
        // of slots seen so far. Keys are strings, so the map orders them
        // lexicographically rather than numerically.
        let key = item
            .get("index")
            .and_then(Value::as_u64)
            .map(|index| index.to_string())
            .or_else(|| item.get("id").and_then(Value::as_str).map(str::to_string))
            .unwrap_or_else(|| calls.len().to_string());
        let slot = calls.entry(key).or_default();
        if let Some(id) = item.get("id").and_then(Value::as_str) {
            slot.id = Some(id.to_string());
        }
        if let Some(name) = item
            .pointer("/function/name")
            .or_else(|| item.get("name"))
            .and_then(Value::as_str)
        {
            slot.name = Some(name.to_string());
        }
        if let Some(arguments) = item
            .pointer("/function/arguments")
            .or_else(|| item.get("arguments"))
        {
            match arguments {
                // String deltas are fragments of one JSON document: append.
                Value::String(delta) => slot.arguments.push_str(delta),
                // A complete arguments object replaces whatever was accumulated.
                Value::Object(_) => slot.arguments = arguments.to_string(),
                _ => {}
            }
        }
    }
}
829
830fn probe_tool_registry() -> VmValue {
831 let mut value_param = BTreeMap::new();
832 value_param.insert("type".to_string(), vm_str("string"));
833 value_param.insert(
834 "description".to_string(),
835 vm_str("The marker value to echo."),
836 );
837 let mut params = BTreeMap::new();
838 params.insert("value".to_string(), VmValue::dict(value_param));
839 let tool = vm_dict(&[
840 ("name", vm_str(TOOL_PROBE_TOOL_NAME)),
841 ("description", vm_str("Echo the probe marker exactly.")),
842 ("parameters", VmValue::dict(params)),
843 ]);
844 vm_dict(&[("tools", VmValue::List(std::sync::Arc::new(vec![tool])))])
845}
846
/// Wraps `value` in a `VmValue::String`.
fn vm_str(value: &str) -> VmValue {
    VmValue::String(arcstr::ArcStr::from(value))
}
850
851fn vm_dict(pairs: &[(&str, VmValue)]) -> VmValue {
852 let mut map = BTreeMap::new();
853 for (key, value) in pairs {
854 map.insert((*key).to_string(), value.clone());
855 }
856 VmValue::dict(map)
857}
858
/// Returns `true` when `content` contains, ignoring ASCII case, any of the
/// raw tool-call markers listed below.
fn has_raw_model_tool_tag(content: &str) -> bool {
    const RAW_TOOL_MARKERS: [&str; 6] = [
        "<tool_call",
        "<toolcall",
        "tool_code:",
        "tool_call:",
        "call:",
        "<function",
    ];
    let lowered = content.to_ascii_lowercase();
    RAW_TOOL_MARKERS
        .iter()
        .any(|needle| lowered.contains(*needle))
}
868
/// Extracts the textual content of `response` and returns its sample, or
/// `None` when the content is blank.
fn content_sample(response: &Value) -> Option<String> {
    sample_content(&extract_content(response))
}
872
/// Returns the first 240 characters of the trimmed `content`, or `None` when
/// nothing remains after trimming.
fn sample_content(content: &str) -> Option<String> {
    match content.trim() {
        "" => None,
        trimmed => Some(trimmed.chars().take(240).collect()),
    }
}
881
/// Formats a failure message: `fallback` alone when `text` is blank,
/// otherwise `fallback`, a colon and space, then the first 240 characters of
/// the trimmed `text`.
fn sample_failure(text: &str, fallback: &str) -> String {
    let excerpt: String = text.trim().chars().take(240).collect();
    if excerpt.is_empty() {
        fallback.to_string()
    } else {
        format!("{fallback}: {excerpt}")
    }
}
893
/// Returns `value` unchanged when it holds non-whitespace text, otherwise an
/// owned copy of `fallback`.
fn first_non_empty(value: Option<String>, fallback: &str) -> String {
    match value {
        Some(candidate) if !candidate.trim().is_empty() => candidate,
        _ => fallback.to_string(),
    }
}
899
900fn elapsed_ms(clock: &dyn harn_clock::Clock, started_ms: i64) -> u64 {
901 clock.monotonic_ms().saturating_sub(started_ms).max(0) as u64
902}
903
#[cfg(test)]
mod tests {
    use super::*;

    /// A catalog key resolves to the provider's wire model id.
    #[test]
    fn probe_resolves_catalog_key_to_provider_wire_model() {
        let resolved = llm_config::resolve_model_info("baseten-glm-5.2");
        assert_eq!(resolved_probe_model_id(&resolved.id), "zai-org/GLM-5.2");
    }

    /// An OpenAI-shaped native tool call carrying the marker passes natively.
    #[test]
    fn classify_openai_native_tool_call_as_pass() {
        let report = classify_tool_conformance_fixture(
            "local",
            "model",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            r#"{"choices":[{"message":{"tool_calls":[{"id":"call_1","type":"function","function":{"name":"echo_marker","arguments":"{\"value\":\"harn_tool_probe_marker\"}"}}]}}]}"#,
        );
        assert_eq!(report.tool_calling.native, ToolProbeStatus::Pass);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Native
        );
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    /// A `<tool_call>` JSON block inside message content passes via the text channel.
    #[test]
    fn classify_gemma_raw_json_tool_call_content_as_text_fallback() {
        let report = classify_tool_conformance_fixture(
            "ollama",
            "gemma4:26b",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            r#"{"message":{"content":"<tool_call>{\"name\":\"echo_marker\",\"arguments\":{\"value\":\"harn_tool_probe_marker\"}}</tool_call>"}}"#,
        );
        assert_eq!(report.tool_calling.native, ToolProbeStatus::Fail);
        assert_eq!(report.tool_calling.text, ToolProbeStatus::Pass);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Text
        );
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::ParseableHarnTextToolCall
        );
    }

    /// A `call:name{...}` form in content passes via the text channel.
    #[test]
    fn classify_qwen_call_colon_marker_as_text_fallback() {
        let report = classify_tool_conformance_fixture(
            "llamacpp",
            "qwen",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            r#"{"content":"call:echo_marker{ value: \"harn_tool_probe_marker\" }"}"#,
        );
        assert_eq!(report.tool_calling.text, ToolProbeStatus::Pass);
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Text
        );
    }

    /// Plain prose with no tool call disables tool calling with a reason code.
    #[test]
    fn classify_prose_only_as_disabled() {
        let report = classify_tool_conformance_fixture(
            "ollama",
            "gemma4:26b",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            r#"{"message":{"content":"The comment has been added. I will now verify it."}}"#,
        );
        assert_eq!(
            report.tool_calling.fallback_mode,
            ToolProbeFallbackMode::Disabled
        );
        assert_eq!(
            report.cases[0].classification,
            ToolProbeClassification::ProseOnlyNonTool
        );
        assert_eq!(
            report.cases[0].failure_reason.as_deref(),
            Some("no_executable_tool_call")
        );
    }

    /// Argument fragments split across two streamed deltas are joined into
    /// one native tool call.
    #[test]
    fn aggregates_openai_streaming_tool_call_deltas() {
        let raw = "data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"call_1\",\"function\":{\"name\":\"echo_marker\",\"arguments\":\"{\\\"value\\\":\"}}]}}]}\n\
                   data: {\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"function\":{\"arguments\":\"\\\"harn_tool_probe_marker\\\"}\"}}]}}]}\n\
                   data: [DONE]\n";
        let response = aggregate_stream_text(raw, "local");
        let case = classify_tool_probe_response(
            ToolProbeMode::Streaming,
            &response,
            DEFAULT_TOOL_PROBE_MARKER,
            None,
            None,
        );
        assert!(case.ok, "{case:?}");
        assert_eq!(
            case.classification,
            ToolProbeClassification::StructuredNativeToolCall
        );
    }

    /// A text-only pass satisfies `tool_probe` but not `native_tool_probe`.
    #[test]
    fn report_satisfies_tool_probe_when_text_fallback_passes() {
        let report = classify_tool_conformance_fixture(
            "llamacpp",
            "qwen",
            ToolProbeMode::NonStreaming,
            DEFAULT_TOOL_PROBE_MARKER,
            r#"{"content":"echo_marker({ value: \"harn_tool_probe_marker\" })"}"#,
        );
        assert!(report_satisfies_required_probe(&report, "tool_probe"));
        assert!(!report_satisfies_required_probe(
            &report,
            "native_tool_probe"
        ));
    }

    /// One failing repeat in a mode is enough to fail the native verdict.
    #[test]
    fn summary_requires_every_repeated_native_case_to_pass() {
        let summary = summarize_cases(&[
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::StructuredNativeToolCall,
            ),
            probe_case(
                ToolProbeMode::NonStreaming,
                false,
                ToolProbeClassification::ProseOnlyNonTool,
            ),
        ]);
        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Disabled);
    }

    /// One failing repeat in a mode is enough to fail the text verdict.
    #[test]
    fn summary_requires_every_repeated_text_case_to_pass() {
        let summary = summarize_cases(&[
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::ParseableHarnTextToolCall,
            ),
            probe_case(
                ToolProbeMode::NonStreaming,
                false,
                ToolProbeClassification::MalformedJsonArguments,
            ),
        ]);
        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Fail);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Disabled);
    }

    /// A text pass in non-streaming mode survives a failure in streaming mode.
    #[test]
    fn summary_preserves_nonstreaming_text_fallback_when_streaming_fails() {
        let summary = summarize_cases(&[
            probe_case(
                ToolProbeMode::NonStreaming,
                true,
                ToolProbeClassification::ParseableHarnTextToolCall,
            ),
            probe_case(
                ToolProbeMode::Streaming,
                false,
                ToolProbeClassification::ProseOnlyNonTool,
            ),
        ]);
        assert_eq!(summary.native, ToolProbeStatus::Fail);
        assert_eq!(summary.streaming_native, ToolProbeStatus::Fail);
        assert_eq!(summary.text, ToolProbeStatus::Pass);
        assert_eq!(summary.fallback_mode, ToolProbeFallbackMode::Text);
    }

    /// Builds a minimal case for summary tests; the native/text call counts
    /// are 1 when the classification is the matching passing variant, else 0.
    fn probe_case(
        mode: ToolProbeMode,
        ok: bool,
        classification: ToolProbeClassification,
    ) -> ToolConformanceCase {
        let native_tool_call_count =
            usize::from(classification == ToolProbeClassification::StructuredNativeToolCall);
        let text_tool_call_count =
            usize::from(classification == ToolProbeClassification::ParseableHarnTextToolCall);
        ToolConformanceCase {
            mode,
            ok,
            classification,
            fallback_mode: ToolProbeFallbackMode::Disabled,
            failure_reason: None,
            http_status: None,
            elapsed_ms: None,
            native_tool_call_count,
            text_tool_call_count,
            parser_errors: Vec::new(),
            protocol_violations: Vec::new(),
            content_sample: None,
        }
    }
}