sparrow/provider/
tool_markup.rs1use regex::Regex;
23use serde_json::{Map, Value};
24
/// Marker `｜｜DSML｜｜`, spelled with U+FF5C FULLWIDTH VERTICAL LINE, that
/// `strip_dsml` deletes from text before any tag matching takes place.
const DSML_TOKEN: &str = "\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}";
28
/// One tool invocation recovered from textual markup.
#[derive(Debug, Clone, PartialEq)]
pub struct ParsedToolCall {
    /// Name of the tool to invoke.
    pub name: String,
    /// Arguments of the call. The `<invoke>` parser always builds a JSON object;
    /// the JSON-based parsers pass through whatever value the markup carried
    /// (after decoding string-encoded JSON).
    pub args: Value,
}
34
35pub fn looks_like_tool_markup(text: &str) -> bool {
44 let stripped = strip_dsml(text);
45 let invoke_closed = (stripped.contains("<invoke ") || stripped.contains("<invoke\t"))
47 && stripped.contains("name=")
48 && stripped.contains("</invoke>");
49 let trimmed = stripped.trim();
50 let fenced_json = trimmed.starts_with("```json")
51 && trimmed.ends_with("```")
52 && (stripped.contains("\"arguments\"") || stripped.contains("\"args\""));
53 let bracketed_tool = stripped.contains("[TOOL_CALL]") && stripped.contains("[/TOOL_CALL]");
54 let deepseek_tool =
55 stripped.contains("<|tool▁call▁begin|>") && stripped.contains("<|tool▁call▁end|>");
56 invoke_closed || fenced_json || bracketed_tool || deepseek_tool
57}
58
59pub fn could_be_tool_markup_prefix(text: &str) -> bool {
64 let stripped = strip_dsml(text);
65 let trimmed = stripped.trim_start();
66 if trimmed.is_empty() {
67 return true;
68 }
69 const STARTS: &[&str] = &[
70 "<invoke",
71 "[TOOL_CALL]",
72 "```json",
73 "<|tool▁call",
74 "<|tool▁calls",
75 ];
76 STARTS
77 .iter()
78 .any(|start| start.starts_with(trimmed) || trimmed.starts_with(start))
79}
80
81fn strip_dsml(text: &str) -> String {
82 text.replace(DSML_TOKEN, "")
86}
87
88fn coerce(raw: &str, declared_string: bool) -> Value {
96 if declared_string {
97 return Value::String(raw.to_string());
98 }
99 let t = raw.trim();
100 if t == "true" {
101 return Value::Bool(true);
102 }
103 if t == "false" {
104 return Value::Bool(false);
105 }
106 if let Ok(i) = t.parse::<i64>() {
107 return Value::from(i);
108 }
109 if let Ok(f) = t.parse::<f64>() {
110 if t.contains('.') {
112 return Value::from(f);
113 }
114 }
115 Value::String(raw.to_string())
117}
118
119pub fn extract_tool_calls(text: &str) -> Vec<ParsedToolCall> {
125 for parser in [
126 parse_invoke_tool_calls as fn(&str) -> Vec<ParsedToolCall>,
127 parse_tool_call_blocks,
128 parse_json_fences,
129 parse_deepseek_tool_calls,
130 ] {
131 let calls = parser(text);
132 if !calls.is_empty() {
133 return calls;
134 }
135 }
136 Vec::new()
137}
138
139fn parse_invoke_tool_calls(text: &str) -> Vec<ParsedToolCall> {
140 let cleaned = strip_dsml(text);
141 let invoke_re = Regex::new(r#"(?s)<invoke\s+name="([^"]+)"\s*>(.*?)</invoke>"#)
142 .expect("static invoke regex");
143 let param_re = Regex::new(r#"(?s)<parameter\s+name="([^"]+)"([^>]*)>(.*?)</parameter>"#)
145 .expect("static parameter regex");
146
147 let mut calls = Vec::new();
148 for inv in invoke_re.captures_iter(&cleaned) {
149 let name = inv[1].trim().to_string();
150 let body = &inv[2];
151 let mut args = Map::new();
152 for p in param_re.captures_iter(body) {
153 let pname = p[1].trim().to_string();
154 let declared_string = p[2].contains("string=\"true\"");
155 let pval = coerce(&p[3], declared_string);
156 args.insert(pname, pval);
157 }
158 if !name.is_empty() {
159 calls.push(ParsedToolCall {
160 name,
161 args: Value::Object(args),
162 });
163 }
164 }
165 calls
166}
167
168fn parse_tool_call_blocks(text: &str) -> Vec<ParsedToolCall> {
169 let block_re = Regex::new(r#"(?s)\[TOOL_CALL\](.*?)\[/TOOL_CALL\]"#)
170 .expect("static tool-call block regex");
171 block_re
172 .captures_iter(text)
173 .filter_map(|cap| serde_json::from_str::<Value>(cap[1].trim()).ok())
174 .filter_map(call_from_json)
175 .collect()
176}
177
178fn parse_json_fences(text: &str) -> Vec<ParsedToolCall> {
179 let trimmed = text.trim();
180 if !trimmed.starts_with("```json") || !trimmed.ends_with("```") {
181 return Vec::new();
182 }
183 let fence_re = Regex::new(r#"(?s)```json\s*(.*?)\s*```"#).expect("static json fence regex");
184 fence_re
185 .captures_iter(text)
186 .filter_map(|cap| serde_json::from_str::<Value>(cap[1].trim()).ok())
187 .flat_map(calls_from_json_value)
188 .collect()
189}
190
191fn parse_deepseek_tool_calls(text: &str) -> Vec<ParsedToolCall> {
192 let call_re = Regex::new(r#"(?s)<|tool▁call▁begin|>(.*?)<|tool▁call▁end|>"#)
193 .expect("static deepseek tool-call regex");
194 call_re
195 .captures_iter(text)
196 .filter_map(|cap| {
197 let body = cap[1].trim();
198 let (maybe_name, json_text) = match body.split_once("<|tool▁sep|>") {
199 Some((name, json)) => (Some(name.trim()), json.trim()),
200 None => (None, body),
201 };
202 let value = serde_json::from_str::<Value>(json_text).ok()?;
203 if let Some(call) = call_from_json(value.clone()) {
204 return Some(call);
205 }
206 let name = maybe_name?.trim();
207 if name.is_empty() || name == "function" {
208 return None;
209 }
210 Some(ParsedToolCall {
211 name: name.to_string(),
212 args: value,
213 })
214 })
215 .collect()
216}
217
218fn calls_from_json_value(value: Value) -> Vec<ParsedToolCall> {
219 match value {
220 Value::Array(items) => items.into_iter().filter_map(call_from_json).collect(),
221 other => call_from_json(other).into_iter().collect(),
222 }
223}
224
225fn call_from_json(value: Value) -> Option<ParsedToolCall> {
226 let obj = value.as_object()?;
227
228 if let Some(function) = obj.get("function").and_then(Value::as_object) {
229 let name = function.get("name").and_then(Value::as_str)?;
230 let args = function
231 .get("arguments")
232 .cloned()
233 .or_else(|| obj.get("arguments").cloned())
234 .unwrap_or_else(|| Value::Object(Map::new()));
235 return Some(ParsedToolCall {
236 name: name.to_string(),
237 args: normalize_args(args),
238 });
239 }
240
241 let name = obj
242 .get("name")
243 .or_else(|| obj.get("tool"))
244 .or_else(|| obj.get("tool_name"))
245 .and_then(Value::as_str)?;
246 let args = obj
247 .get("arguments")
248 .or_else(|| obj.get("args"))
249 .or_else(|| obj.get("input"))
250 .cloned()
251 .unwrap_or_else(|| Value::Object(Map::new()));
252 Some(ParsedToolCall {
253 name: name.to_string(),
254 args: normalize_args(args),
255 })
256}
257
258fn normalize_args(args: Value) -> Value {
259 match args {
260 Value::String(s) => serde_json::from_str::<Value>(&s).unwrap_or(Value::String(s)),
261 other => other,
262 }
263}
264
// Unit tests for markup detection, streaming-prefix detection and extraction.
#[cfg(test)]
mod tests {
    use super::*;

    // A DSML-wrapped `tool_calls` block with one `read` invoke whose single
    // parameter is declared `string="true"`.
    const SAMPLE: &str = "<\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}tool_calls>\n<\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}invoke name=\"read\">\n<\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}parameter name=\"file_path\" string=\"true\">config.py</\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}parameter>\n</\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}invoke>\n</\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}tool_calls>";

    // DSML markup is detected; ordinary prose is not.
    #[test]
    fn detects_dsml_markup() {
        assert!(looks_like_tool_markup(SAMPLE));
        assert!(!looks_like_tool_markup(
            "just a normal answer about config.py"
        ));
    }

    // The DSML sample yields exactly one call with its string parameter.
    #[test]
    fn parses_dsml_single_tool() {
        let calls = extract_tool_calls(SAMPLE);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "read");
        assert_eq!(calls[0].args["file_path"], "config.py");
    }

    // Plain `<invoke>` markup without DSML markers is parsed as well.
    #[test]
    fn parses_anthropic_style_without_dsml() {
        let text = r#"<invoke name="fs_write">
<parameter name="path">reverse.py</parameter>
<parameter name="content">def f(): pass</parameter>
</invoke>"#;
        let calls = extract_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "fs_write");
        assert_eq!(calls[0].args["path"], "reverse.py");
        assert_eq!(calls[0].args["content"], "def f(): pass");
    }

    // Several invokes are returned in order; undeclared values are coerced
    // ("1" becomes a number, "two" stays a string).
    #[test]
    fn parses_multiple_invokes() {
        let text = r#"<invoke name="a"><parameter name="x">1</parameter></invoke>
<invoke name="b"><parameter name="y">two</parameter></invoke>"#;
        let calls = extract_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].name, "a");
        assert_eq!(calls[0].args["x"], 1);
        assert_eq!(calls[1].name, "b");
        assert_eq!(calls[1].args["y"], "two");
    }

    // Text without any markup produces no calls.
    #[test]
    fn ignores_plain_text() {
        assert!(extract_tool_calls("no tools here, just prose").is_empty());
    }

    // A message that consists solely of a JSON fence is a tool call.
    #[test]
    fn i4_parses_json_tool_call_fence() {
        let text = r#"```json
{"name":"fs_write","arguments":{"path":"poeme.txt","content":"salut"}}
```"#;
        assert!(looks_like_tool_markup(text));
        let calls = extract_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "fs_write");
        assert_eq!(calls[0].args["path"], "poeme.txt");
        assert_eq!(calls[0].args["content"], "salut");
    }

    // A JSON fence surrounded by prose is an example, not a call.
    #[test]
    fn i4_does_not_parse_embedded_json_example_as_tool_call() {
        let text = r#"Here is the format:
```json
{"name":"read","arguments":{"file_path":"config.py"}}
```
Use it carefully."#;
        assert!(!looks_like_tool_markup(text));
        assert!(extract_tool_calls(text).is_empty());
    }

    // `[TOOL_CALL]…[/TOOL_CALL]` with a JSON body is detected and parsed.
    #[test]
    fn i4_parses_bracketed_tool_call() {
        let text =
            r#"[TOOL_CALL]{"name":"read","arguments":{"file_path":"config.py"}}[/TOOL_CALL]"#;
        assert!(looks_like_tool_markup(text));
        let calls = extract_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "read");
        assert_eq!(calls[0].args["file_path"], "config.py");
    }

    // DeepSeek markers wrapping a self-describing JSON call object.
    #[test]
    fn i4_parses_deepseek_native_tool_call_json() {
        let text = r#"<|tool▁calls▁begin|><|tool▁call▁begin|>{"name":"read","arguments":{"file_path":"src/main.rs"}}<|tool▁call▁end|><|tool▁calls▁end|>"#;
        assert!(looks_like_tool_markup(text));
        let calls = extract_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "read");
        assert_eq!(calls[0].args["file_path"], "src/main.rs");
    }

    // DeepSeek `NAME<|tool▁sep|>JSON` form: the name comes from before the
    // separator and the JSON object is used as the arguments.
    #[test]
    fn i4_parses_deepseek_native_tool_call_with_separator() {
        let text = r#"<|tool▁call▁begin|>fs_write<|tool▁sep|>{"path":"a.txt","content":"ok"}<|tool▁call▁end|>"#;
        let calls = extract_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "fs_write");
        assert_eq!(calls[0].args["path"], "a.txt");
    }

    // Nested `function` layout whose `arguments` is a JSON-encoded string; the
    // string is decoded into an object.
    #[test]
    fn i4_parses_openai_function_shape_with_string_arguments() {
        let text = r#"```json
{"function":{"name":"read","arguments":"{\"file_path\":\"Cargo.toml\"}"}}
```"#;
        let calls = extract_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "read");
        assert_eq!(calls[0].args["file_path"], "Cargo.toml");
    }

    // Partial openers and text that already starts with an opener count as
    // possible markup; text that starts with prose does not.
    #[test]
    fn b1_detects_partial_tool_markup_prefixes() {
        assert!(could_be_tool_markup_prefix("<"));
        assert!(could_be_tool_markup_prefix("<invoke name=\"read\""));
        assert!(could_be_tool_markup_prefix("[TOOL"));
        assert!(could_be_tool_markup_prefix("```json\n{\"name\""));
        assert!(could_be_tool_markup_prefix("<|tool▁call▁begin|>"));
        assert!(!could_be_tool_markup_prefix(
            "Bonjour <invoke name=\"read\">"
        ));
        assert!(!could_be_tool_markup_prefix("plain text"));
    }

    // An opening `<invoke …>` tag quoted in prose, with no closing tag, is not
    // reported as markup.
    #[test]
    fn b3_prose_mentioning_invoke_is_not_treated_as_markup() {
        let prose = r#"To call a tool, the model emits `<invoke name="read">` —
note there is no closing tag in this explanation."#;
        assert!(!looks_like_tool_markup(prose));
    }

    // A fully closed invoke block is reported as markup.
    #[test]
    fn b3_complete_block_is_detected() {
        let t = r#"<invoke name="read"><parameter name="p">x</parameter></invoke>"#;
        assert!(looks_like_tool_markup(t));
    }

    // `string="true"` keeps a numeric-looking value as a string.
    #[test]
    fn b2_declared_string_is_not_coerced() {
        let t = "<\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}invoke name=\"x\">\n<\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}parameter name=\"n\" string=\"true\">123</\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}parameter>\n</\u{FF5C}\u{FF5C}DSML\u{FF5C}\u{FF5C}invoke>";
        let calls = extract_tool_calls(t);
        assert_eq!(calls[0].args["n"], Value::String("123".into()));
    }

    // Leading and trailing newlines of a string value survive extraction.
    #[test]
    fn b2_file_content_whitespace_is_preserved() {
        let t = "<invoke name=\"fs_write\"><parameter name=\"content\">\nline1\nline2\n</parameter></invoke>";
        let calls = extract_tool_calls(t);
        assert_eq!(
            calls[0].args["content"],
            Value::String("\nline1\nline2\n".into())
        );
    }
}