Skip to main content

zagens_core/engine/
tool_parser.rs

1//! Legacy parser for text-based tool calls from DeepSeek models.
2//!
3//! Structured tool-call items are preferred, so the engine no longer invokes
4//! this parser. It is kept for reference/debugging.
5//!
//! Some DeepSeek models emit tool calls as plain text, in one of several formats:
7//! ```text
8//! [TOOL_CALL]
9//! {tool => "tool_name", args => {...}}
10//! [/TOOL_CALL]
11//! ```
12//!
13//! Or XML-style format:
14//! ```text
15//! <deepseek:tool_call>
16//! <invoke name="tool_name">
17//! <parameter name="arg">value</parameter>
18//! </invoke>
19//! </deepseek:tool_call>
20//! ```
21//!
22//! This module parses these text patterns into structured tool calls.
23
24use regex::Regex;
25use serde_json::{Value, json};
26use std::sync::OnceLock;
27
/// A single tool call recovered from free-form model text.
#[derive(Debug, Clone)]
pub struct ParsedToolCall {
    /// Name of the tool to invoke, as written in the text.
    pub name: String,
    /// Tool arguments as a JSON value (an empty object when none were found).
    pub args: Value,
    /// Identifier generated by the parser: `text_tool_<n>` for `[TOOL_CALL]`-style
    /// content, `xml_tool_<n>` for `<invoke>`-style content.
    pub id: String,
}
38
/// Outcome of scanning a piece of text for tool calls.
#[derive(Debug)]
pub struct ParseResult {
    /// The input text with thinking tags and recognised tool-call blocks removed
    /// and blank lines dropped, suitable for display.
    pub clean_text: String,
    /// Tool calls found in the text, in the order the parser produced them.
    pub tool_calls: Vec<ParsedToolCall>,
}
47
// Process-wide caches for the compiled regexes. Each cell is initialised on first
// use by the matching `get_*_regex` accessor, so every pattern is compiled only once.
static TOOL_CALL_REGEX: OnceLock<Regex> = OnceLock::new();
static XML_TOOL_CALL_REGEX: OnceLock<Regex> = OnceLock::new();
static INVOKE_REGEX: OnceLock<Regex> = OnceLock::new();
static THINKING_REGEX: OnceLock<Regex> = OnceLock::new();
52
53fn get_tool_call_regex() -> &'static Regex {
54    TOOL_CALL_REGEX.get_or_init(|| {
55        // Match [TOOL_CALL] ... [/TOOL_CALL] blocks
56        Regex::new(r"(?s)\[TOOL_CALL\]\s*(.*?)\s*\[/TOOL_CALL\]")
57            .expect("TOOL_CALL regex pattern is valid")
58    })
59}
60
61fn get_xml_tool_call_regex() -> &'static Regex {
62    XML_TOOL_CALL_REGEX.get_or_init(|| {
63        // Match <deepseek:tool_call>...</deepseek:tool_call> or similar XML patterns
64        Regex::new(r"(?s)<(?:deepseek:)?tool_call[^>]*>\s*(.*?)\s*</(?:deepseek:)?tool_call>")
65            .expect("XML tool_call regex pattern is valid")
66    })
67}
68
69fn get_invoke_regex() -> &'static Regex {
70    INVOKE_REGEX.get_or_init(|| {
71        // Match <invoke name="tool_name">...</invoke> patterns
72        Regex::new(r#"(?s)<invoke\s+name\s*=\s*"([^"]+)"[^>]*>(.*?)</invoke>"#)
73            .expect("invoke regex pattern is valid")
74    })
75}
76
77fn get_thinking_regex() -> &'static Regex {
78    THINKING_REGEX.get_or_init(|| {
79        // Match thinking blocks including partial closing tags
80        Regex::new(r"(?s)</?(?:think|thinking)[^>]*>").expect("thinking regex pattern is valid")
81    })
82}
83
84/// Parse tool calls from text content.
85/// Returns the clean text (with markers removed) and any parsed tool calls.
86pub fn parse_tool_calls(text: &str) -> ParseResult {
87    let mut tool_calls = Vec::new();
88    let mut clean_text = text.to_string();
89    let mut id_counter = 0;
90
91    // First, remove thinking tags
92    let thinking_regex = get_thinking_regex();
93    clean_text = thinking_regex.replace_all(&clean_text, "").to_string();
94
95    // Parse [TOOL_CALL] format
96    let regex = get_tool_call_regex();
97    for cap in regex.captures_iter(text) {
98        let (Some(full_match), Some(inner)) = (cap.get(0), cap.get(1)) else {
99            continue;
100        };
101        let full_match = full_match.as_str();
102        let inner = inner.as_str().trim();
103
104        if let Some(parsed) = parse_tool_call_inner(inner, &mut id_counter) {
105            tool_calls.push(parsed);
106        }
107
108        clean_text = clean_text.replace(full_match, "");
109    }
110
111    // Parse XML-style <deepseek:tool_call> or <tool_call> format
112    let xml_regex = get_xml_tool_call_regex();
113    for cap in xml_regex.captures_iter(text) {
114        let (Some(full_match), Some(inner)) = (cap.get(0), cap.get(1)) else {
115            continue;
116        };
117        let full_match = full_match.as_str();
118        let inner = inner.as_str().trim();
119
120        // Parse invoke blocks inside
121        if let Some(parsed) = parse_invoke_block(inner, &mut id_counter) {
122            tool_calls.push(parsed);
123        } else if let Some(parsed) = parse_tool_call_inner(inner, &mut id_counter) {
124            tool_calls.push(parsed);
125        }
126
127        clean_text = clean_text.replace(full_match, "");
128    }
129
130    // Also parse standalone <invoke> blocks that might not be wrapped
131    let invoke_regex = get_invoke_regex();
132    for cap in invoke_regex.captures_iter(&clean_text.clone()) {
133        let (Some(full_match), Some(tool_name), Some(inner)) = (cap.get(0), cap.get(1), cap.get(2))
134        else {
135            continue;
136        };
137        let full_match = full_match.as_str();
138        let tool_name = tool_name.as_str();
139        let inner = inner.as_str();
140
141        let args = parse_xml_parameters(inner);
142        id_counter += 1;
143        tool_calls.push(ParsedToolCall {
144            name: tool_name.to_string(),
145            args,
146            id: format!("xml_tool_{id_counter}"),
147        });
148
149        clean_text = clean_text.replace(full_match, "");
150    }
151
152    // Clean up extra whitespace and empty lines
153    clean_text = clean_text
154        .lines()
155        .filter(|line| !line.trim().is_empty())
156        .collect::<Vec<_>>()
157        .join("\n")
158        .trim()
159        .to_string();
160
161    ParseResult {
162        clean_text,
163        tool_calls,
164    }
165}
166
167/// Parse an `<invoke>` block into a tool call.
168fn parse_invoke_block(content: &str, id_counter: &mut u32) -> Option<ParsedToolCall> {
169    let invoke_regex = get_invoke_regex();
170    let cap = invoke_regex.captures(content)?;
171
172    let tool_name = cap.get(1)?.as_str();
173    let inner = cap.get(2)?.as_str();
174
175    let args = parse_xml_parameters(inner);
176
177    *id_counter += 1;
178    Some(ParsedToolCall {
179        name: tool_name.to_string(),
180        args,
181        id: format!("xml_tool_{id_counter}"),
182    })
183}
184
185/// Parse XML-style parameters like <parameter name="foo">value</parameter>
186fn parse_xml_parameters(content: &str) -> Value {
187    let param_regex = Regex::new(
188        "<(?:parameter|param)\\s+name\\s*=\\s*\"([^\"]+)\"[^>]*>(.*?)</(?:parameter|param)>",
189    )
190    .ok();
191    let simple_tag_regex =
192        Regex::new("<([a-zA-Z_][a-zA-Z0-9_]*)>(.*?)</([a-zA-Z_][a-zA-Z0-9_]*)>").ok();
193
194    let mut map = serde_json::Map::new();
195
196    // Try parsing <parameter name="...">value</parameter>
197    if let Some(regex) = param_regex {
198        for cap in regex.captures_iter(content) {
199            if let (Some(name), Some(value)) = (cap.get(1), cap.get(2)) {
200                let name_str = name.as_str();
201                let value_str = value.as_str().trim();
202
203                // Try to parse as JSON, otherwise use as string
204                let json_value = serde_json::from_str(value_str)
205                    .unwrap_or_else(|_| Value::String(value_str.to_string()));
206                map.insert(name_str.to_string(), json_value);
207            }
208        }
209    }
210
211    // Also try parsing <tagname>value</tagname> format
212    if let Some(regex) = simple_tag_regex {
213        for cap in regex.captures_iter(content) {
214            if let (Some(name), Some(value), Some(close)) = (cap.get(1), cap.get(2), cap.get(3)) {
215                if name.as_str() != close.as_str() {
216                    continue;
217                }
218                let name_str = name.as_str();
219                // Skip known wrapper tags
220                if ["invoke", "tool_call", "parameter", "param"].contains(&name_str) {
221                    continue;
222                }
223                let value_str = value.as_str().trim();
224                if !map.contains_key(name_str) {
225                    let json_value = serde_json::from_str(value_str)
226                        .unwrap_or_else(|_| Value::String(value_str.to_string()));
227                    map.insert(name_str.to_string(), json_value);
228                }
229            }
230        }
231    }
232
233    Value::Object(map)
234}
235
236/// Parse the inner content of a `TOOL_CALL` block.
237fn parse_tool_call_inner(inner: &str, id_counter: &mut u32) -> Option<ParsedToolCall> {
238    // Try to parse as JSON first
239    if let Ok(json) = serde_json::from_str::<Value>(inner) {
240        return parse_from_json(&json, id_counter);
241    }
242
243    // Try the arrow syntax: {tool => "name", args => {...}}
244    if let Some(parsed) = parse_arrow_syntax(inner, id_counter) {
245        return Some(parsed);
246    }
247
248    // Try to extract tool name and args from any format
249    parse_flexible_format(inner, id_counter)
250}
251
252/// Parse from JSON object.
253fn parse_from_json(json: &Value, id_counter: &mut u32) -> Option<ParsedToolCall> {
254    let obj = json.as_object()?;
255
256    // Try different field names for the tool name
257    let name = obj
258        .get("tool")
259        .or_else(|| obj.get("name"))
260        .or_else(|| obj.get("function"))
261        .and_then(|v| v.as_str())?
262        .to_string();
263
264    // Try different field names for the arguments
265    let args = obj
266        .get("args")
267        .or_else(|| obj.get("arguments"))
268        .or_else(|| obj.get("input"))
269        .or_else(|| obj.get("parameters"))
270        .cloned()
271        .unwrap_or(json!({}));
272
273    *id_counter += 1;
274    Some(ParsedToolCall {
275        name,
276        args,
277        id: format!("text_tool_{id_counter}"),
278    })
279}
280
281/// Parse the arrow syntax: {tool => "name", args => {...}}
282fn parse_arrow_syntax(inner: &str, id_counter: &mut u32) -> Option<ParsedToolCall> {
283    // Extract tool name
284    let tool_regex = Regex::new(r#"tool\s*=>\s*"([^"]+)""#).ok()?;
285    let name = tool_regex.captures(inner)?.get(1)?.as_str().to_string();
286
287    // Extract args - try to find the JSON object after "args =>"
288    let args = if let Some(args_start) = inner.find("args =>") {
289        let args_str = inner[args_start + 7..].trim();
290        // Try to parse as JSON first
291        if let Ok(args_json) = serde_json::from_str::<Value>(args_str) {
292            args_json
293        } else if let Some(brace_start) = args_str.find('{') {
294            // Try to extract the content between braces
295            let mut brace_count = 0;
296            let mut end_idx = brace_start;
297            for (i, c) in args_str[brace_start..].chars().enumerate() {
298                match c {
299                    '{' => brace_count += 1,
300                    '}' => {
301                        brace_count -= 1;
302                        if brace_count == 0 {
303                            end_idx = brace_start + i + 1;
304                            break;
305                        }
306                    }
307                    _ => {}
308                }
309            }
310            let content = &args_str[brace_start + 1..end_idx - 1];
311
312            // Try to parse as JSON
313            if let Ok(json) = serde_json::from_str::<Value>(&format!("{{{content}}}")) {
314                json
315            } else {
316                // Try CLI-style args: --arg_name "value" or --arg_name value
317                parse_cli_style_args(content)
318            }
319        } else {
320            json!({})
321        }
322    } else {
323        json!({})
324    };
325
326    *id_counter += 1;
327    Some(ParsedToolCall {
328        name,
329        args,
330        id: format!("text_tool_{id_counter}"),
331    })
332}
333
334/// Parse CLI-style arguments: --`arg_name` "value" or --`arg_name` value
335fn parse_cli_style_args(content: &str) -> Value {
336    let mut map = serde_json::Map::new();
337
338    // Pattern: --arg_name "value" or --arg_name 'value' or --arg_name value
339    let arg_regex =
340        Regex::new(r#"--([a-zA-Z_][a-zA-Z0-9_]*)\s+(?:"([^"]*)"|'([^']*)'|(\S+))"#).ok();
341
342    if let Some(regex) = arg_regex {
343        for cap in regex.captures_iter(content) {
344            if let Some(arg_name) = cap.get(1) {
345                let arg_name = arg_name.as_str();
346                // Get the value from whichever capture group matched
347                let value = cap
348                    .get(2)
349                    .or_else(|| cap.get(3))
350                    .or_else(|| cap.get(4))
351                    .map_or("", |m| m.as_str());
352
353                // Try to parse as JSON value, otherwise use as string
354                let json_value = serde_json::from_str(value)
355                    .unwrap_or_else(|_| Value::String(value.to_string()));
356                map.insert(arg_name.to_string(), json_value);
357            }
358        }
359    }
360
361    // Also try simple key=value format
362    let kv_regex =
363        Regex::new(r#"([a-zA-Z_][a-zA-Z0-9_]*)\s*[:=]\s*(?:"([^"]*)"|'([^']*)'|(\S+))"#).ok();
364    if let Some(regex) = kv_regex {
365        for cap in regex.captures_iter(content) {
366            if let Some(key) = cap.get(1) {
367                let key = key.as_str();
368                if !map.contains_key(key) {
369                    let value = cap
370                        .get(2)
371                        .or_else(|| cap.get(3))
372                        .or_else(|| cap.get(4))
373                        .map_or("", |m| m.as_str());
374                    let json_value = serde_json::from_str(value)
375                        .unwrap_or_else(|_| Value::String(value.to_string()));
376                    map.insert(key.to_string(), json_value);
377                }
378            }
379        }
380    }
381
382    Value::Object(map)
383}
384
385/// Try to parse a flexible format.
386fn parse_flexible_format(inner: &str, id_counter: &mut u32) -> Option<ParsedToolCall> {
387    // Look for common patterns like:
388    // tool: list_dir
389    // name: "list_dir"
390    // function: list_dir
391
392    let patterns = [(
393        r#"(?:tool|name|function)\s*[:=]\s*"?([a-zA-Z_][a-zA-Z0-9_]*)"?"#,
394        1,
395    )];
396
397    for (pattern, group) in patterns {
398        if let Ok(regex) = Regex::new(pattern)
399            && let Some(cap) = regex.captures(inner)
400            && let Some(name_match) = cap.get(group)
401        {
402            let name = name_match.as_str().to_string();
403
404            // Try to extract args/input as JSON
405            let args = extract_json_object(inner).unwrap_or(json!({}));
406
407            *id_counter += 1;
408            return Some(ParsedToolCall {
409                name,
410                args,
411                id: format!("text_tool_{id_counter}"),
412            });
413        }
414    }
415
416    None
417}
418
419/// Extract the first JSON object from a string.
420fn extract_json_object(text: &str) -> Option<Value> {
421    let start = text.find('{')?;
422    let mut brace_count = 0;
423    let mut end_idx = start;
424
425    for (i, c) in text[start..].chars().enumerate() {
426        match c {
427            '{' => brace_count += 1,
428            '}' => {
429                brace_count -= 1;
430                if brace_count == 0 {
431                    end_idx = start + i + 1;
432                    break;
433                }
434            }
435            _ => {}
436        }
437    }
438
439    let json_str = &text[start..end_idx];
440    serde_json::from_str(json_str).ok()
441}
442
/// Cheap pre-check: does `text` contain any opening marker of a supported
/// tool-call format (`[TOOL_CALL]`, `<deepseek:tool_call`, `<tool_call`, `<invoke `)?
pub fn has_tool_call_markers(text: &str) -> bool {
    const MARKERS: [&str; 4] = [
        "[TOOL_CALL]",
        "<deepseek:tool_call",
        "<tool_call",
        "<invoke ",
    ];

    MARKERS.iter().any(|marker| text.contains(*marker))
}
450
#[cfg(test)]
mod tests {
    use super::*;

    // --- [TOOL_CALL] format -------------------------------------------------

    #[test]
    fn test_parse_arrow_syntax() {
        let text = r#"I'll list the directory.
[TOOL_CALL]
{tool => "list_dir", args => {}}
[/TOOL_CALL]"#;

        let result = parse_tool_calls(text);
        assert_eq!(result.tool_calls.len(), 1);
        assert_eq!(result.tool_calls[0].name, "list_dir");
        assert_eq!(result.clean_text, "I'll list the directory.");
    }

    #[test]
    fn test_parse_json_syntax() {
        let text = r#"Let me check.
[TOOL_CALL]
{"tool": "read_file", "args": {"path": "test.txt"}}
[/TOOL_CALL]"#;

        let result = parse_tool_calls(text);
        assert_eq!(result.tool_calls.len(), 1);
        assert_eq!(result.tool_calls[0].name, "read_file");
        assert_eq!(result.tool_calls[0].args["path"], "test.txt");
    }

    #[test]
    fn test_parse_multiple_tool_calls() {
        let text = r#"First I'll list, then read.
[TOOL_CALL]
{tool => "list_dir", args => {}}
[/TOOL_CALL]
[TOOL_CALL]
{tool => "read_file", args => {"path": "file.txt"}}
[/TOOL_CALL]"#;

        let result = parse_tool_calls(text);
        assert_eq!(result.tool_calls.len(), 2);
        assert_eq!(result.tool_calls[0].name, "list_dir");
        assert_eq!(result.tool_calls[1].name, "read_file");
    }

    // --- XML / <invoke> format ----------------------------------------------

    #[test]
    fn test_parse_xml_tool_call() {
        let text = r#"Reading the file.
<deepseek:tool_call>
<invoke name="read_file">
<parameter name="path">test.txt</parameter>
</invoke>
</deepseek:tool_call>"#;

        let result = parse_tool_calls(text);
        assert_eq!(result.tool_calls.len(), 1);
        assert_eq!(result.tool_calls[0].name, "read_file");
        assert_eq!(result.tool_calls[0].args["path"], "test.txt");
        assert_eq!(result.tool_calls[0].id, "xml_tool_1");
        assert_eq!(result.clean_text, "Reading the file.");
    }

    #[test]
    fn test_parse_standalone_invoke() {
        let text = r#"<invoke name="list_dir">
<parameter name="path">src</parameter>
</invoke>"#;

        let result = parse_tool_calls(text);
        assert_eq!(result.tool_calls.len(), 1);
        assert_eq!(result.tool_calls[0].name, "list_dir");
        assert_eq!(result.tool_calls[0].args["path"], "src");
        assert_eq!(result.clean_text, "");
    }

    // --- No tool calls / marker detection -----------------------------------

    #[test]
    fn test_no_tool_calls() {
        let text = "Just some regular text without any tool calls.";
        let result = parse_tool_calls(text);
        assert!(result.tool_calls.is_empty());
        assert_eq!(result.clean_text, text);
    }

    #[test]
    fn test_has_markers() {
        assert!(has_tool_call_markers("[TOOL_CALL]test[/TOOL_CALL]"));
        assert!(!has_tool_call_markers("no markers here"));
    }

    #[test]
    fn test_has_xml_markers() {
        assert!(has_tool_call_markers("<deepseek:tool_call>"));
        assert!(has_tool_call_markers("<tool_call>"));
        assert!(has_tool_call_markers(r#"<invoke name="list_dir">"#));
    }
}
510}