rig_compose/
normalizer.rs

1//! [`ToolCallNormalizer`] — converts raw LLM text output into structured
2//! [`ToolInvocation`]s.
3//!
4//! Models served via OpenAI-compatible APIs (e.g. `mlx_lm.server`) sometimes
5//! emit tool-intent as in-band text markers rather than the structured
6//! `tool_calls` JSON field. Normalizers detect and decode those markers so the
7//! kernel can dispatch them identically to first-class tool calls.
8//!
9//! # Built-in implementations
10//!
11//! | Type | Format |
12//! |------|--------|
13//! | [`LfmNormalizer`] | LiquidAI LFM `<\|tool_call_start\|>[fn(k=v)]<\|tool_call_end\|>` |
14//! | [`StructuredToolCallNormalizer`] | OpenAI Responses `function_call` output and Chat Completions `tool_calls` |
15//!
16//! # Example
17//!
18//! ```no_run
19//! use rig_compose::normalizer::{LfmNormalizer, ToolCallNormalizer};
20//!
21//! let raw = "<|tool_call_start|>[get_weather(city='Berlin')]<|tool_call_end|>";
22//! let normalizer = LfmNormalizer;
23//! let calls = normalizer.normalize(raw).expect("parse ok");
24//! assert_eq!(calls[0].name, "get_weather");
25//! ```
26
27use async_trait::async_trait;
28use serde_json::{Map, Value};
29
30use crate::registry::KernelError;
31use crate::registry::ToolRegistry;
32use crate::tool::ToolName;
33
34// ── Public types ─────────────────────────────────────────────────────────────
35
/// A structured tool invocation extracted from raw model output.
///
/// Both fields are public, so building the struct with a literal bypasses the
/// name checks performed by [`ToolInvocation::new`]. Prefer `new` whenever the
/// name originates from model or provider output.
#[derive(Debug, Clone, PartialEq)]
pub struct ToolInvocation {
    /// Registry name of the tool to invoke (e.g. `"get_weather"`).
    pub name: ToolName,
    /// JSON object of arguments to pass to the tool.
    ///
    /// The type itself does not enforce the "object" shape: any JSON value can
    /// be stored here and is forwarded to the tool as-is on dispatch.
    pub args: Value,
}
44
45impl ToolInvocation {
46    /// Build a validated [`ToolInvocation`] from a tool name and JSON args.
47    pub fn new(name: impl Into<ToolName>, args: Value) -> Result<Self, KernelError> {
48        let name = name.into();
49        if name.trim().is_empty() {
50            return Err(KernelError::NormalizerFailed(
51                "empty tool name in structured tool call".into(),
52            ));
53        }
54        validate_identifier("tool name", &name)?;
55        Ok(Self { name, args })
56    }
57
58    /// Dispatch this invocation through a [`ToolRegistry`].
59    pub async fn dispatch(&self, tools: &ToolRegistry) -> Result<Value, KernelError> {
60        tools.invoke(&self.name, self.args.clone()).await
61    }
62}
63
/// Result of dispatching one normalized [`ToolInvocation`].
///
/// Produced by [`dispatch_tool_invocations_with_hooks`] once per invocation
/// that either ran or was skipped by a hook, and handed to every hook's
/// [`ToolDispatchHook::after_invocation`].
#[derive(Debug, Clone, PartialEq)]
pub struct ToolInvocationResult {
    /// The normalized invocation that was dispatched (a clone of the input).
    pub invocation: ToolInvocation,
    /// The JSON result returned by the invoked tool, or the synthetic output
    /// supplied by a hook through [`ToolDispatchAction::Skip`].
    pub output: Value,
}
72
/// Decision returned by a [`ToolDispatchHook`] before a tool invocation runs.
///
/// In [`dispatch_tool_invocations_with_hooks`] the first hook that returns a
/// variant other than [`ToolDispatchAction::Continue`] decides the outcome;
/// later hooks are not consulted for that invocation.
#[derive(Debug, Clone, PartialEq)]
pub enum ToolDispatchAction {
    /// Invoke the tool normally.
    Continue,
    /// Do not invoke the tool; record `output` as the invocation result.
    Skip { output: Value },
    /// Stop dispatching and return [`KernelError::ToolDispatchTerminated`].
    /// `reason` becomes the payload of that error.
    Terminate { reason: String },
}
83
/// Hook for policy, accounting, and tracing around normalized tool dispatch.
///
/// Hooks are intentionally provider-neutral: they see only the normalized
/// [`ToolInvocation`] and the resulting [`ToolInvocationResult`]. Concrete
/// policy engines, approval systems, and trace exporters should live in
/// downstream crates and plug into this small kernel surface.
///
/// Every method has a default no-op implementation, so implementors override
/// only the callbacks they need. The call sequence described below is the one
/// driven by [`dispatch_tool_invocations_with_hooks`].
#[async_trait]
pub trait ToolDispatchHook: Send + Sync {
    /// Called before each invocation. Return [`ToolDispatchAction::Continue`]
    /// to invoke the tool, [`ToolDispatchAction::Skip`] to synthesize a result,
    /// or [`ToolDispatchAction::Terminate`] to stop the dispatch loop.
    ///
    /// Hooks are consulted in order; once one returns a non-`Continue` action
    /// the remaining hooks are not asked about that invocation. Returning
    /// `Err` aborts the whole dispatch with that error.
    async fn before_invocation(
        &self,
        _invocation: &ToolInvocation,
    ) -> Result<ToolDispatchAction, KernelError> {
        Ok(ToolDispatchAction::Continue)
    }

    /// Called after a tool invocation or hook-provided skip result is recorded.
    ///
    /// Returning `Err` aborts the dispatch; the error is propagated to the
    /// caller and results gathered so far are not returned.
    async fn after_invocation(&self, _result: &ToolInvocationResult) -> Result<(), KernelError> {
        Ok(())
    }

    /// Called when dispatch stops after this hook may have observed the
    /// invocation in [`Self::before_invocation`]. Hooks that reserve resources
    /// before dispatch should release them here.
    ///
    /// `_error` is the error the dispatcher is about to return: a tool
    /// failure, a [`KernelError::ToolDispatchTerminated`], or the error raised
    /// by another hook's `before_invocation`.
    async fn on_invocation_error(
        &self,
        _invocation: &ToolInvocation,
        _error: &KernelError,
    ) -> Result<(), KernelError> {
        Ok(())
    }
}
118
119/// Dispatch normalized tool invocations sequentially through a [`ToolRegistry`].
120///
121/// Sequential dispatch preserves model-emitted call order and avoids adding a
122/// runtime-specific concurrency policy to the kernel. Callers that know their
123/// tools are independent can still dispatch invocations concurrently by using
124/// [`ToolInvocation::dispatch`] directly.
125pub async fn dispatch_tool_invocations(
126    tools: &ToolRegistry,
127    invocations: &[ToolInvocation],
128) -> Result<Vec<ToolInvocationResult>, KernelError> {
129    dispatch_tool_invocations_with_hooks(tools, invocations, &[]).await
130}
131
132/// Dispatch normalized tool invocations with policy/accounting hooks.
133///
134/// Hooks run in the order provided. A skip result still triggers every hook's
135/// [`ToolDispatchHook::after_invocation`] callback so audit hooks can record
136/// synthetic outcomes. A terminate action stops dispatch before the tool is
137/// invoked and returns [`KernelError::ToolDispatchTerminated`].
138pub async fn dispatch_tool_invocations_with_hooks(
139    tools: &ToolRegistry,
140    invocations: &[ToolInvocation],
141    hooks: &[&dyn ToolDispatchHook],
142) -> Result<Vec<ToolInvocationResult>, KernelError> {
143    let mut results = Vec::with_capacity(invocations.len());
144
145    for invocation in invocations {
146        let mut action = ToolDispatchAction::Continue;
147        // Track how many hooks observed `before_invocation` so that, on a
148        // hook error, we can notify exactly those hooks via
149        // `on_invocation_error`. Without this, a hook that reserved a
150        // resource in `before_invocation` (e.g. `DispatchBudgetHook`)
151        // would leak that reservation when a later hook errors.
152        let mut observed: usize = 0;
153        let mut before_err: Option<KernelError> = None;
154        for hook in hooks {
155            match hook.before_invocation(invocation).await {
156                Ok(next) => {
157                    observed += 1;
158                    action = next;
159                    if !matches!(action, ToolDispatchAction::Continue) {
160                        break;
161                    }
162                }
163                Err(error) => {
164                    before_err = Some(error);
165                    break;
166                }
167            }
168        }
169        if let Some(error) = before_err {
170            notify_invocation_error_subset(hooks, observed, invocation, &error).await?;
171            return Err(error);
172        }
173
174        let output = match action {
175            ToolDispatchAction::Continue => match invocation.dispatch(tools).await {
176                Ok(output) => output,
177                Err(error) => {
178                    notify_invocation_error(hooks, invocation, &error).await?;
179                    return Err(error);
180                }
181            },
182            ToolDispatchAction::Skip { output } => output,
183            ToolDispatchAction::Terminate { reason } => {
184                let error = KernelError::ToolDispatchTerminated(reason);
185                notify_invocation_error(hooks, invocation, &error).await?;
186                return Err(error);
187            }
188        };
189
190        let result = ToolInvocationResult {
191            invocation: invocation.clone(),
192            output,
193        };
194
195        for hook in hooks {
196            hook.after_invocation(&result).await?;
197        }
198
199        results.push(result);
200    }
201
202    Ok(results)
203}
204
205async fn notify_invocation_error(
206    hooks: &[&dyn ToolDispatchHook],
207    invocation: &ToolInvocation,
208    error: &KernelError,
209) -> Result<(), KernelError> {
210    for hook in hooks {
211        hook.on_invocation_error(invocation, error).await?;
212    }
213    Ok(())
214}
215
216/// Notify the first `upto` hooks that observed `before_invocation` so they
217/// can release any resources reserved there. Used when a later hook's
218/// `before_invocation` returns an error and we must unwind partial state.
219async fn notify_invocation_error_subset(
220    hooks: &[&dyn ToolDispatchHook],
221    upto: usize,
222    invocation: &ToolInvocation,
223    error: &KernelError,
224) -> Result<(), KernelError> {
225    for hook in hooks.iter().take(upto) {
226        hook.on_invocation_error(invocation, error).await?;
227    }
228    Ok(())
229}
230
/// Normalizes raw LLM text output into structured [`ToolInvocation`]s.
///
/// Implement this trait to support additional model families that emit tool
/// intent as in-band text markers. The trait is object-safe so normalizers can
/// be stored as `Arc<dyn ToolCallNormalizer>` alongside other kernel objects.
///
/// # Contract
///
/// - [`normalize`](ToolCallNormalizer::normalize) returns an empty `Vec` when
///   `raw` contains no markers this normalizer recognises. An empty result is
///   never an error.
/// - [`is_applicable`](ToolCallNormalizer::is_applicable) must return `true`
///   whenever `normalize` would return a non-empty `Vec`. It is a cheap guard
///   to short-circuit expensive parsing in pipelines.
pub trait ToolCallNormalizer: Send + Sync {
    /// Parse `raw` text into zero or more tool invocations.
    ///
    /// Invocations are expected in the order they appear in `raw`. An `Err`
    /// is reserved for input that contains this normalizer's markers but is
    /// malformed (for example, [`LfmNormalizer`] rejects an unclosed start
    /// marker); text without markers yields `Ok(vec![])`.
    fn normalize(&self, raw: &str) -> Result<Vec<ToolInvocation>, KernelError>;

    /// Quick scan: does `raw` contain markers this normalizer handles?
    ///
    /// May return `true` for input that `normalize` later rejects; it only
    /// has to avoid false negatives.
    fn is_applicable(&self, raw: &str) -> bool;
}
252
253// ── Structured standards normalizer ──────────────────────────────────────────
254
/// Normalizer for structured tool-call JSON returned by common provider APIs.
///
/// This type keeps the kernel independent from provider-specific Rust types by
/// operating on `serde_json::Value` shapes. It supports:
///
/// - OpenAI Responses API output items: `{"type":"function_call", ...}`
/// - OpenAI Responses API full responses: `{ "output": [function_call, ...] }`
/// - OpenAI Chat Completions tool calls: `{ "tool_calls": [...] }`
/// - OpenAI Chat Completions full responses: `{ "choices": [{ "message": ... }] }`
///
/// The type is a stateless unit struct; its entry points are associated
/// functions that take the already-parsed JSON value.
#[derive(Debug, Clone, Default)]
pub struct StructuredToolCallNormalizer;
266
267impl StructuredToolCallNormalizer {
268    /// Parse OpenAI Responses API `function_call` output items from either a
269    /// full response object, an `output` array, or a single output item.
270    pub fn normalize_openai_responses(value: &Value) -> Result<Vec<ToolInvocation>, KernelError> {
271        match value {
272            Value::Object(object) => {
273                if let Some(output) = object.get("output") {
274                    return normalize_responses_output(output);
275                }
276                if is_responses_function_call(object) {
277                    return parse_responses_function_call(object).map(|call| vec![call]);
278                }
279                Ok(Vec::new())
280            }
281            Value::Array(items) => items
282                .iter()
283                .map(normalize_responses_output_item)
284                .collect::<Result<Vec<_>, _>>()
285                .map(flatten_invocations),
286            _ => Ok(Vec::new()),
287        }
288    }
289
290    /// Parse OpenAI Chat Completions `tool_calls` from either a full response,
291    /// a message object, a `tool_calls` array, or a single tool call object.
292    pub fn normalize_openai_chat_completions(
293        value: &Value,
294    ) -> Result<Vec<ToolInvocation>, KernelError> {
295        match value {
296            Value::Object(object) => {
297                if let Some(choices) = object.get("choices") {
298                    return normalize_chat_choices(choices);
299                }
300                if let Some(tool_calls) = object.get("tool_calls") {
301                    return normalize_chat_tool_calls(tool_calls);
302                }
303                if is_chat_tool_call(object) {
304                    return parse_chat_tool_call(object).map(|call| vec![call]);
305                }
306                Ok(Vec::new())
307            }
308            Value::Array(items) => normalize_chat_tool_calls_array(items),
309            _ => Ok(Vec::new()),
310        }
311    }
312
313    /// Parse all supported structured standards from `value`.
314    ///
315    /// This is useful when the caller has a provider JSON blob but does not
316    /// want to branch on the provider path first. It preserves the order of
317    /// calls within each standard and tries Responses before Chat Completions.
318    pub fn normalize(value: &Value) -> Result<Vec<ToolInvocation>, KernelError> {
319        let mut invocations = Self::normalize_openai_responses(value)?;
320        invocations.extend(Self::normalize_openai_chat_completions(value)?);
321        Ok(invocations)
322    }
323}
324
325// ── LFM normalizer ────────────────────────────────────────────────────────────
326
/// Literal text that opens an LFM in-band tool-call block.
const LFM_START: &str = "<|tool_call_start|>";
/// Literal text that closes an LFM in-band tool-call block.
const LFM_END: &str = "<|tool_call_end|>";
329
/// Normalizer for LiquidAI LFM models (e.g. `LFM2.5-1.2B-Thinking`) served
/// through `mlx_lm.server` or similar OpenAI-compatible shims that emit tool
/// intent as in-band text rather than the structured `tool_calls` field.
///
/// Recognised format:
/// ```text
/// <|tool_call_start|>[get_weather(city='Berlin')]<|tool_call_end|>
/// ```
///
/// Multiple calls per block (`[fn1(a=1), fn2(b=2)]`) and multiple blocks per
/// message are both handled. Text outside the markers is ignored. A start
/// marker without a matching end marker is reported as an error rather than
/// silently dropped.
///
/// # Example
///
/// ```no_run
/// use rig_compose::normalizer::{LfmNormalizer, ToolCallNormalizer};
/// use serde_json::json;
///
/// let raw = "<|tool_call_start|>[add(x=3, y=4)]<|tool_call_end|>";
/// let calls = LfmNormalizer.normalize(raw).unwrap();
/// assert_eq!(calls[0].name, "add");
/// assert_eq!(calls[0].args, json!({"x": 3, "y": 4}));
/// ```
#[derive(Debug, Clone, Default)]
pub struct LfmNormalizer;
355
356impl ToolCallNormalizer for LfmNormalizer {
357    fn is_applicable(&self, raw: &str) -> bool {
358        raw.contains(LFM_START)
359    }
360
361    fn normalize(&self, raw: &str) -> Result<Vec<ToolInvocation>, KernelError> {
362        let mut results = Vec::new();
363        let mut remaining = raw;
364
365        while let Some(block_start) = remaining.find(LFM_START) {
366            // Skip past the start marker.
367            let after_start = remaining
368                .get(block_start + LFM_START.len()..)
369                .ok_or_else(|| KernelError::NormalizerFailed("LFM: start marker overrun".into()))?;
370
371            let block_end = after_start.find(LFM_END).ok_or_else(|| {
372                KernelError::NormalizerFailed("LFM: unclosed <|tool_call_start|> marker".into())
373            })?;
374
375            let block = after_start.get(..block_end).ok_or_else(|| {
376                KernelError::NormalizerFailed("LFM: block slice out of bounds".into())
377            })?;
378
379            // Advance past the end marker; if nothing remains, stop.
380            remaining = after_start.get(block_end + LFM_END.len()..).unwrap_or("");
381
382            let calls = parse_lfm_block(block)?;
383            results.extend(calls);
384        }
385
386        Ok(results)
387    }
388}
389
390// ── Structured standards helpers ─────────────────────────────────────────────
391
392fn normalize_responses_output(value: &Value) -> Result<Vec<ToolInvocation>, KernelError> {
393    match value {
394        Value::Array(items) => items
395            .iter()
396            .map(normalize_responses_output_item)
397            .collect::<Result<Vec<_>, _>>()
398            .map(flatten_invocations),
399        Value::Object(object) if is_responses_function_call(object) => {
400            parse_responses_function_call(object).map(|call| vec![call])
401        }
402        _ => Ok(Vec::new()),
403    }
404}
405
406fn normalize_responses_output_item(value: &Value) -> Result<Vec<ToolInvocation>, KernelError> {
407    match value {
408        Value::Object(object) if is_responses_function_call(object) => {
409            parse_responses_function_call(object).map(|call| vec![call])
410        }
411        _ => Ok(Vec::new()),
412    }
413}
414
415fn is_responses_function_call(object: &Map<String, Value>) -> bool {
416    object
417        .get("type")
418        .and_then(Value::as_str)
419        .is_some_and(|kind| kind == "function_call")
420}
421
422fn parse_responses_function_call(
423    object: &Map<String, Value>,
424) -> Result<ToolInvocation, KernelError> {
425    let name = required_string_field(object, "name", "OpenAI Responses function_call")?;
426    let args = object
427        .get("arguments")
428        .map(parse_standard_arguments)
429        .transpose()?
430        .unwrap_or_else(|| Value::Object(Map::new()));
431    ToolInvocation::new(name, args)
432}
433
434fn normalize_chat_choices(value: &Value) -> Result<Vec<ToolInvocation>, KernelError> {
435    let choices = value.as_array().ok_or_else(|| {
436        KernelError::NormalizerFailed("OpenAI Chat Completions choices must be an array".into())
437    })?;
438
439    let mut invocations = Vec::new();
440    for choice in choices {
441        let Some(message) = choice.get("message") else {
442            continue;
443        };
444        invocations
445            .extend(StructuredToolCallNormalizer::normalize_openai_chat_completions(message)?);
446    }
447
448    Ok(invocations)
449}
450
451fn normalize_chat_tool_calls(value: &Value) -> Result<Vec<ToolInvocation>, KernelError> {
452    match value {
453        Value::Array(items) => normalize_chat_tool_calls_array(items),
454        Value::Object(object) if is_chat_tool_call(object) => {
455            parse_chat_tool_call(object).map(|call| vec![call])
456        }
457        _ => Ok(Vec::new()),
458    }
459}
460
461fn normalize_chat_tool_calls_array(items: &[Value]) -> Result<Vec<ToolInvocation>, KernelError> {
462    items
463        .iter()
464        .map(|item| match item {
465            Value::Object(object) if is_chat_tool_call(object) => parse_chat_tool_call(object),
466            Value::Object(_) => Err(KernelError::NormalizerFailed(
467                "OpenAI Chat Completions tool call missing function payload".into(),
468            )),
469            _ => Err(KernelError::NormalizerFailed(
470                "OpenAI Chat Completions tool call must be an object".into(),
471            )),
472        })
473        .collect()
474}
475
476fn is_chat_tool_call(object: &Map<String, Value>) -> bool {
477    object.get("function").is_some()
478}
479
480fn parse_chat_tool_call(object: &Map<String, Value>) -> Result<ToolInvocation, KernelError> {
481    let function = object
482        .get("function")
483        .and_then(Value::as_object)
484        .ok_or_else(|| {
485            KernelError::NormalizerFailed(
486                "OpenAI Chat Completions tool call missing function object".into(),
487            )
488        })?;
489    let name = required_string_field(function, "name", "OpenAI Chat Completions function")?;
490    let args = function
491        .get("arguments")
492        .map(parse_standard_arguments)
493        .transpose()?
494        .unwrap_or_else(|| Value::Object(Map::new()));
495
496    ToolInvocation::new(name, args)
497}
498
499fn parse_standard_arguments(value: &Value) -> Result<Value, KernelError> {
500    match value {
501        Value::String(raw) => {
502            let trimmed = raw.trim();
503            if trimmed.is_empty() {
504                return Ok(Value::Object(Map::new()));
505            }
506            serde_json::from_str(trimmed).map_err(|err| {
507                KernelError::NormalizerFailed(format!(
508                    "failed to parse standard tool-call arguments JSON: {err}"
509                ))
510            })
511        }
512        Value::Null => Ok(Value::Object(Map::new())),
513        other => Ok(other.clone()),
514    }
515}
516
517fn required_string_field(
518    object: &Map<String, Value>,
519    field: &str,
520    context: &str,
521) -> Result<String, KernelError> {
522    object
523        .get(field)
524        .and_then(Value::as_str)
525        .map(ToOwned::to_owned)
526        .ok_or_else(|| KernelError::NormalizerFailed(format!("{context} missing `{field}` string")))
527}
528
529fn flatten_invocations(nested: Vec<Vec<ToolInvocation>>) -> Vec<ToolInvocation> {
530    nested.into_iter().flatten().collect()
531}
532
533// ── Parsing helpers ───────────────────────────────────────────────────────────
534
535/// Parse one `[fn1(a=1), fn2(b=2)]` block from an LFM marker.
536fn parse_lfm_block(block: &str) -> Result<Vec<ToolInvocation>, KernelError> {
537    let block = block.trim();
538    // Strip optional surrounding `[ ]`.
539    let inner = block
540        .strip_prefix('[')
541        .and_then(|s| s.strip_suffix(']'))
542        .unwrap_or(block);
543
544    split_top_level(inner, ',')
545        .into_iter()
546        .filter(|s| !s.trim().is_empty())
547        .map(|s| parse_lfm_call(s.trim()))
548        .collect()
549}
550
551/// Parse one `fn_name(k1=v1, k2=v2)` call expression.
552fn parse_lfm_call(expr: &str) -> Result<ToolInvocation, KernelError> {
553    let (name_raw, rest) = expr.split_once('(').ok_or_else(|| {
554        KernelError::NormalizerFailed(format!("LFM: expected '(' in call: {expr:?}"))
555    })?;
556
557    let name = name_raw.trim().to_string();
558    if name.is_empty() {
559        return Err(KernelError::NormalizerFailed(
560            "LFM: empty tool name in call expression".into(),
561        ));
562    }
563    validate_identifier("tool name", &name)?;
564
565    // Use rsplit_once to handle nested parentheses in argument values.
566    let (kwargs_str, trailing) = rest.rsplit_once(')').ok_or_else(|| {
567        KernelError::NormalizerFailed(format!("LFM: missing closing ')' in: {expr:?}"))
568    })?;
569    if !trailing.trim().is_empty() {
570        return Err(KernelError::NormalizerFailed(format!(
571            "LFM: trailing content after call expression: {trailing:?}"
572        )));
573    }
574
575    let args = parse_kwargs(kwargs_str)?;
576    Ok(ToolInvocation { name, args })
577}
578
579/// Parse a comma-separated `key=value` kwargs string into a JSON object.
580fn parse_kwargs(s: &str) -> Result<Value, KernelError> {
581    let s = s.trim();
582    if s.is_empty() {
583        return Ok(Value::Object(Map::new()));
584    }
585
586    let mut map = Map::new();
587    for pair in split_top_level(s, ',') {
588        let pair = pair.trim();
589        if pair.is_empty() {
590            continue;
591        }
592        let (key_raw, val_raw) = pair.split_once('=').ok_or_else(|| {
593            KernelError::NormalizerFailed(format!("LFM: kwarg without '=': {pair:?}"))
594        })?;
595        let key = key_raw.trim().to_string();
596        if key.is_empty() {
597            return Err(KernelError::NormalizerFailed(
598                "LFM: empty kwarg name".into(),
599            ));
600        }
601        validate_identifier("kwarg name", &key)?;
602        if map.contains_key(&key) {
603            return Err(KernelError::NormalizerFailed(format!(
604                "LFM: duplicate kwarg: {key}"
605            )));
606        }
607        let val = parse_value(val_raw.trim())?;
608        map.insert(key, val);
609    }
610
611    Ok(Value::Object(map))
612}
613
614/// Best-effort conversion of a Python literal token into a JSON [`Value`].
615///
616/// Supported: single/double-quoted strings, `True`/`False`, `None`/`null`,
617/// integers, floats, lists, and dict/object literals. Anything else is
618/// returned as an unquoted string.
619fn parse_value(s: &str) -> Result<Value, KernelError> {
620    let s = s.trim();
621
622    if s.is_empty() {
623        return Ok(Value::String(String::new()));
624    }
625
626    // Single-quoted string.
627    if let Some(inner) = s.strip_prefix('\'').and_then(|t| t.strip_suffix('\'')) {
628        return Ok(Value::String(
629            inner.replace("\\'", "'").replace("\\\"", "\""),
630        ));
631    }
632    if s.starts_with('\'') {
633        return Err(KernelError::NormalizerFailed(
634            "LFM: unterminated single-quoted string".into(),
635        ));
636    }
637    // Double-quoted string.
638    if let Some(inner) = s.strip_prefix('"').and_then(|t| t.strip_suffix('"')) {
639        return Ok(Value::String(
640            inner.replace("\\'", "'").replace("\\\"", "\""),
641        ));
642    }
643    if s.starts_with('"') {
644        return Err(KernelError::NormalizerFailed(
645            "LFM: unterminated double-quoted string".into(),
646        ));
647    }
648    // Python booleans.
649    if s == "True" {
650        return Ok(Value::Bool(true));
651    }
652    if s == "False" {
653        return Ok(Value::Bool(false));
654    }
655    // Null / None.
656    if s == "None" || s == "null" {
657        return Ok(Value::Null);
658    }
659    // List / array literal.
660    if let Some(inner) = s.strip_prefix('[').and_then(|t| t.strip_suffix(']')) {
661        return parse_array(inner);
662    }
663    if s.starts_with('[') {
664        return Err(KernelError::NormalizerFailed(
665            "LFM: unterminated list literal".into(),
666        ));
667    }
668    // Dict / object literal.
669    if let Some(inner) = s.strip_prefix('{').and_then(|t| t.strip_suffix('}')) {
670        return parse_object(inner);
671    }
672    if s.starts_with('{') {
673        return Err(KernelError::NormalizerFailed(
674            "LFM: unterminated object literal".into(),
675        ));
676    }
677    // Integer.
678    if let Ok(n) = s.parse::<i64>() {
679        return Ok(Value::Number(n.into()));
680    }
681    // Float.
682    if let Ok(f) = s.parse::<f64>() {
683        let num = serde_json::Number::from_f64(f).ok_or_else(|| {
684            KernelError::NormalizerFailed(format!("LFM: non-finite float in argument: {s:?}"))
685        })?;
686        return Ok(Value::Number(num));
687    }
688    // Fall back: treat as an unquoted string literal.
689    Ok(Value::String(s.to_string()))
690}
691
692fn parse_array(inner: &str) -> Result<Value, KernelError> {
693    let inner = inner.trim();
694    if inner.is_empty() {
695        return Ok(Value::Array(Vec::new()));
696    }
697
698    let values = split_top_level(inner, ',')
699        .into_iter()
700        .filter(|part| !part.trim().is_empty())
701        .map(|part| parse_value(part.trim()))
702        .collect::<Result<Vec<_>, _>>()?;
703
704    Ok(Value::Array(values))
705}
706
707fn parse_object(inner: &str) -> Result<Value, KernelError> {
708    let inner = inner.trim();
709    if inner.is_empty() {
710        return Ok(Value::Object(Map::new()));
711    }
712
713    let mut map = Map::new();
714    for entry in split_top_level(inner, ',') {
715        let entry = entry.trim();
716        if entry.is_empty() {
717            continue;
718        }
719
720        let (key_raw, value_raw) = split_once_top_level(entry, ':').ok_or_else(|| {
721            KernelError::NormalizerFailed(format!("LFM: object entry without ':': {entry:?}"))
722        })?;
723        let key = parse_object_key(key_raw.trim())?;
724        if map.contains_key(&key) {
725            return Err(KernelError::NormalizerFailed(format!(
726                "LFM: duplicate object key: {key}"
727            )));
728        }
729
730        map.insert(key, parse_value(value_raw.trim())?);
731    }
732
733    Ok(Value::Object(map))
734}
735
736fn parse_object_key(raw: &str) -> Result<String, KernelError> {
737    match parse_value(raw)? {
738        Value::String(key) => Ok(key),
739        _ => Err(KernelError::NormalizerFailed(format!(
740            "LFM: object key must be a string: {raw:?}"
741        ))),
742    }
743}
744
745/// Validate model-emitted identifiers before they reach dispatch. Tool names
746/// allow the same separator characters commonly used in registries, while
747/// keyword argument names stay simple and JSON-object friendly.
748fn validate_identifier(kind: &str, value: &str) -> Result<(), KernelError> {
749    let valid = value
750        .chars()
751        .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-' | '.'));
752
753    if valid {
754        return Ok(());
755    }
756
757    Err(KernelError::NormalizerFailed(format!(
758        "invalid {kind}: {value:?}"
759    )))
760}
761
/// Cut `s` into the pieces separated by top-level occurrences of `delim`.
///
/// A delimiter is "top-level" when it is outside every bracket pair
/// (`()`, `[]`, `{}`) and outside single- or double-quoted text. Inside quotes
/// a backslash shields the following character, so `\'` does not close a
/// single-quoted string. Unbalanced closing brackets are tolerated: the
/// nesting depth never drops below zero.
///
/// The result always has at least one element and includes empty pieces at
/// the edges or between adjacent delimiters.
fn split_top_level(s: &str, delim: char) -> Vec<&str> {
    let mut pieces: Vec<&str> = Vec::new();
    let mut nesting: usize = 0;
    // `Some(q)` while inside a string opened by quote character `q`.
    let mut open_quote: Option<char> = None;
    let mut skip_next = false;
    let mut piece_start = 0usize;

    for (pos, c) in s.char_indices() {
        if skip_next {
            skip_next = false;
            continue;
        }

        if let Some(quote) = open_quote {
            if c == '\\' {
                skip_next = true;
            } else if c == quote {
                open_quote = None;
            }
            continue;
        }

        match c {
            '\'' | '"' => open_quote = Some(c),
            '(' | '[' | '{' => nesting = nesting.saturating_add(1),
            ')' | ']' | '}' => nesting = nesting.saturating_sub(1),
            _ if c == delim && nesting == 0 => {
                // `pos` comes from `char_indices`, so both bounds sit on char
                // boundaries and the checked slice cannot fail.
                pieces.push(s.get(piece_start..pos).unwrap_or(""));
                piece_start = pos + c.len_utf8();
            }
            _ => {}
        }
    }

    pieces.push(s.get(piece_start..).unwrap_or(""));
    pieces
}
810
811fn split_once_top_level(s: &str, delim: char) -> Option<(&str, &str)> {
812    split_index_top_level(s, delim).map(|idx| {
813        let left = s.get(..idx).unwrap_or("");
814        let right = s.get(idx + delim.len_utf8()..).unwrap_or("");
815        (left, right)
816    })
817}
818
/// Byte offset of the first `delim` in `s` that lies outside every bracket
/// pair (`()`, `[]`, `{}`) and outside quoted text, or `None` if there is no
/// such occurrence.
///
/// Inside quotes a backslash shields the following character. Unbalanced
/// closing brackets never push the nesting depth below zero.
fn split_index_top_level(s: &str, delim: char) -> Option<usize> {
    let mut nesting: usize = 0;
    // `Some(q)` while inside a string opened by quote character `q`.
    let mut open_quote: Option<char> = None;
    let mut skip_next = false;

    for (pos, c) in s.char_indices() {
        if skip_next {
            skip_next = false;
            continue;
        }

        if let Some(quote) = open_quote {
            if c == '\\' {
                skip_next = true;
            } else if c == quote {
                open_quote = None;
            }
            continue;
        }

        match c {
            '\'' | '"' => open_quote = Some(c),
            '(' | '[' | '{' => nesting = nesting.saturating_add(1),
            ')' | ']' | '}' => nesting = nesting.saturating_sub(1),
            _ if c == delim && nesting == 0 => return Some(pos),
            _ => {}
        }
    }

    None
}
858
859// ── Unit tests ────────────────────────────────────────────────────────────────
860
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{LocalTool, ToolRegistry, ToolSchema};
    use serde_json::json;
    use std::sync::Arc;

    /// Asserts that [`LfmNormalizer`] rejects `raw` with an error whose
    /// rendered message contains `needle`.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather
    /// than at this helper.
    #[track_caller]
    fn assert_lfm_error_contains(raw: &str, needle: &str) {
        let msg = LfmNormalizer.normalize(raw).unwrap_err().to_string();
        assert!(msg.contains(needle), "expected {needle:?} in: {msg}");
    }

    // ── is_applicable ──────────────────────────────────────────────────────

    #[test]
    fn not_applicable_for_plain_text() {
        assert!(!LfmNormalizer.is_applicable("hello world"));
    }

    #[test]
    fn applicable_when_start_marker_present() {
        assert!(
            LfmNormalizer
                .is_applicable("<|tool_call_start|>[get_weather(city='Berlin')]<|tool_call_end|>")
        );
    }

    // ── normalize: clean inputs ────────────────────────────────────────────

    #[test]
    fn plain_text_returns_empty() {
        let calls = LfmNormalizer
            .normalize("The weather in Berlin is sunny.")
            .unwrap();
        assert!(calls.is_empty());
    }

    #[test]
    fn single_call_string_arg() {
        let raw = "<|tool_call_start|>[get_weather(city='Berlin')]<|tool_call_end|>";
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "get_weather");
        assert_eq!(calls[0].args, json!({"city": "Berlin"}));
    }

    #[test]
    fn single_call_multiple_args() {
        let raw = "<|tool_call_start|>[search(query='rust async', limit=10)]<|tool_call_end|>";
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "search");
        assert_eq!(calls[0].args, json!({"query": "rust async", "limit": 10}));
    }

    #[test]
    fn single_call_no_args() {
        let raw = "<|tool_call_start|>[list_tools()]<|tool_call_end|>";
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "list_tools");
        assert_eq!(calls[0].args, json!({}));
    }

    #[test]
    fn multiple_calls_in_one_block() {
        let raw = "<|tool_call_start|>[get_weather(city='Berlin'), get_time(zone='UTC')]<|tool_call_end|>";
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].name, "get_weather");
        assert_eq!(calls[0].args, json!({"city": "Berlin"}));
        assert_eq!(calls[1].name, "get_time");
        assert_eq!(calls[1].args, json!({"zone": "UTC"}));
    }

    #[test]
    fn multiple_blocks_in_one_message() {
        let raw = concat!(
            "<|tool_call_start|>[step_one(x=1)]<|tool_call_end|>",
            " some text ",
            "<|tool_call_start|>[step_two(y=2)]<|tool_call_end|>",
        );
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].name, "step_one");
        assert_eq!(calls[1].name, "step_two");
    }

    #[test]
    fn block_without_brackets_is_parsed() {
        // Format without outer [ ] is also handled.
        let raw = "<|tool_call_start|>ping(target='8.8.8.8')<|tool_call_end|>";
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "ping");
        assert_eq!(calls[0].args, json!({"target": "8.8.8.8"}));
    }

    // ── value type coercion ────────────────────────────────────────────────

    #[test]
    fn integer_arg() {
        let raw = "<|tool_call_start|>[set_limit(n=42)]<|tool_call_end|>";
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls[0].args, json!({"n": 42}));
    }

    #[test]
    fn float_arg() {
        let raw = "<|tool_call_start|>[set_temp(t=0.7)]<|tool_call_end|>";
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls[0].args["t"].as_f64().unwrap(), 0.7);
    }

    #[test]
    fn boolean_args() {
        // Python-style capitalised literals map onto JSON booleans.
        let raw = "<|tool_call_start|>[configure(verbose=True, strict=False)]<|tool_call_end|>";
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls[0].args, json!({"verbose": true, "strict": false}));
    }

    #[test]
    fn null_args() {
        // Python-style `None` maps onto JSON null.
        let raw = "<|tool_call_start|>[reset(ctx=None)]<|tool_call_end|>";
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls[0].args, json!({"ctx": null}));
    }

    #[test]
    fn double_quoted_string_arg() {
        let raw = r#"<|tool_call_start|>[greet(name="world")]<|tool_call_end|>"#;
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls[0].args, json!({"name": "world"}));
    }

    #[test]
    fn nested_list_and_object_args() {
        let raw = "<|tool_call_start|>[plan(items=['a,b', 'c'], meta={'city': 'Berlin', 'coords': [52.52, 13.405], 'active': True})]<|tool_call_end|>";
        let calls = LfmNormalizer.normalize(raw).unwrap();
        assert_eq!(calls.len(), 1);
        assert_eq!(
            calls[0].args,
            json!({
                "items": ["a,b", "c"],
                "meta": {
                    "city": "Berlin",
                    "coords": [52.52, 13.405],
                    "active": true
                }
            })
        );
    }

    // ── structured (OpenAI-shaped) inputs ──────────────────────────────────

    #[test]
    fn openai_responses_function_call_item() {
        // `arguments` supplied as a JSON-encoded string.
        let value = json!({
            "type": "function_call",
            "id": "fc_123",
            "call_id": "call_123",
            "name": "get_weather",
            "arguments": "{\"city\":\"Berlin\"}",
            "status": "completed"
        });

        let calls = StructuredToolCallNormalizer::normalize_openai_responses(&value).unwrap();
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "get_weather");
        assert_eq!(calls[0].args, json!({"city": "Berlin"}));
    }

    #[test]
    fn openai_responses_full_response() {
        // `arguments` supplied as an already-decoded JSON object; the
        // non-function `message` item must not produce an invocation.
        let value = json!({
            "id": "resp_123",
            "output": [
                { "type": "message", "content": [] },
                {
                    "type": "function_call",
                    "id": "fc_123",
                    "call_id": "call_123",
                    "name": "search.docs",
                    "arguments": {"query": "tool calls"},
                    "status": "completed"
                }
            ]
        });

        let calls = StructuredToolCallNormalizer::normalize_openai_responses(&value).unwrap();
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "search.docs");
        assert_eq!(calls[0].args, json!({"query": "tool calls"}));
    }

    #[test]
    fn openai_chat_completions_tool_calls() {
        let value = json!({
            "choices": [{
                "message": {
                    "role": "assistant",
                    "content": null,
                    "tool_calls": [{
                        "id": "call_123",
                        "type": "function",
                        "function": {
                            "name": "get_weather",
                            "arguments": "{\"city\":\"Berlin\"}"
                        }
                    }]
                }
            }]
        });

        let calls =
            StructuredToolCallNormalizer::normalize_openai_chat_completions(&value).unwrap();
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].name, "get_weather");
        assert_eq!(calls[0].args, json!({"city": "Berlin"}));
    }

    #[test]
    fn structured_normalizer_aggregates_supported_shapes() {
        let responses_value = json!({
            "output": [{
                "type": "function_call",
                "name": "first",
                "arguments": "{}"
            }]
        });
        let chat_value = json!({
            "tool_calls": [{
                "function": {
                    "name": "second",
                    "arguments": {"ok": true}
                }
            }]
        });

        let responses_calls = StructuredToolCallNormalizer::normalize(&responses_value).unwrap();
        let chat_calls = StructuredToolCallNormalizer::normalize(&chat_value).unwrap();

        assert_eq!(responses_calls[0].name, "first");
        assert_eq!(chat_calls[0].name, "second");
        assert_eq!(chat_calls[0].args, json!({"ok": true}));
    }

    // ── error paths ────────────────────────────────────────────────────────

    #[test]
    fn unclosed_marker_returns_error() {
        assert_lfm_error_contains(
            "<|tool_call_start|>[get_weather(city='Berlin')]",
            "unclosed",
        );
    }

    #[test]
    fn missing_paren_returns_error() {
        // Block with no '(' — not a valid call expression.
        assert_lfm_error_contains(
            "<|tool_call_start|>[not_a_call]<|tool_call_end|>",
            "expected '('",
        );
    }

    #[test]
    fn kwarg_without_equals_returns_error() {
        assert_lfm_error_contains(
            "<|tool_call_start|>[fn(badarg)]<|tool_call_end|>",
            "kwarg without '='",
        );
    }

    #[test]
    fn invalid_tool_name_returns_error() {
        assert_lfm_error_contains(
            "<|tool_call_start|>[bad/name(arg=1)]<|tool_call_end|>",
            "invalid tool name",
        );
    }

    #[test]
    fn empty_kwarg_name_returns_error() {
        assert_lfm_error_contains(
            "<|tool_call_start|>[fn(=1)]<|tool_call_end|>",
            "empty kwarg name",
        );
    }

    #[test]
    fn duplicate_kwarg_returns_error() {
        assert_lfm_error_contains(
            "<|tool_call_start|>[fn(city='Berlin', city='Paris')]<|tool_call_end|>",
            "duplicate kwarg",
        );
    }

    #[test]
    fn malformed_standard_arguments_return_error() {
        let value = json!({
            "type": "function_call",
            "name": "bad_args",
            "arguments": "{not json}"
        });

        let err = StructuredToolCallNormalizer::normalize_openai_responses(&value).unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("arguments JSON"), "got: {msg}");
    }

    #[test]
    fn trailing_call_content_returns_error() {
        assert_lfm_error_contains(
            "<|tool_call_start|>[fn(arg=1) extra]<|tool_call_end|>",
            "trailing content",
        );
    }

    #[test]
    fn unterminated_nested_literal_returns_error() {
        assert_lfm_error_contains(
            "<|tool_call_start|>[fn(items=['a', 'b')]<|tool_call_end|>",
            "unterminated list",
        );
    }

    // ── dispatch ───────────────────────────────────────────────────────────

    #[tokio::test]
    async fn dispatch_invocations_runs_tools_in_order() {
        let tools = ToolRegistry::new();
        tools.register(Arc::new(LocalTool::new(
            ToolSchema {
                name: "echo".into(),
                description: "echoes args".into(),
                args_schema: json!({"type": "object"}),
                result_schema: json!({"type": "object"}),
            },
            |args| async move { Ok(json!({"seen": args})) },
        )));

        // Two calls with distinguishable arguments, so the order of the
        // results is actually observable.
        let invocations = LfmNormalizer
            .normalize(
                "<|tool_call_start|>[echo(value={'nested': [1, 2]}), echo(value=2)]<|tool_call_end|>",
            )
            .unwrap();
        let results = dispatch_tool_invocations(&tools, &invocations)
            .await
            .unwrap();

        assert_eq!(results.len(), 2);
        assert_eq!(results[0].invocation.name, "echo");
        assert_eq!(
            results[0].output,
            json!({"seen": {"value": {"nested": [1, 2]}}})
        );
        assert_eq!(results[1].invocation.name, "echo");
        assert_eq!(results[1].output, json!({"seen": {"value": 2}}));
    }

    // ── split_top_level helper ─────────────────────────────────────────────

    #[test]
    fn split_respects_parens() {
        // Comma inside parens must not split.
        let parts = split_top_level("fn(a, b), fn2(c)", ',');
        assert_eq!(parts, vec!["fn(a, b)", " fn2(c)"]);
    }

    #[test]
    fn split_respects_single_quotes() {
        let parts = split_top_level("a='x,y', b=2", ',');
        assert_eq!(parts, vec!["a='x,y'", " b=2"]);
    }

    #[test]
    fn split_respects_nested_arrays_and_objects() {
        let parts = split_top_level("a=[1, 2], b={'x': 'y,z'}, c=3", ',');
        assert_eq!(parts, vec!["a=[1, 2]", " b={'x': 'y,z'}", " c=3"]);
    }

    // ── split_index_top_level / split_once_top_level helpers ───────────────

    #[test]
    fn split_index_finds_first_top_level_delimiter() {
        assert_eq!(split_index_top_level("a=1", '='), Some(1));
        assert_eq!(split_index_top_level("a=b=c", '='), Some(1));
        assert_eq!(split_index_top_level("", '='), None);
        assert_eq!(split_index_top_level("badarg", '='), None);
    }

    #[test]
    fn split_index_skips_quoted_and_nested_delimiters() {
        assert_eq!(split_index_top_level("f(a=1)", '='), None);
        assert_eq!(split_index_top_level("'a=b'=c", '='), Some(5));
        assert_eq!(split_index_top_level(r#""a=b"=c"#, '='), Some(5));
        assert_eq!(split_index_top_level("{'k': 1}: 2", ':'), Some(8));
    }

    #[test]
    fn split_index_honours_backslash_escapes_inside_quotes() {
        // The escaped quote must not terminate the string early.
        assert_eq!(split_index_top_level(r"'it\'s=x'=y", '='), Some(9));
    }

    #[test]
    fn split_index_tolerates_unbalanced_closers() {
        // Depth saturates at zero instead of underflowing.
        assert_eq!(split_index_top_level(")=x", '='), Some(1));
    }

    #[test]
    fn split_once_splits_around_first_top_level_delimiter() {
        assert_eq!(
            split_once_top_level("city='Berlin'", '='),
            Some(("city", "'Berlin'"))
        );
        assert_eq!(split_once_top_level("a=b=c", '='), Some(("a", "b=c")));
        assert_eq!(
            split_once_top_level("{'k': 1}: 2", ':'),
            Some(("{'k': 1}", " 2"))
        );
        assert_eq!(split_once_top_level("badarg", '='), None);
    }

    #[test]
    fn split_once_handles_multibyte_text() {
        // Offsets are byte offsets; both halves must stay on char boundaries.
        assert_eq!(split_once_top_level("né=ü", '='), Some(("né", "ü")));
    }
}
rig_compose/normalizer.rs

rig_compose/
normalizer.rs