Skip to main content

sgr_agent/
oxide_chat_client.rs

1//! OxideChatClient — LlmClient via Chat Completions API (not Responses).
2//!
3//! For OpenAI-compatible endpoints that don't support /responses:
4//! Cloudflare AI Gateway compat, OpenRouter, local models, Workers AI.
5
6use crate::client::LlmClient;
7use crate::tool::ToolDef;
8use crate::types::{LlmConfig, Message, Role, SgrError, ToolCall};
9use openai_oxide::OpenAI;
10use openai_oxide::config::ClientConfig;
11use openai_oxide::types::chat::*;
12use serde_json::Value;
13
/// Record OTEL span for Chat Completions API call via shared telemetry helper.
/// AI-NOTE: OxideChatClient is the primary client for Nemotron (Cloudflare Workers AI).
#[cfg(feature = "telemetry")]
fn record_chat_otel(
    model: &str,
    messages: &[Message],
    usage: Option<&openai_oxide::types::chat::Usage>,
    tool_calls: &[ToolCall],
    text_output: &str,
) {
    // All token counts default to zero when the provider omitted usage data.
    let (prompt_tokens, completion_tokens, cached_tokens) = match usage {
        Some(u) => (
            u.prompt_tokens.unwrap_or(0),
            u.completion_tokens.unwrap_or(0),
            u.prompt_tokens_details
                .as_ref()
                .and_then(|d| d.cached_tokens)
                .unwrap_or(0),
        ),
        None => (0, 0, 0),
    };

    // (name, serialized-arguments) pairs for the span attributes.
    let call_pairs: Vec<(String, String)> = tool_calls
        .iter()
        .map(|call| (call.name.clone(), call.arguments.to_string()))
        .collect();

    crate::telemetry::record_llm_span(
        "chat.completions.api",
        model,
        // Input/output previews are truncated to keep span payloads small.
        &last_user_content(messages, 500),
        &truncate_str(text_output, 500),
        &call_pairs,
        &crate::telemetry::LlmUsage {
            prompt_tokens,
            completion_tokens,
            cached_tokens,
            response_model: model.to_string(),
        },
    );
}
58
/// No-op stand-in compiled when the `telemetry` feature is disabled,
/// so call sites can invoke `record_chat_otel` unconditionally.
#[cfg(not(feature = "telemetry"))]
fn record_chat_otel(
    _model: &str,
    _messages: &[Message],
    _usage: Option<&openai_oxide::types::chat::Usage>,
    _tool_calls: &[ToolCall],
    _text: &str,
) {
}
68
/// Content of the most recent User/Tool message, run through `truncate_str`
/// with `max_len`; empty string when no such message exists.
#[cfg(feature = "telemetry")]
fn last_user_content(messages: &[Message], max_len: usize) -> String {
    for msg in messages.iter().rev() {
        if matches!(msg.role, Role::User | Role::Tool) {
            return truncate_str(&msg.content, max_len);
        }
    }
    String::new()
}
78
/// Truncate `s` to at most `max_len` bytes, appending "..." when shortened.
///
/// The cut point is clamped back to the nearest UTF-8 character boundary:
/// the previous `&s[..max_len]` panicked whenever `max_len` landed inside a
/// multi-byte character (emoji / non-ASCII chat content).
#[cfg(feature = "telemetry")]
fn truncate_str(s: &str, max_len: usize) -> String {
    if s.len() <= max_len {
        return s.to_string();
    }
    // Walk back to a char boundary (at most 3 steps for UTF-8).
    let mut cut = max_len;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &s[..cut])
}
87
/// LlmClient backed by openai-oxide Chat Completions API.
pub struct OxideChatClient {
    // Underlying openai-oxide HTTP client.
    client: OpenAI,
    // Model identifier sent with every request.
    pub(crate) model: String,
    // Sampling temperature; forwarded only when Some.
    pub(crate) temperature: Option<f64>,
    // Completion-token cap; mapped to max_completion_tokens for gpt-5/o* models.
    pub(crate) max_tokens: Option<u32>,
    /// Reasoning effort — None disables reasoning for FC (DeepInfra Nemotron Super).
    pub(crate) reasoning_effort: Option<openai_oxide::types::chat::ReasoningEffort>,
    /// Server-side prompt prefix caching key (DeepInfra, OpenAI).
    pub(crate) prompt_cache_key: Option<String>,
}
99
impl OxideChatClient {
    /// Create from LlmConfig.
    ///
    /// API key resolution: `config.api_key`, then the `OPENAI_API_KEY` env
    /// var, then a `"dummy_key"` placeholder when a custom `base_url` is set.
    /// Returns `SgrError::Schema` when no key can be resolved at all.
    pub fn from_config(config: &LlmConfig) -> Result<Self, SgrError> {
        let api_key = config
            .api_key
            .clone()
            .or_else(|| std::env::var("OPENAI_API_KEY").ok())
            .unwrap_or_else(|| {
                if config.base_url.is_some() {
                    "dummy_key".into()
                } else {
                    "".into()
                }
            });

        if api_key.is_empty() {
            return Err(SgrError::Schema("No API key for oxide chat client".into()));
        }

        let mut client_config = ClientConfig::new(&api_key);
        if let Some(ref url) = config.base_url {
            client_config = client_config.base_url(url.clone());
        }
        config.apply_headers(&mut client_config);

        // Map the config string onto the SDK enum; unrecognized values (and
        // an absent setting) silently disable the reasoning_effort field.
        let reasoning_effort = config.reasoning_effort.as_deref().and_then(|s| match s {
            "none" => Some(openai_oxide::types::chat::ReasoningEffort::None),
            "low" => Some(openai_oxide::types::chat::ReasoningEffort::Low),
            "medium" => Some(openai_oxide::types::chat::ReasoningEffort::Medium),
            "high" => Some(openai_oxide::types::chat::ReasoningEffort::High),
            _ => None,
        });

        Ok(Self {
            client: OpenAI::with_config(client_config),
            model: config.model.clone(),
            temperature: Some(config.temp),
            max_tokens: config.max_tokens,
            reasoning_effort,
            prompt_cache_key: config.prompt_cache_key.clone(),
        })
    }

    /// Convert internal `Message`s into openai-oxide chat message params.
    fn build_messages(&self, messages: &[Message]) -> Vec<ChatCompletionMessageParam> {
        messages
            .iter()
            .map(|m| match m.role {
                Role::System => ChatCompletionMessageParam::System {
                    content: m.content.clone(),
                    name: None,
                },
                Role::User => ChatCompletionMessageParam::User {
                    content: UserContent::Text(m.content.clone()),
                    name: None,
                },
                Role::Assistant => {
                    // Re-serialize prior assistant tool calls so the provider
                    // can pair them with the Tool messages that follow.
                    let tc = if m.tool_calls.is_empty() {
                        None
                    } else {
                        Some(
                            m.tool_calls
                                .iter()
                                .map(|tc| openai_oxide::types::chat::ToolCall {
                                    id: tc.id.clone(),
                                    type_: "function".into(),
                                    function: openai_oxide::types::chat::FunctionCall {
                                        name: tc.name.clone(),
                                        arguments: tc.arguments.to_string(),
                                    },
                                })
                                .collect(),
                        )
                    };
                    ChatCompletionMessageParam::Assistant {
                        // Empty assistant content is sent as None, not "".
                        content: if m.content.is_empty() {
                            None
                        } else {
                            Some(m.content.clone())
                        },
                        name: None,
                        tool_calls: tc,
                        refusal: None,
                    }
                }
                Role::Tool => ChatCompletionMessageParam::Tool {
                    content: m.content.clone(),
                    // NOTE(review): a Tool message without tool_call_id sends
                    // "" — presumably never happens upstream; confirm.
                    tool_call_id: m.tool_call_id.clone().unwrap_or_default(),
                },
            })
            .collect()
    }

    /// Build a request using the client's configured reasoning effort.
    fn build_request(&self, messages: &[Message]) -> ChatCompletionRequest {
        self.build_request_with_reasoning(messages, self.reasoning_effort.as_ref())
    }

    /// Build a request with reasoning explicitly disabled (effort `None`)
    /// when the client has any reasoning configured; omitted otherwise.
    fn build_request_no_reasoning(&self, messages: &[Message]) -> ChatCompletionRequest {
        // Force reasoning off for action/tool execution calls (faster + cache friendly)
        if self.reasoning_effort.is_some() {
            self.build_request_with_reasoning(
                messages,
                Some(&openai_oxide::types::chat::ReasoningEffort::None),
            )
        } else {
            self.build_request_with_reasoning(messages, None)
        }
    }

    /// Assemble the chat request: messages, temperature, token cap, reasoning
    /// effort, and prompt cache key — each applied only when configured.
    fn build_request_with_reasoning(
        &self,
        messages: &[Message],
        reasoning: Option<&openai_oxide::types::chat::ReasoningEffort>,
    ) -> ChatCompletionRequest {
        let mut req = ChatCompletionRequest::new(&self.model, self.build_messages(messages));
        if let Some(temp) = self.temperature {
            req.temperature = Some(temp);
        }
        if let Some(max) = self.max_tokens {
            // gpt-5 / o-series models take max_completion_tokens instead of
            // max_tokens.
            // NOTE(review): starts_with("o") also matches any other model
            // name beginning with 'o' — confirm that is intended.
            if self.model.starts_with("gpt-5") || self.model.starts_with("o") {
                req = req.max_completion_tokens(max as i64);
            } else {
                req.max_tokens = Some(max as i64);
            }
        }
        if let Some(effort) = reasoning {
            req.reasoning_effort = Some(effort.clone());
        }
        if let Some(ref key) = self.prompt_cache_key {
            req.prompt_cache_key = Some(key.clone());
        }
        req
    }

    /// Extract tool calls from the first choice of the response.
    /// Arguments that fail to parse as JSON become `Value::Null`.
    fn extract_tool_calls(response: &ChatCompletionResponse) -> Vec<ToolCall> {
        let Some(choice) = response.choices.first() else {
            return Vec::new();
        };
        let Some(ref calls) = choice.message.tool_calls else {
            return Vec::new();
        };
        calls
            .iter()
            .map(|tc| ToolCall {
                id: tc.id.clone(),
                name: tc.function.name.clone(),
                arguments: serde_json::from_str(&tc.function.arguments).unwrap_or(Value::Null),
            })
            .collect()
    }
}
250
251#[async_trait::async_trait]
252impl LlmClient for OxideChatClient {
253    async fn structured_call(
254        &self,
255        messages: &[Message],
256        schema: &Value,
257    ) -> Result<(Option<Value>, Vec<ToolCall>, String), SgrError> {
258        // Skip ensure_strict for pre-strict schemas (e.g., from build_action_schema)
259        let strict_schema =
260            if schema.get("additionalProperties").and_then(|v| v.as_bool()) == Some(false) {
261                schema.clone()
262            } else {
263                let mut s = schema.clone();
264                openai_oxide::parsing::ensure_strict(&mut s);
265                s
266            };
267
268        let mut req = self.build_request(messages);
269        req.response_format = Some(ResponseFormat::JsonSchema {
270            json_schema: JsonSchema {
271                name: "response".into(),
272                description: None,
273                schema: Some(strict_schema),
274                strict: Some(true),
275            },
276        });
277
278        let response = self
279            .client
280            .chat()
281            .completions()
282            .create(req)
283            .await
284            .map_err(|e| SgrError::Api {
285                status: 0,
286                body: e.to_string(),
287            })?;
288
289        let raw_text = response
290            .choices
291            .first()
292            .and_then(|c| c.message.content.clone())
293            .unwrap_or_default();
294        let tool_calls = Self::extract_tool_calls(&response);
295        let parsed = serde_json::from_str::<Value>(&raw_text).ok();
296
297        if let Some(ref usage) = response.usage {
298            let input = usage.prompt_tokens.unwrap_or(0);
299            let cached = usage
300                .prompt_tokens_details
301                .as_ref()
302                .and_then(|d| d.cached_tokens)
303                .unwrap_or(0);
304            let output = usage.completion_tokens.unwrap_or(0);
305            if cached > 0 {
306                let pct = if input > 0 { cached * 100 / input } else { 0 };
307                eprintln!(
308                    "    💰 {}in/{}out (cached: {}, {}%)",
309                    input, output, cached, pct
310                );
311            } else {
312                eprintln!("    💰 {}in/{}out", input, output);
313            }
314        }
315
316        record_chat_otel(
317            &self.model,
318            messages,
319            response.usage.as_ref(),
320            &tool_calls,
321            &raw_text,
322        );
323        Ok((parsed, tool_calls, raw_text))
324    }
325
326    async fn tools_call(
327        &self,
328        messages: &[Message],
329        tools: &[ToolDef],
330    ) -> Result<Vec<ToolCall>, SgrError> {
331        // No reasoning for tool execution — faster + better cache hit
332        let mut req = self.build_request_no_reasoning(messages);
333
334        let chat_tools: Vec<Tool> = tools
335            .iter()
336            .map(|t| {
337                Tool::function(
338                    &t.name,
339                    if t.description.is_empty() {
340                        "No description"
341                    } else {
342                        &t.description
343                    },
344                    t.parameters.clone(),
345                )
346            })
347            .collect();
348        req.tools = Some(chat_tools);
349        req.tool_choice = Some(openai_oxide::types::chat::ToolChoice::Mode(
350            "required".into(),
351        ));
352
353        let response = self
354            .client
355            .chat()
356            .completions()
357            .create(req)
358            .await
359            .map_err(|e| SgrError::Api {
360                status: 0,
361                body: e.to_string(),
362            })?;
363
364        if let Some(ref usage) = response.usage {
365            let input = usage.prompt_tokens.unwrap_or(0);
366            let cached = usage
367                .prompt_tokens_details
368                .as_ref()
369                .and_then(|d| d.cached_tokens)
370                .unwrap_or(0);
371            let output = usage.completion_tokens.unwrap_or(0);
372            if cached > 0 {
373                let pct = if input > 0 { cached * 100 / input } else { 0 };
374                eprintln!(
375                    "    💰 {}in/{}out (cached: {}, {}%)",
376                    input, output, cached, pct
377                );
378            } else {
379                eprintln!("    💰 {}in/{}out", input, output);
380            }
381        }
382
383        let calls = Self::extract_tool_calls(&response);
384        record_chat_otel(&self.model, messages, response.usage.as_ref(), &calls, "");
385        // Don't synthesize finish — empty tool_calls signals completion to ToolCallingAgent.
386        Ok(calls)
387    }
388
389    async fn complete(&self, messages: &[Message]) -> Result<String, SgrError> {
390        let req = self.build_request(messages);
391
392        let response = self
393            .client
394            .chat()
395            .completions()
396            .create(req)
397            .await
398            .map_err(|e| SgrError::Api {
399                status: 0,
400                body: e.to_string(),
401            })?;
402
403        tracing::info!(model = %response.model, "oxide_chat.complete");
404
405        let text = response
406            .choices
407            .first()
408            .and_then(|c| c.message.content.clone())
409            .unwrap_or_default();
410        record_chat_otel(&self.model, messages, response.usage.as_ref(), &[], &text);
411        Ok(text)
412    }
413}