sgr_agent/oxide_client.rs

//! OxideClient — LlmClient adapter for the `openai-oxide` crate.
//!
//! Uses the **Responses API** (`POST /responses`) instead of Chat Completions.
//! With the `oxide-ws` feature: persistent WebSocket connection for roughly 20-25% lower latency.
//! Supports: structured output (json_schema), function calling, multi-turn (previous_response_id).
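//!
//! A minimal usage sketch (illustrative only; assumes an async context and a valid
//! API key — everything referenced below is defined in this crate):
//!
//! ```ignore
//! use crate::client::LlmClient;
//! use crate::types::{LlmConfig, Message};
//!
//! let config = LlmConfig::with_key("sk-...", "gpt-4o-mini");
//! let client = OxideClient::from_config(&config)?;
//! let answer = client
//!     .complete(&[Message::system("Be helpful."), Message::user("Hello")])
//!     .await?;
//! println!("{answer}");
//! ```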

use crate::client::LlmClient;
use crate::multimodal;
use crate::tool::ToolDef;
use crate::types::{LlmConfig, Message, Role, SgrError, ToolCall};
use openai_oxide::OpenAI;
use openai_oxide::config::ClientConfig;
use openai_oxide::types::responses::*;
use serde_json::Value;

/// Record OTEL span for Responses API call via shared telemetry helper.
#[cfg(feature = "telemetry")]
fn record_otel_usage(response: &Response, model: &str, messages: &[Message]) {
    let pt = response
        .usage
        .as_ref()
        .and_then(|u| u.input_tokens)
        .unwrap_or(0);
    let ct = response
        .usage
        .as_ref()
        .and_then(|u| u.output_tokens)
        .unwrap_or(0);
    let cached = response
        .usage
        .as_ref()
        .and_then(|u| u.input_tokens_details.as_ref())
        .and_then(|d| d.cached_tokens)
        .unwrap_or(0);

    let input = last_user_content(messages, 500);
    let output_text = response.output_text();
    let output = truncate_str(&output_text, 500);
    let tool_calls: Vec<(String, String)> = response
        .function_calls()
        .iter()
        .map(|fc| (fc.name.clone(), fc.arguments.to_string()))
        .collect();

    crate::telemetry::record_llm_span(
        "oxide.responses.api",
        model,
        &input,
        &output,
        &tool_calls,
        &crate::telemetry::LlmUsage {
            prompt_tokens: pt,
            completion_tokens: ct,
            cached_tokens: cached,
            response_model: response.model.clone(),
        },
    );
}

#[cfg(not(feature = "telemetry"))]
fn record_otel_usage(_response: &Response, _model: &str, _messages: &[Message]) {}

#[cfg(feature = "telemetry")]
fn last_user_content(messages: &[Message], max_len: usize) -> String {
    messages
        .iter()
        .rev()
        .find(|m| matches!(m.role, Role::User | Role::Tool))
        .map(|m| truncate_str(&m.content, max_len))
        .unwrap_or_default()
}

#[cfg(feature = "telemetry")]
fn truncate_str(s: &str, max_len: usize) -> String {
    use crate::str_ext::StrExt;
    let t = s.trunc(max_len);
    if t.len() < s.len() {
        format!("{t}...")
    } else {
        s.to_string()
    }
}

/// LlmClient backed by openai-oxide (Responses API).
///
/// With `oxide-ws` feature: call `connect_ws()` to upgrade to WebSocket mode.
/// All subsequent calls go over a persistent wss:// connection (roughly 20-25% lower latency).
pub struct OxideClient {
    client: OpenAI,
    pub(crate) model: String,
    pub(crate) temperature: Option<f64>,
    pub(crate) max_tokens: Option<u32>,
    /// `text.verbosity` for Responses API ("low" | "medium" | "high").
    /// `None` = let the API default apply.
    pub(crate) verbosity: Option<String>,
    /// `reasoning.effort` for Responses API on reasoning models.
    /// "none" | "minimal" | "low" | "medium" | "high" | "xhigh".
    /// `None` = let the model default apply (gpt-5* default = "medium").
    pub(crate) reasoning_effort: Option<openai_oxide::types::common::ReasoningEffort>,
    /// WebSocket session (when oxide-ws feature is enabled and connected).
    #[cfg(feature = "oxide-ws")]
    ws: tokio::sync::Mutex<Option<openai_oxide::websocket::WsSession>>,
    /// Lazy WS: true = connect on first request, false = HTTP only.
    #[cfg(feature = "oxide-ws")]
    ws_enabled: std::sync::atomic::AtomicBool,
}

/// OpenAI reasoning models reject the `temperature` parameter on the Responses API.
/// Detect by model id prefix — covers gpt-5, gpt-5.x, o1, o3, o4 families.
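///
/// A small illustration (mirrors the unit tests at the bottom of this file):
///
/// ```ignore
/// assert!(model_supports_temperature("gpt-4o-mini"));     // non-reasoning: temperature allowed
/// assert!(!model_supports_temperature("gpt-5-mini"));     // reasoning family: omit temperature
/// assert!(!model_supports_temperature("openai/gpt-5.5")); // provider prefix is stripped first
/// ```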
pub(crate) fn model_supports_temperature(model: &str) -> bool {
    let m = model.strip_prefix("openai/").unwrap_or(model);
    !(m.starts_with("gpt-5") || m.starts_with("o1") || m.starts_with("o3") || m.starts_with("o4"))
}

impl OxideClient {
    /// Create from LlmConfig.
    pub fn from_config(config: &LlmConfig) -> Result<Self, SgrError> {
        let api_key = config
            .api_key
            .clone()
            .or_else(|| std::env::var("OPENAI_API_KEY").ok())
            .unwrap_or_else(|| {
                if config.base_url.is_some() {
                    "dummy_key".into()
                } else {
                    "".into()
                }
            });

        if api_key.is_empty() {
            return Err(SgrError::Schema("No API key for oxide client".into()));
        }

        let mut client_config = ClientConfig::new(&api_key);
        if let Some(ref url) = config.base_url {
            client_config = client_config.base_url(url.clone());
        }
        config.apply_headers(&mut client_config);

        let reasoning_effort = config.reasoning_effort.as_deref().and_then(|s| {
            use openai_oxide::types::common::ReasoningEffort;
            match s {
                "none" => Some(ReasoningEffort::None),
                "minimal" => Some(ReasoningEffort::Minimal),
                "low" => Some(ReasoningEffort::Low),
                "medium" => Some(ReasoningEffort::Medium),
                "high" => Some(ReasoningEffort::High),
                "xhigh" => Some(ReasoningEffort::Xhigh),
                _ => None,
            }
        });

        Ok(Self {
            client: OpenAI::with_config(client_config),
            model: config.model.clone(),
            temperature: Some(config.temp),
            max_tokens: config.max_tokens,
            verbosity: config.verbosity.clone(),
            reasoning_effort,
            #[cfg(feature = "oxide-ws")]
            ws: tokio::sync::Mutex::new(None),
            #[cfg(feature = "oxide-ws")]
            ws_enabled: std::sync::atomic::AtomicBool::new(false),
        })
    }

    /// Enable WebSocket mode — lazy connect on first request.
    ///
    /// Does NOT open a connection immediately. The WS connection is established
    /// on the first `send_request_auto()` call, eliminating idle timeout issues.
    /// Falls back to HTTP automatically if WS fails.
    ///
    /// Requires `oxide-ws` feature.
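    ///
    /// A sketch of the intended call pattern (illustrative only; needs the `oxide-ws`
    /// feature and an async context):
    ///
    /// ```ignore
    /// let client = OxideClient::from_config(&config)?;
    /// client.connect_ws().await?;                   // only flips the lazy-connect flag
    /// let text = client.complete(&messages).await?; // first request opens the WS,
    ///                                               // falls back to HTTP if that fails
    /// ```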
    #[cfg(feature = "oxide-ws")]
    pub async fn connect_ws(&self) -> Result<(), SgrError> {
        self.ws_enabled
            .store(true, std::sync::atomic::Ordering::Relaxed);
        tracing::info!(model = %self.model, "oxide WebSocket enabled (lazy connect)");
        Ok(())
    }

    /// Send request — lazy WS connect + send, falls back to HTTP on any WS error.
    async fn send_request_auto(
        &self,
        request: ResponseCreateRequest,
    ) -> Result<Response, SgrError> {
        #[cfg(feature = "oxide-ws")]
        if self.ws_enabled.load(std::sync::atomic::Ordering::Relaxed) {
            let mut ws_guard = self.ws.lock().await;

            // Lazy connect
            if ws_guard.is_none() {
                match self.client.ws_session().await {
                    Ok(session) => {
                        tracing::info!(model = %self.model, "oxide WS connected (lazy)");
                        *ws_guard = Some(session);
                    }
                    Err(e) => {
                        tracing::warn!("oxide WS connect failed, using HTTP: {e}");
                        self.ws_enabled
                            .store(false, std::sync::atomic::Ordering::Relaxed);
                    }
                }
            }

            if let Some(ref mut session) = *ws_guard {
                match session.send(request.clone()).await {
                    Ok(response) => return Ok(response),
                    Err(e) => {
                        tracing::warn!("oxide WS send failed, falling back to HTTP: {e}");
                        *ws_guard = None;
                    }
                }
            }
        }

        // HTTP fallback
        self.client
            .responses()
            .create(request)
            .await
            .map_err(|e| SgrError::Api {
                status: 0,
                body: e.to_string(),
            })
    }

    /// Build a ResponseCreateRequest from messages + optional schema + optional chaining.
    ///
    /// - `previous_response_id` is None: full history as Messages format
    /// - `previous_response_id` is Some: Items format with function_call_output
    ///   (required for chaining after tool calls via Responses API)
    /// - `schema`: optional structured output json_schema config
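    ///
    /// For illustration (mirrors the unit tests): an explicit id switches to the Items
    /// format and sets `previous_response_id` on the request:
    ///
    /// ```ignore
    /// let req = client.build_request(&messages, None, Some("resp_xyz"));
    /// assert_eq!(req.previous_response_id.as_deref(), Some("resp_xyz"));
    /// ```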
    pub(crate) fn build_request(
        &self,
        messages: &[Message],
        schema: Option<&Value>,
        previous_response_id: Option<&str>,
    ) -> ResponseCreateRequest {
        if previous_response_id.is_some() {
            // Items format: messages + function_call_output items.
            // HTTP API accepts Messages format (without type), but WS API requires it.
            // Using Items consistently ensures both HTTP and WS work.
            return self.build_request_items(messages, previous_response_id);
        }

        // Messages format: standard request with optional structured output
        let mut input_items = Vec::new();

        for msg in messages {
            match msg.role {
                Role::System => {
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::System,
                        content: Value::String(msg.content.clone()),
                    });
                }
                Role::User => {
                    let content = if msg.images.is_empty() {
                        Value::String(msg.content.clone())
                    } else {
                        serde_json::to_value(multimodal::responses_parts(&msg.content, &msg.images))
                            .unwrap_or_else(|_| Value::String(msg.content.clone()))
                    };
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::User,
                        content,
                    });
                }
                Role::Assistant => {
                    // Include tool call info so structured_call context shows
                    // what action was taken.
                    let mut content = msg.content.clone();
                    if !msg.tool_calls.is_empty() {
                        for tc in &msg.tool_calls {
                            let args = tc.arguments.to_string();
                            let preview = if args.len() > 200 {
                                use crate::str_ext::StrExt;
                                args.trunc(200)
                            } else {
                                &args
                            };
                            content.push_str(&format!("\n→ {}({})", tc.name, preview));
                        }
                    }
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::Assistant,
                        content: Value::String(content),
                    });
                }
                Role::Tool => {
                    // Clean format — no "[Tool result for ...]" prefix.
                    // The assistant message above already has the action name.
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::User,
                        content: Value::String(msg.content.clone()),
                    });
                }
            }
        }

        let mut req = ResponseCreateRequest::new(&self.model);

        // Set input — prefer simple text when single user message (fewer tokens)
        if input_items.len() == 1 && input_items[0].role == openai_oxide::types::common::Role::User
        {
            if let Some(text) = input_items[0].content.as_str() {
                req = req.input(text);
            } else {
                req.input = Some(ResponseInput::Messages(input_items));
            }
        } else if !input_items.is_empty() {
            req.input = Some(ResponseInput::Messages(input_items));
        }

        // Temperature — skip for reasoning models (gpt-5*, o1/o3/o4) which reject it,
        // and skip the default 1.0 to reduce payload.
        if let Some(temp) = self.temperature
            && (temp - 1.0).abs() > f64::EPSILON
            && model_supports_temperature(&self.model)
        {
            req = req.temperature(temp);
        }

        // Max tokens
        if let Some(max) = self.max_tokens {
            req = req.max_output_tokens(max as i64);
        }

        // Reasoning effort (gpt-5*, o-series). Without an explicit effort, gpt-5*
        // defaults to "medium" which can blow through the output budget on hidden
        // reasoning before any tool call is emitted.
        if let Some(ref effort) = self.reasoning_effort {
            req.reasoning = Some(openai_oxide::types::responses::Reasoning {
                effort: Some(effort.clone()),
                summary: None,
            });
        }

        // Structured output via json_schema (and/or verbosity passthrough)
        match (schema, self.verbosity.clone()) {
            (Some(schema_val), v) => {
                req = req.text(ResponseTextConfig {
                    format: Some(ResponseTextFormat::JsonSchema {
                        name: "sgr_response".into(),
                        description: None,
                        schema: Some(schema_val.clone()),
                        strict: Some(true),
                    }),
                    verbosity: v,
                });
            }
            (None, Some(v)) => {
                req = req.text(ResponseTextConfig {
                    format: None,
                    verbosity: Some(v),
                });
            }
            (None, None) => {}
        }

        req
    }

    /// Build Items-format request for stateful chaining with previous_response_id.
    fn build_request_items(
        &self,
        messages: &[Message],
        previous_response_id: Option<&str>,
    ) -> ResponseCreateRequest {
        use openai_oxide::types::responses::ResponseInput;

        let mut items: Vec<Value> = Vec::new();

        for msg in messages {
            match msg.role {
                Role::Tool => {
                    if let Some(ref call_id) = msg.tool_call_id {
                        items.push(serde_json::json!({
                            "type": "function_call_output",
                            "call_id": call_id,
                            "output": msg.content
                        }));
                    }
                }
                Role::System => {
                    items.push(serde_json::json!({
                        "type": "message",
                        "role": "system",
                        "content": msg.content
                    }));
                }
                Role::User => {
                    let content = if msg.images.is_empty() {
                        serde_json::json!(msg.content)
                    } else {
                        serde_json::to_value(multimodal::responses_parts(&msg.content, &msg.images))
                            .unwrap_or_else(|_| serde_json::json!(msg.content))
                    };
                    items.push(serde_json::json!({
                        "type": "message",
                        "role": "user",
                        "content": content,
                    }));
                }
                Role::Assistant => {
                    items.push(serde_json::json!({
                        "type": "message",
                        "role": "assistant",
                        "content": msg.content
                    }));
                }
            }
        }

        let mut req = ResponseCreateRequest::new(&self.model);
        if !items.is_empty() {
            req.input = Some(ResponseInput::Items(items));
        }

        // Temperature — same gating as the structured-output path above.
        if let Some(temp) = self.temperature
            && (temp - 1.0).abs() > f64::EPSILON
            && model_supports_temperature(&self.model)
        {
            req = req.temperature(temp);
        }
        if let Some(max) = self.max_tokens {
            req = req.max_output_tokens(max as i64);
        }

        if let Some(v) = self.verbosity.clone() {
            req = req.text(ResponseTextConfig {
                format: None,
                verbosity: Some(v),
            });
        }

        if let Some(prev_id) = previous_response_id {
            req = req.previous_response_id(prev_id);
        }

        req
    }

    /// Function calling with explicit previous_response_id.
    /// Returns tool calls + new response_id for chaining.
    ///
    /// Always sets `store(true)` so responses can be referenced by subsequent calls.
    /// When `previous_response_id` is provided, only delta messages need to be sent
    /// (server has full history from previous stored response).
    ///
    /// Tool messages (role=Tool with tool_call_id) are converted to Responses API
    /// `function_call_output` items — required for chaining with previous_response_id.
    ///
    /// This method does NOT use the Mutex — all state is explicit via parameters/return.
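    ///
    /// A sketch of the chaining pattern a caller drives (illustrative only; `run_tool`
    /// is a hypothetical stand-in for real tool execution):
    ///
    /// ```ignore
    /// // First turn: full history, no chaining yet.
    /// let (calls, prev_id) = client.tools_call_stateful(&history, &tools, None).await?;
    /// // Execute the tool(s), then send only the delta; the server replays the rest
    /// // from the stored response referenced by `prev_id`.
    /// let outputs: Vec<Message> = calls
    ///     .iter()
    ///     .map(|c| Message::tool(&c.id, &run_tool(c))) // `run_tool` is hypothetical
    ///     .collect();
    /// let (_next_calls, _next_id) =
    ///     client.tools_call_stateful(&outputs, &tools, prev_id.as_deref()).await?;
    /// ```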
    async fn tools_call_stateful_impl(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
        previous_response_id: Option<&str>,
    ) -> Result<(Vec<ToolCall>, Option<String>), SgrError> {
        let mut req = self.build_request(messages, None, previous_response_id);
        // Always store so next call can chain via previous_response_id
        req = req.store(true);

        // Convert ToolDefs to ResponseTools with strict mode.
        // strict: true guarantees LLM output matches schema exactly (no parse errors).
        // oxide ensure_strict() handles: additionalProperties, all-required,
        // nullable→anyOf, allOf inlining, oneOf→anyOf.
        let response_tools: Vec<ResponseTool> = tools
            .iter()
            .map(|t| {
                let mut params = t.parameters.clone();
                openai_oxide::parsing::ensure_strict(&mut params);
                ResponseTool::Function {
                    name: t.name.clone(),
                    description: if t.description.is_empty() {
                        None
                    } else {
                        Some(t.description.clone())
                    },
                    parameters: Some(params),
                    strict: Some(true),
                }
            })
            .collect();
        req = req.tools(response_tools);

        let response = self.send_request_auto(req).await?;

        let response_id = response.id.clone();
        // No Mutex save — caller owns the response_id
        record_otel_usage(&response, &self.model, messages);

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);

        let chained = previous_response_id.is_some();
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };

        tracing::info!(
            model = %response.model,
            response_id = %response_id,
            input_tokens,
            cached_tokens,
            cache_pct,
            chained,
            "oxide.tools_call_stateful"
        );

        if cached_tokens > 0 {
            eprintln!(
                "    💰 {}in/{}out (cached: {}, {}%)",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0),
                cached_tokens,
                cache_pct
            );
        } else {
            eprintln!(
                "    💰 {}in/{}out",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0)
            );
        }

        Self::check_truncation(&response)?;
        Ok((Self::extract_tool_calls(&response), Some(response_id)))
    }

    /// Check if response was truncated due to max_output_tokens.
    /// Returns Err(MaxOutputTokens) if truncated, Ok(()) otherwise.
    fn check_truncation(response: &Response) -> Result<(), SgrError> {
        let is_incomplete = response
            .status
            .as_deref()
            .is_some_and(|s| s == "incomplete");
        let is_max_tokens = response
            .incomplete_details
            .as_ref()
            .and_then(|d| d.reason.as_deref())
            .is_some_and(|r| r == "max_output_tokens");

        if is_incomplete && is_max_tokens {
            return Err(SgrError::MaxOutputTokens {
                partial_content: response.output_text(),
            });
        }
        Ok(())
    }

    /// Extract tool calls from Responses API output items.
    fn extract_tool_calls(response: &Response) -> Vec<ToolCall> {
        response
            .function_calls()
            .into_iter()
            .map(|fc| ToolCall {
                id: fc.call_id,
                name: fc.name,
                arguments: fc.arguments,
            })
            .collect()
    }
}

#[async_trait::async_trait]
impl LlmClient for OxideClient {
    async fn structured_call(
        &self,
        messages: &[Message],
        schema: &Value,
    ) -> Result<(Option<Value>, Vec<ToolCall>, String), SgrError> {
        // Make schema OpenAI-strict — UNLESS it's already strict
        // (build_action_schema produces pre-strict schemas that ensure_strict would break)
        let strict_schema =
            if schema.get("additionalProperties").and_then(|v| v.as_bool()) == Some(false) {
                // Already strict-compatible (e.g., from build_action_schema)
                schema.clone()
            } else {
                let mut s = schema.clone();
                openai_oxide::parsing::ensure_strict(&mut s);
                s
            };

        // Stateless — build request with full message history, no chaining.
        // store(true) enables server-side prompt caching for stable prefix.
        let mut req = self.build_request(messages, Some(&strict_schema), None);
        req = req.store(true);

        let span = tracing::info_span!(
            "oxide.responses.create",
            model = %self.model,
            method = "structured_call",
        );
        let _enter = span.enter();

        // Debug: dump schema on first call
        if std::env::var("SGR_DEBUG_SCHEMA").is_ok()
            && let Some(ref text_cfg) = req.text
        {
            eprintln!(
                "[sgr] Schema: {}",
                serde_json::to_string(text_cfg).unwrap_or_default()
            );
        }

        let response = self.send_request_auto(req).await?;

        // No Mutex save — structured_call is stateless
        record_otel_usage(&response, &self.model, messages);

        Self::check_truncation(&response)?;

        let raw_text = response.output_text();
        if std::env::var("SGR_DEBUG").is_ok() {
            eprintln!("[sgr] Raw response: {}", {
                use crate::str_ext::StrExt;
                raw_text.trunc(500)
            });
        }
        let tool_calls = Self::extract_tool_calls(&response);
        let parsed = serde_json::from_str::<Value>(&raw_text).ok();

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };

        {
            let output_tokens = response
                .usage
                .as_ref()
                .and_then(|u| u.output_tokens)
                .unwrap_or(0);
            if cached_tokens > 0 {
                eprintln!(
                    "    💰 {}in/{}out (cached: {}, {}%)",
                    input_tokens, output_tokens, cached_tokens, cache_pct
                );
            } else {
                eprintln!("    💰 {}in/{}out", input_tokens, output_tokens);
            }
        }

        Ok((parsed, tool_calls, raw_text))
    }

    async fn tools_call(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
    ) -> Result<Vec<ToolCall>, SgrError> {
        // Stateless — no previous_response_id, full message history.
        // store(true) enables server-side prompt caching: OpenAI auto-caches
        // the stable prefix (system prompt + tools) for requests >1024 tokens.
        let mut req = self.build_request(messages, None, None);
        req = req.store(true);

        // Convert ToolDefs to ResponseTools — no strict mode (faster server-side)
        let response_tools: Vec<ResponseTool> = tools
            .iter()
            .map(|t| ResponseTool::Function {
                name: t.name.clone(),
                description: if t.description.is_empty() {
                    None
                } else {
                    Some(t.description.clone())
                },
                parameters: Some(t.parameters.clone()),
                strict: None, // AI-NOTE: strict=true breaks tools with optional params
            })
            .collect();
        req = req.tools(response_tools);

        // Force model to always call a tool — prevents text-only responses
        // that lose answer content (tools_call only returns Vec<ToolCall>).
        req = req.tool_choice(openai_oxide::types::responses::ResponseToolChoice::Mode(
            "required".into(),
        ));
        // AI-NOTE: explicit parallel_tool_calls=true for OpenAI. Anthropic models on OpenRouter
        // reject this param (404); they use `disable_parallel_tool_use` natively, not exposed here.
        if !self.model.contains("anthropic/") && !self.model.starts_with("claude") {
            req = req.parallel_tool_calls(true);
        }

        let response = self.send_request_auto(req).await?;

        record_otel_usage(&response, &self.model, messages);
        Self::check_truncation(&response)?;

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };

        if cached_tokens > 0 {
            eprintln!(
                "    💰 {}in/{}out (cached: {}, {}%)",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0),
                cached_tokens,
                cache_pct
            );
        } else {
            eprintln!(
                "    💰 {}in/{}out",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0)
            );
        }

        let calls = Self::extract_tool_calls(&response);
        Ok(calls)
    }

    async fn tools_call_stateful(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
        previous_response_id: Option<&str>,
    ) -> Result<(Vec<ToolCall>, Option<String>), SgrError> {
        self.tools_call_stateful_impl(messages, tools, previous_response_id)
            .await
    }

    /// tool_choice=auto so model can emit reasoning text ALONGSIDE tool calls in one response.
    /// Returns (tool_calls, reasoning_text). Used by single-phase agent to get 1 LLM call/step.
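    ///
    /// A sketch of how a single-phase agent step consumes this (illustrative only):
    ///
    /// ```ignore
    /// let (calls, reasoning) = client.tools_call_with_text(&history, &tools).await?;
    /// // `reasoning` is the model's visible text for the step; `calls` may be empty
    /// // because tool_choice is "auto" here, not "required".
    /// println!("thought: {reasoning}");
    /// for call in &calls {
    ///     println!("action: {}({})", call.name, call.arguments);
    /// }
    /// ```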
    async fn tools_call_with_text(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
    ) -> Result<(Vec<ToolCall>, String), SgrError> {
        let mut req = self.build_request(messages, None, None);
        req = req.store(true);

        let response_tools: Vec<ResponseTool> = tools
            .iter()
            .map(|t| ResponseTool::Function {
                name: t.name.clone(),
                description: if t.description.is_empty() {
                    None
                } else {
                    Some(t.description.clone())
                },
                parameters: Some(t.parameters.clone()),
                strict: None,
            })
            .collect();
        req = req.tools(response_tools);
        // AI-NOTE: tool_choice=auto (not required) — model can return text+tools in same response.
        // This is the key for single-phase: reasoning in text, action in tool calls, 1 LLM call.
        req = req.tool_choice(openai_oxide::types::responses::ResponseToolChoice::Mode(
            "auto".into(),
        ));
        req = req.parallel_tool_calls(true);

        let response = self.send_request_auto(req).await?;

        record_otel_usage(&response, &self.model, messages);
        Self::check_truncation(&response)?;

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };
        let output_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.output_tokens)
            .unwrap_or(0);
        if cached_tokens > 0 {
            eprintln!(
                "    💰 {}in/{}out (cached: {}, {}%)",
                input_tokens, output_tokens, cached_tokens, cache_pct
            );
        } else {
            eprintln!("    💰 {}in/{}out", input_tokens, output_tokens);
        }

        let text = response.output_text();
        let calls = Self::extract_tool_calls(&response);
        Ok((calls, text))
    }

    async fn complete(&self, messages: &[Message]) -> Result<String, SgrError> {
        let mut req = self.build_request(messages, None, None);
        req = req.store(true);

        let response = self.send_request_auto(req).await?;

        record_otel_usage(&response, &self.model, messages);
        Self::check_truncation(&response)?;

        let text = response.output_text();
        if text.is_empty() {
            return Err(SgrError::EmptyResponse);
        }

        tracing::info!(
            model = %response.model,
            response_id = %response.id,
            input_tokens = response.usage.as_ref().and_then(|u| u.input_tokens).unwrap_or(0),
            output_tokens = response.usage.as_ref().and_then(|u| u.output_tokens).unwrap_or(0),
            "oxide.complete"
        );

        Ok(text)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::ImagePart;

    #[test]
    fn oxide_client_from_config() {
        // Just test construction doesn't panic
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        assert_eq!(client.model, "gpt-5.4");
    }

    #[test]
    fn build_request_simple() {
        let config = LlmConfig::with_key("sk-test", "gpt-4o-mini").temperature(0.5);
        let client = OxideClient::from_config(&config).unwrap();
        let messages = vec![Message::system("Be helpful."), Message::user("Hello")];
        let req = client.build_request(&messages, None, None);
        assert_eq!(req.model, "gpt-4o-mini");
        assert!(req.instructions.is_none());
        assert!(req.input.is_some());
        assert_eq!(req.temperature, Some(0.5));
    }

    #[test]
    fn temperature_skipped_for_reasoning_models() {
        // gpt-5*, o1, o3, o4 reject the `temperature` param on the Responses API.
        for model in [
            "gpt-5",
            "gpt-5-mini",
            "gpt-5.4",
            "gpt-5.5",
            "o1-mini",
            "o3",
            "o4-mini",
        ] {
            let config = LlmConfig::with_key("sk-test", model).temperature(0.7);
            let client = OxideClient::from_config(&config).unwrap();
            let req = client.build_request(&[Message::user("Hi")], None, None);
            assert_eq!(
                req.temperature, None,
                "temperature must be omitted for reasoning model `{model}`"
            );
        }
    }

    #[test]
    fn model_supports_temperature_classification() {
        assert!(model_supports_temperature("gpt-4o"));
        assert!(model_supports_temperature("gpt-4o-mini"));
        assert!(model_supports_temperature("gpt-4.1"));
        assert!(model_supports_temperature("anthropic/claude-sonnet-4.6"));
        assert!(!model_supports_temperature("gpt-5"));
        assert!(!model_supports_temperature("gpt-5-mini"));
        assert!(!model_supports_temperature("gpt-5.4"));
        assert!(!model_supports_temperature("gpt-5.5"));
        assert!(!model_supports_temperature("openai/gpt-5.5"));
        assert!(!model_supports_temperature("o1-preview"));
        assert!(!model_supports_temperature("o3-mini"));
        assert!(!model_supports_temperature("o4-mini"));
    }

    #[test]
    fn build_request_with_schema() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        let schema = serde_json::json!({
            "type": "object",
            "properties": {"answer": {"type": "string"}},
            "required": ["answer"]
        });
        let req = client.build_request(&[Message::user("Hi")], Some(&schema), None);
        assert!(req.text.is_some());
    }

    #[test]
    fn build_request_stateless_no_previous_response_id() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();

        let req = client.build_request(&[Message::user("Hi")], None, None);
        assert!(
            req.previous_response_id.is_none(),
            "build_request must be stateless when no explicit ID"
        );
    }

    #[test]
    fn build_request_explicit_chaining() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();

        // With previous_response_id — uses Items format for chaining
        let req = client.build_request(&[Message::user("Hi")], None, Some("resp_xyz"));
        assert_eq!(
            req.previous_response_id.as_deref(),
            Some("resp_xyz"),
            "build_request should chain with explicit previous_response_id"
        );
    }

    #[test]
    fn build_request_tool_outputs_chaining() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();

        // With previous_response_id — tool outputs as function_call_output items
        let messages = vec![Message::tool("call_1", "result data")];
        let req = client.build_request(&messages, None, Some("resp_123"));
        assert_eq!(req.previous_response_id.as_deref(), Some("resp_123"));

        // Without previous_response_id
        let req = client.build_request(&messages, None, None);
        assert!(
            req.previous_response_id.is_none(),
            "build_request must be stateless when no explicit ID"
        );
    }

    #[test]
    fn build_request_multimodal_user() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        let img = ImagePart {
            data: "AAAA".into(),
            mime_type: "image/jpeg".into(),
        };
        let messages = vec![Message::user_with_images("Describe this", vec![img])];
        let req = client.build_request(&messages, None, None);

        // Single user with images must serialize as content-parts array, not string
        let input = req.input.as_ref().expect("input missing");
        let serialized = serde_json::to_value(input).unwrap();
        let s = serde_json::to_string(&serialized).unwrap();
        assert!(s.contains("input_text"), "missing input_text part: {s}");
        assert!(s.contains("input_image"), "missing input_image part: {s}");
        assert!(
            s.contains("data:image/jpeg;base64,AAAA"),
            "missing data URL: {s}"
        );
    }

    #[test]
    fn build_request_items_multimodal_user() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        let img = ImagePart {
            data: "BBBB".into(),
            mime_type: "image/png".into(),
        };
        let messages = vec![Message::user_with_images("What's on screen?", vec![img])];
        // previous_response_id triggers items-format path
        let req = client.build_request(&messages, None, Some("resp_prev"));

        let input = req.input.as_ref().expect("input missing");
        let s = serde_json::to_string(input).unwrap();
        assert!(
            s.contains("input_text"),
            "items path missing input_text: {s}"
        );
        assert!(
            s.contains("input_image"),
            "items path missing input_image: {s}"
        );
        assert!(
            s.contains("data:image/png;base64,BBBB"),
            "items path missing data URL: {s}"
        );
    }
}