sgr_agent/oxide_client.rs

//! OxideClient — LlmClient adapter for the `openai-oxide` crate.
//!
//! Uses the **Responses API** (`POST /responses`) instead of Chat Completions.
//! With the `oxide-ws` feature: a persistent WebSocket connection gives roughly 20-25% lower latency.
//! Supports: structured output (json_schema), function calling, multi-turn (previous_response_id).
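//!
//! A minimal usage sketch (hypothetical key/model values; assumes an async context
//! and default HTTP mode):
//!
//! ```ignore
//! let config = LlmConfig::with_key("sk-...", "gpt-4o-mini").temperature(0.2);
//! let client = OxideClient::from_config(&config)?;
//! // `complete` sends the full message history via POST /responses.
//! let answer = client.complete(&[Message::user("Hello")]).await?;
//! ```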

use crate::client::LlmClient;
use crate::multimodal;
use crate::tool::ToolDef;
use crate::types::{LlmConfig, Message, Role, SgrError, ToolCall};
use openai_oxide::OpenAI;
use openai_oxide::config::ClientConfig;
use openai_oxide::types::responses::*;
use serde_json::Value;

/// Record OTEL span for Responses API call via shared telemetry helper.
#[cfg(feature = "telemetry")]
fn record_otel_usage(response: &Response, model: &str, messages: &[Message]) {
    let pt = response
        .usage
        .as_ref()
        .and_then(|u| u.input_tokens)
        .unwrap_or(0);
    let ct = response
        .usage
        .as_ref()
        .and_then(|u| u.output_tokens)
        .unwrap_or(0);
    let cached = response
        .usage
        .as_ref()
        .and_then(|u| u.input_tokens_details.as_ref())
        .and_then(|d| d.cached_tokens)
        .unwrap_or(0);

    let input = last_user_content(messages, 500);
    let output_text = response.output_text();
    let output = truncate_str(&output_text, 500);
    let tool_calls: Vec<(String, String)> = response
        .function_calls()
        .iter()
        .map(|fc| (fc.name.clone(), fc.arguments.to_string()))
        .collect();

    crate::telemetry::record_llm_span(
        "oxide.responses.api",
        model,
        &input,
        &output,
        &tool_calls,
        &crate::telemetry::LlmUsage {
            prompt_tokens: pt,
            completion_tokens: ct,
            cached_tokens: cached,
            response_model: response.model.clone(),
        },
    );
}

#[cfg(not(feature = "telemetry"))]
fn record_otel_usage(_response: &Response, _model: &str, _messages: &[Message]) {}

#[cfg(feature = "telemetry")]
fn last_user_content(messages: &[Message], max_len: usize) -> String {
    messages
        .iter()
        .rev()
        .find(|m| matches!(m.role, Role::User | Role::Tool))
        .map(|m| truncate_str(&m.content, max_len))
        .unwrap_or_default()
}

#[cfg(feature = "telemetry")]
fn truncate_str(s: &str, max_len: usize) -> String {
    use crate::str_ext::StrExt;
    let t = s.trunc(max_len);
    if t.len() < s.len() {
        format!("{t}...")
    } else {
        s.to_string()
    }
}

/// LlmClient backed by openai-oxide (Responses API).
///
/// With the `oxide-ws` feature: call `connect_ws()` to upgrade to WebSocket mode.
/// All subsequent calls go over a persistent wss:// connection (roughly 20-25% lower latency).
pub struct OxideClient {
    client: OpenAI,
    pub(crate) model: String,
    pub(crate) temperature: Option<f64>,
    pub(crate) max_tokens: Option<u32>,
    /// `text.verbosity` for Responses API ("low" | "medium" | "high").
    /// `None` = let the API default apply.
    pub(crate) verbosity: Option<String>,
    /// WebSocket session (when oxide-ws feature is enabled and connected).
    #[cfg(feature = "oxide-ws")]
    ws: tokio::sync::Mutex<Option<openai_oxide::websocket::WsSession>>,
    /// Lazy WS: true = connect on first request, false = HTTP only.
    #[cfg(feature = "oxide-ws")]
    ws_enabled: std::sync::atomic::AtomicBool,
}

/// OpenAI reasoning models reject the `temperature` parameter on the Responses API.
/// Detect by model id prefix — covers gpt-5, gpt-5.x, o1, o3, o4 families.
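/// For example, `model_supports_temperature("gpt-4o")` returns `true`, while
/// `model_supports_temperature("o3-mini")` returns `false` (see the unit tests below).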
pub(crate) fn model_supports_temperature(model: &str) -> bool {
    let m = model.strip_prefix("openai/").unwrap_or(model);
    !(m.starts_with("gpt-5") || m.starts_with("o1") || m.starts_with("o3") || m.starts_with("o4"))
}

impl OxideClient {
    /// Create from LlmConfig.
    pub fn from_config(config: &LlmConfig) -> Result<Self, SgrError> {
        let api_key = config
            .api_key
            .clone()
            .or_else(|| std::env::var("OPENAI_API_KEY").ok())
            .unwrap_or_else(|| {
                if config.base_url.is_some() {
                    "dummy_key".into()
                } else {
                    "".into()
                }
            });

        if api_key.is_empty() {
            return Err(SgrError::Schema("No API key for oxide client".into()));
        }

        let mut client_config = ClientConfig::new(&api_key);
        if let Some(ref url) = config.base_url {
            client_config = client_config.base_url(url.clone());
        }
        config.apply_headers(&mut client_config);

        Ok(Self {
            client: OpenAI::with_config(client_config),
            model: config.model.clone(),
            temperature: Some(config.temp),
            max_tokens: config.max_tokens,
            verbosity: config.verbosity.clone(),
            #[cfg(feature = "oxide-ws")]
            ws: tokio::sync::Mutex::new(None),
            #[cfg(feature = "oxide-ws")]
            ws_enabled: std::sync::atomic::AtomicBool::new(false),
        })
    }

    /// Enable WebSocket mode — lazy connect on first request.
    ///
    /// Does NOT open a connection immediately. The WS connection is established
    /// on the first `send_request_auto()` call, eliminating idle timeout issues.
    /// Falls back to HTTP automatically if WS fails.
    ///
    /// Requires `oxide-ws` feature.
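    ///
    /// A sketch of the intended call pattern (assumes the `oxide-ws` feature is enabled
    /// and an async context):
    ///
    /// ```ignore
    /// let client = OxideClient::from_config(&config)?;
    /// client.connect_ws().await?;                   // only flips the lazy-connect flag
    /// let text = client.complete(&messages).await?; // first request opens the WS
    /// ```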
    #[cfg(feature = "oxide-ws")]
    pub async fn connect_ws(&self) -> Result<(), SgrError> {
        self.ws_enabled
            .store(true, std::sync::atomic::Ordering::Relaxed);
        tracing::info!(model = %self.model, "oxide WebSocket enabled (lazy connect)");
        Ok(())
    }

    /// Send request — lazy WS connect + send, falls back to HTTP on any WS error.
    async fn send_request_auto(
        &self,
        request: ResponseCreateRequest,
    ) -> Result<Response, SgrError> {
        #[cfg(feature = "oxide-ws")]
        if self.ws_enabled.load(std::sync::atomic::Ordering::Relaxed) {
            let mut ws_guard = self.ws.lock().await;

            // Lazy connect
            if ws_guard.is_none() {
                match self.client.ws_session().await {
                    Ok(session) => {
                        tracing::info!(model = %self.model, "oxide WS connected (lazy)");
                        *ws_guard = Some(session);
                    }
                    Err(e) => {
                        tracing::warn!("oxide WS connect failed, using HTTP: {e}");
                        self.ws_enabled
                            .store(false, std::sync::atomic::Ordering::Relaxed);
                    }
                }
            }

            if let Some(ref mut session) = *ws_guard {
                match session.send(request.clone()).await {
                    Ok(response) => return Ok(response),
                    Err(e) => {
                        tracing::warn!("oxide WS send failed, falling back to HTTP: {e}");
                        *ws_guard = None;
                    }
                }
            }
        }

        // HTTP fallback
        self.client
            .responses()
            .create(request)
            .await
            .map_err(|e| SgrError::Api {
                status: 0,
                body: e.to_string(),
            })
    }

    /// Build a ResponseCreateRequest from messages + optional schema + optional chaining.
    ///
    /// - `previous_response_id` is None: full history as Messages format
    /// - `previous_response_id` is Some: Items format with function_call_output
    ///   (required for chaining after tool calls via Responses API)
    /// - `schema`: optional structured output json_schema config
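    ///
    /// Rough shape of the Items-format input built by `build_request_items`
    /// (illustrative values only):
    ///
    /// ```json
    /// [
    ///   { "type": "function_call_output", "call_id": "call_1", "output": "..." },
    ///   { "type": "message", "role": "user", "content": "..." }
    /// ]
    /// ```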
    pub(crate) fn build_request(
        &self,
        messages: &[Message],
        schema: Option<&Value>,
        previous_response_id: Option<&str>,
    ) -> ResponseCreateRequest {
        if previous_response_id.is_some() {
            // Items format: messages + function_call_output items.
            // The HTTP API accepts the Messages format (items without a "type" field),
            // but the WS API requires typed items. Using Items consistently works for both.
            return self.build_request_items(messages, previous_response_id);
        }

        // Messages format: standard request with optional structured output
        let mut input_items = Vec::new();

        for msg in messages {
            match msg.role {
                Role::System => {
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::System,
                        content: Value::String(msg.content.clone()),
                    });
                }
                Role::User => {
                    let content = if msg.images.is_empty() {
                        Value::String(msg.content.clone())
                    } else {
                        serde_json::to_value(multimodal::responses_parts(&msg.content, &msg.images))
                            .unwrap_or_else(|_| Value::String(msg.content.clone()))
                    };
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::User,
                        content,
                    });
                }
                Role::Assistant => {
                    // Include tool call info so structured_call context shows
                    // what action was taken.
                    let mut content = msg.content.clone();
                    if !msg.tool_calls.is_empty() {
                        for tc in &msg.tool_calls {
                            let args = tc.arguments.to_string();
                            let preview = if args.len() > 200 {
                                use crate::str_ext::StrExt;
                                args.trunc(200)
                            } else {
                                &args
                            };
                            content.push_str(&format!("\n→ {}({})", tc.name, preview));
                        }
                    }
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::Assistant,
                        content: Value::String(content),
                    });
                }
                Role::Tool => {
                    // Clean format — no "[Tool result for ...]" prefix.
                    // The assistant message above already has the action name.
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::User,
                        content: Value::String(msg.content.clone()),
                    });
                }
            }
        }

        let mut req = ResponseCreateRequest::new(&self.model);

        // Set input — prefer simple text when single user message (fewer tokens)
        if input_items.len() == 1 && input_items[0].role == openai_oxide::types::common::Role::User
        {
            if let Some(text) = input_items[0].content.as_str() {
                req = req.input(text);
            } else {
                req.input = Some(ResponseInput::Messages(input_items));
            }
        } else if !input_items.is_empty() {
            req.input = Some(ResponseInput::Messages(input_items));
        }

        // Temperature — skip for reasoning models (gpt-5*, o1/o3/o4) which reject it,
        // and skip the default 1.0 to reduce payload.
        if let Some(temp) = self.temperature
            && (temp - 1.0).abs() > f64::EPSILON
            && model_supports_temperature(&self.model)
        {
            req = req.temperature(temp);
        }

        // Max tokens
        if let Some(max) = self.max_tokens {
            req = req.max_output_tokens(max as i64);
        }

        // Structured output via json_schema (and/or verbosity passthrough)
        match (schema, self.verbosity.clone()) {
            (Some(schema_val), v) => {
                req = req.text(ResponseTextConfig {
                    format: Some(ResponseTextFormat::JsonSchema {
                        name: "sgr_response".into(),
                        description: None,
                        schema: Some(schema_val.clone()),
                        strict: Some(true),
                    }),
                    verbosity: v,
                });
            }
            (None, Some(v)) => {
                req = req.text(ResponseTextConfig {
                    format: None,
                    verbosity: Some(v),
                });
            }
            (None, None) => {}
        }

        req
    }

    /// Build Items-format request for stateful chaining with previous_response_id.
    fn build_request_items(
        &self,
        messages: &[Message],
        previous_response_id: Option<&str>,
    ) -> ResponseCreateRequest {
        use openai_oxide::types::responses::ResponseInput;

        let mut items: Vec<Value> = Vec::new();

        for msg in messages {
            match msg.role {
                Role::Tool => {
                    if let Some(ref call_id) = msg.tool_call_id {
                        items.push(serde_json::json!({
                            "type": "function_call_output",
                            "call_id": call_id,
                            "output": msg.content
                        }));
                    }
                }
                Role::System => {
                    items.push(serde_json::json!({
                        "type": "message",
                        "role": "system",
                        "content": msg.content
                    }));
                }
                Role::User => {
                    let content = if msg.images.is_empty() {
                        serde_json::json!(msg.content)
                    } else {
                        serde_json::to_value(multimodal::responses_parts(&msg.content, &msg.images))
                            .unwrap_or_else(|_| serde_json::json!(msg.content))
                    };
                    items.push(serde_json::json!({
                        "type": "message",
                        "role": "user",
                        "content": content,
                    }));
                }
                Role::Assistant => {
                    items.push(serde_json::json!({
                        "type": "message",
                        "role": "assistant",
                        "content": msg.content
                    }));
                }
            }
        }

        let mut req = ResponseCreateRequest::new(&self.model);
        if !items.is_empty() {
            req.input = Some(ResponseInput::Items(items));
        }

        // Temperature — same gating as the structured-output path above.
        if let Some(temp) = self.temperature
            && (temp - 1.0).abs() > f64::EPSILON
            && model_supports_temperature(&self.model)
        {
            req = req.temperature(temp);
        }
        if let Some(max) = self.max_tokens {
            req = req.max_output_tokens(max as i64);
        }

        if let Some(v) = self.verbosity.clone() {
            req = req.text(ResponseTextConfig {
                format: None,
                verbosity: Some(v),
            });
        }

        if let Some(prev_id) = previous_response_id {
            req = req.previous_response_id(prev_id);
        }

        req
    }

    /// Function calling with explicit previous_response_id.
    /// Returns tool calls + new response_id for chaining.
    ///
    /// Always sets `store(true)` so responses can be referenced by subsequent calls.
    /// When `previous_response_id` is provided, only delta messages need to be sent
    /// (server has full history from previous stored response).
    ///
    /// Tool messages (role=Tool with tool_call_id) are converted to Responses API
    /// `function_call_output` items — required for chaining with previous_response_id.
    ///
    /// This method does NOT use the Mutex — all state is explicit via parameters/return.
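    ///
    /// Chaining sketch (hypothetical caller loop; the caller threads `response_id` through):
    ///
    /// ```ignore
    /// let (calls, id) = client.tools_call_stateful(&history, &tools, None).await?;
    /// // ... execute `calls`, collect Message::tool(call_id, output) for each ...
    /// let (next_calls, next_id) = client
    ///     .tools_call_stateful(&tool_outputs, &tools, id.as_deref())
    ///     .await?;
    /// ```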
    async fn tools_call_stateful_impl(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
        previous_response_id: Option<&str>,
    ) -> Result<(Vec<ToolCall>, Option<String>), SgrError> {
        let mut req = self.build_request(messages, None, previous_response_id);
        // Always store so next call can chain via previous_response_id
        req = req.store(true);

        // Convert ToolDefs to ResponseTools with strict mode.
        // strict: true guarantees LLM output matches schema exactly (no parse errors).
        // oxide ensure_strict() handles: additionalProperties, all-required,
        // nullable→anyOf, allOf inlining, oneOf→anyOf.
        let response_tools: Vec<ResponseTool> = tools
            .iter()
            .map(|t| {
                let mut params = t.parameters.clone();
                openai_oxide::parsing::ensure_strict(&mut params);
                ResponseTool::Function {
                    name: t.name.clone(),
                    description: if t.description.is_empty() {
                        None
                    } else {
                        Some(t.description.clone())
                    },
                    parameters: Some(params),
                    strict: Some(true),
                }
            })
            .collect();
        req = req.tools(response_tools);

        let response = self.send_request_auto(req).await?;

        let response_id = response.id.clone();
        // No Mutex save — caller owns the response_id
        record_otel_usage(&response, &self.model, messages);

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);

        let chained = previous_response_id.is_some();
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };

        tracing::info!(
            model = %response.model,
            response_id = %response_id,
            input_tokens,
            cached_tokens,
            cache_pct,
            chained,
            "oxide.tools_call_stateful"
        );

        if cached_tokens > 0 {
            eprintln!(
                "    💰 {}in/{}out (cached: {}, {}%)",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0),
                cached_tokens,
                cache_pct
            );
        } else {
            eprintln!(
                "    💰 {}in/{}out",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0)
            );
        }

        Self::check_truncation(&response)?;
        Ok((Self::extract_tool_calls(&response), Some(response_id)))
    }

    /// Check if response was truncated due to max_output_tokens.
    /// Returns Err(MaxOutputTokens) if truncated, Ok(()) otherwise.
    fn check_truncation(response: &Response) -> Result<(), SgrError> {
        let is_incomplete = response
            .status
            .as_deref()
            .is_some_and(|s| s == "incomplete");
        let is_max_tokens = response
            .incomplete_details
            .as_ref()
            .and_then(|d| d.reason.as_deref())
            .is_some_and(|r| r == "max_output_tokens");

        if is_incomplete && is_max_tokens {
            return Err(SgrError::MaxOutputTokens {
                partial_content: response.output_text(),
            });
        }
        Ok(())
    }

    /// Extract tool calls from Responses API output items.
    fn extract_tool_calls(response: &Response) -> Vec<ToolCall> {
        response
            .function_calls()
            .into_iter()
            .map(|fc| ToolCall {
                id: fc.call_id,
                name: fc.name,
                arguments: fc.arguments,
            })
            .collect()
    }
}

#[async_trait::async_trait]
impl LlmClient for OxideClient {
    async fn structured_call(
        &self,
        messages: &[Message],
        schema: &Value,
    ) -> Result<(Option<Value>, Vec<ToolCall>, String), SgrError> {
        // Make schema OpenAI-strict — UNLESS it's already strict
        // (build_action_schema produces pre-strict schemas that ensure_strict would break)
        let strict_schema =
            if schema.get("additionalProperties").and_then(|v| v.as_bool()) == Some(false) {
                // Already strict-compatible (e.g., from build_action_schema)
                schema.clone()
            } else {
                let mut s = schema.clone();
                openai_oxide::parsing::ensure_strict(&mut s);
                s
            };

        // Stateless — build request with full message history, no chaining.
        // store(true) enables server-side prompt caching for stable prefix.
        let mut req = self.build_request(messages, Some(&strict_schema), None);
        req = req.store(true);

        let span = tracing::info_span!(
            "oxide.responses.create",
            model = %self.model,
            method = "structured_call",
        );
        let _enter = span.enter();

        // Debug: dump schema on first call
        if std::env::var("SGR_DEBUG_SCHEMA").is_ok()
            && let Some(ref text_cfg) = req.text
        {
            eprintln!(
                "[sgr] Schema: {}",
                serde_json::to_string(text_cfg).unwrap_or_default()
            );
        }

        let response = self.send_request_auto(req).await?;

        // No Mutex save — structured_call is stateless
        record_otel_usage(&response, &self.model, messages);

        Self::check_truncation(&response)?;

        let raw_text = response.output_text();
        if std::env::var("SGR_DEBUG").is_ok() {
            eprintln!("[sgr] Raw response: {}", {
                use crate::str_ext::StrExt;
                raw_text.trunc(500)
            });
        }
        let tool_calls = Self::extract_tool_calls(&response);
        let parsed = serde_json::from_str::<Value>(&raw_text).ok();

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };

        {
            let output_tokens = response
                .usage
                .as_ref()
                .and_then(|u| u.output_tokens)
                .unwrap_or(0);
            if cached_tokens > 0 {
                eprintln!(
                    "    💰 {}in/{}out (cached: {}, {}%)",
                    input_tokens, output_tokens, cached_tokens, cache_pct
                );
            } else {
                eprintln!("    💰 {}in/{}out", input_tokens, output_tokens);
            }
        }

        Ok((parsed, tool_calls, raw_text))
    }

    async fn tools_call(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
    ) -> Result<Vec<ToolCall>, SgrError> {
        // Stateless — no previous_response_id, full message history.
        // store(true) enables server-side prompt caching: OpenAI auto-caches
        // the stable prefix (system prompt + tools) for requests >1024 tokens.
        let mut req = self.build_request(messages, None, None);
        req = req.store(true);

        // Convert ToolDefs to ResponseTools — no strict mode (faster server-side)
        let response_tools: Vec<ResponseTool> = tools
            .iter()
            .map(|t| ResponseTool::Function {
                name: t.name.clone(),
                description: if t.description.is_empty() {
                    None
                } else {
                    Some(t.description.clone())
                },
                parameters: Some(t.parameters.clone()),
                strict: None, // AI-NOTE: strict=true breaks tools with optional params
            })
            .collect();
        req = req.tools(response_tools);

        // Force model to always call a tool — prevents text-only responses
        // that lose answer content (tools_call only returns Vec<ToolCall>).
        req = req.tool_choice(openai_oxide::types::responses::ResponseToolChoice::Mode(
            "required".into(),
        ));
        // AI-NOTE: explicit parallel_tool_calls=true for OpenAI. Anthropic models on OpenRouter
        // reject this param (404); they use `disable_parallel_tool_use` natively, not exposed here.
        if !self.model.contains("anthropic/") && !self.model.starts_with("claude") {
            req = req.parallel_tool_calls(true);
        }

        let response = self.send_request_auto(req).await?;

        record_otel_usage(&response, &self.model, messages);
        Self::check_truncation(&response)?;

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };

        if cached_tokens > 0 {
            eprintln!(
                "    💰 {}in/{}out (cached: {}, {}%)",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0),
                cached_tokens,
                cache_pct
            );
        } else {
            eprintln!(
                "    💰 {}in/{}out",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0)
            );
        }

        let calls = Self::extract_tool_calls(&response);
        Ok(calls)
    }

    async fn tools_call_stateful(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
        previous_response_id: Option<&str>,
    ) -> Result<(Vec<ToolCall>, Option<String>), SgrError> {
        self.tools_call_stateful_impl(messages, tools, previous_response_id)
            .await
    }

    /// tool_choice=auto so model can emit reasoning text ALONGSIDE tool calls in one response.
    /// Returns (tool_calls, reasoning_text). Used by single-phase agent to get 1 LLM call/step.
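    ///
    /// Consumption sketch (hypothetical variable names):
    ///
    /// ```ignore
    /// let (calls, reasoning) = client.tools_call_with_text(&messages, &tools).await?;
    /// // `reasoning` carries the model's free text; `calls` carries the chosen actions.
    /// ```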
    async fn tools_call_with_text(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
    ) -> Result<(Vec<ToolCall>, String), SgrError> {
        let mut req = self.build_request(messages, None, None);
        req = req.store(true);

        let response_tools: Vec<ResponseTool> = tools
            .iter()
            .map(|t| ResponseTool::Function {
                name: t.name.clone(),
                description: if t.description.is_empty() {
                    None
                } else {
                    Some(t.description.clone())
                },
                parameters: Some(t.parameters.clone()),
                strict: None,
            })
            .collect();
        req = req.tools(response_tools);
        // AI-NOTE: tool_choice=auto (not required) — model can return text+tools in same response.
        // This is the key for single-phase: reasoning in text, action in tool calls, 1 LLM call.
        req = req.tool_choice(openai_oxide::types::responses::ResponseToolChoice::Mode(
            "auto".into(),
        ));
        req = req.parallel_tool_calls(true);

        let response = self.send_request_auto(req).await?;

        record_otel_usage(&response, &self.model, messages);
        Self::check_truncation(&response)?;

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };
        let output_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.output_tokens)
            .unwrap_or(0);
        if cached_tokens > 0 {
            eprintln!(
                "    💰 {}in/{}out (cached: {}, {}%)",
                input_tokens, output_tokens, cached_tokens, cache_pct
            );
        } else {
            eprintln!("    💰 {}in/{}out", input_tokens, output_tokens);
        }

        let text = response.output_text();
        let calls = Self::extract_tool_calls(&response);
        Ok((calls, text))
    }

    async fn complete(&self, messages: &[Message]) -> Result<String, SgrError> {
        let mut req = self.build_request(messages, None, None);
        req = req.store(true);

        let response = self.send_request_auto(req).await?;

        record_otel_usage(&response, &self.model, messages);
        Self::check_truncation(&response)?;

        let text = response.output_text();
        if text.is_empty() {
            return Err(SgrError::EmptyResponse);
        }

        tracing::info!(
            model = %response.model,
            response_id = %response.id,
            input_tokens = response.usage.as_ref().and_then(|u| u.input_tokens).unwrap_or(0),
            output_tokens = response.usage.as_ref().and_then(|u| u.output_tokens).unwrap_or(0),
            "oxide.complete"
        );

        Ok(text)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::ImagePart;

    #[test]
    fn oxide_client_from_config() {
        // Just test construction doesn't panic
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        assert_eq!(client.model, "gpt-5.4");
    }

    #[test]
    fn build_request_simple() {
        let config = LlmConfig::with_key("sk-test", "gpt-4o-mini").temperature(0.5);
        let client = OxideClient::from_config(&config).unwrap();
        let messages = vec![Message::system("Be helpful."), Message::user("Hello")];
        let req = client.build_request(&messages, None, None);
        assert_eq!(req.model, "gpt-4o-mini");
        assert!(req.instructions.is_none());
        assert!(req.input.is_some());
        assert_eq!(req.temperature, Some(0.5));
    }

    #[test]
    fn temperature_skipped_for_reasoning_models() {
        // gpt-5*, o1, o3, o4 reject the `temperature` param on the Responses API.
        for model in [
            "gpt-5",
            "gpt-5-mini",
            "gpt-5.4",
            "gpt-5.5",
            "o1-mini",
            "o3",
            "o4-mini",
        ] {
            let config = LlmConfig::with_key("sk-test", model).temperature(0.7);
            let client = OxideClient::from_config(&config).unwrap();
            let req = client.build_request(&[Message::user("Hi")], None, None);
            assert_eq!(
                req.temperature, None,
                "temperature must be omitted for reasoning model `{model}`"
            );
        }
    }

    #[test]
    fn model_supports_temperature_classification() {
        assert!(model_supports_temperature("gpt-4o"));
        assert!(model_supports_temperature("gpt-4o-mini"));
        assert!(model_supports_temperature("gpt-4.1"));
        assert!(model_supports_temperature("anthropic/claude-sonnet-4.6"));
        assert!(!model_supports_temperature("gpt-5"));
        assert!(!model_supports_temperature("gpt-5-mini"));
        assert!(!model_supports_temperature("gpt-5.4"));
        assert!(!model_supports_temperature("gpt-5.5"));
        assert!(!model_supports_temperature("openai/gpt-5.5"));
        assert!(!model_supports_temperature("o1-preview"));
        assert!(!model_supports_temperature("o3-mini"));
        assert!(!model_supports_temperature("o4-mini"));
    }

    #[test]
    fn build_request_with_schema() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        let schema = serde_json::json!({
            "type": "object",
            "properties": {"answer": {"type": "string"}},
            "required": ["answer"]
        });
        let req = client.build_request(&[Message::user("Hi")], Some(&schema), None);
        assert!(req.text.is_some());
    }

    #[test]
    fn build_request_stateless_no_previous_response_id() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();

        let req = client.build_request(&[Message::user("Hi")], None, None);
        assert!(
            req.previous_response_id.is_none(),
            "build_request must be stateless when no explicit ID"
        );
    }

    #[test]
    fn build_request_explicit_chaining() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();

        // With previous_response_id — uses Items format for chaining
        let req = client.build_request(&[Message::user("Hi")], None, Some("resp_xyz"));
        assert_eq!(
            req.previous_response_id.as_deref(),
            Some("resp_xyz"),
            "build_request should chain with explicit previous_response_id"
        );
    }

    #[test]
    fn build_request_tool_outputs_chaining() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();

        // With previous_response_id — tool outputs as function_call_output items
        let messages = vec![Message::tool("call_1", "result data")];
        let req = client.build_request(&messages, None, Some("resp_123"));
        assert_eq!(req.previous_response_id.as_deref(), Some("resp_123"));

        // Without previous_response_id
        let req = client.build_request(&messages, None, None);
        assert!(
            req.previous_response_id.is_none(),
            "build_request must be stateless when no explicit ID"
        );
    }

    #[test]
    fn build_request_multimodal_user() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        let img = ImagePart {
            data: "AAAA".into(),
            mime_type: "image/jpeg".into(),
        };
        let messages = vec![Message::user_with_images("Describe this", vec![img])];
        let req = client.build_request(&messages, None, None);

        // Single user with images must serialize as content-parts array, not string
        let input = req.input.as_ref().expect("input missing");
        let serialized = serde_json::to_value(input).unwrap();
        let s = serde_json::to_string(&serialized).unwrap();
        assert!(s.contains("input_text"), "missing input_text part: {s}");
        assert!(s.contains("input_image"), "missing input_image part: {s}");
        assert!(
            s.contains("data:image/jpeg;base64,AAAA"),
            "missing data URL: {s}"
        );
    }

    #[test]
    fn build_request_items_multimodal_user() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        let img = ImagePart {
            data: "BBBB".into(),
            mime_type: "image/png".into(),
        };
        let messages = vec![Message::user_with_images("What's on screen?", vec![img])];
        // previous_response_id triggers items-format path
        let req = client.build_request(&messages, None, Some("resp_prev"));

        let input = req.input.as_ref().expect("input missing");
        let s = serde_json::to_string(input).unwrap();
        assert!(
            s.contains("input_text"),
            "items path missing input_text: {s}"
        );
        assert!(
            s.contains("input_image"),
            "items path missing input_image: {s}"
        );
        assert!(
            s.contains("data:image/png;base64,BBBB"),
            "items path missing data URL: {s}"
        );
    }
}