// sgr_agent/oxide_client.rs

//! OxideClient — LlmClient adapter for the `openai-oxide` crate.
//!
//! Uses the **Responses API** (`POST /responses`) instead of Chat Completions.
//! With the `oxide-ws` feature: a persistent WebSocket connection cuts latency by roughly 20-25%.
//! Supports: structured output (json_schema), function calling, multi-turn (previous_response_id).
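//!
//! A minimal construction sketch (hedged: it mirrors the unit tests at the
//! bottom of this file; the surrounding imports and async context are assumed):
//!
//! ```ignore
//! let config = LlmConfig::with_key("sk-test", "gpt-5.4");
//! let client = OxideClient::from_config(&config)?;
//! let text = client.complete(&[Message::user("Hello")]).await?;
//! ```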

use crate::client::LlmClient;
use crate::multimodal;
use crate::tool::ToolDef;
use crate::types::{LlmConfig, Message, Role, SgrError, ToolCall};
use openai_oxide::OpenAI;
use openai_oxide::config::ClientConfig;
use openai_oxide::types::responses::*;
use serde_json::Value;

/// Record OTEL span for Responses API call via shared telemetry helper.
#[cfg(feature = "telemetry")]
fn record_otel_usage(response: &Response, model: &str, messages: &[Message]) {
    let pt = response
        .usage
        .as_ref()
        .and_then(|u| u.input_tokens)
        .unwrap_or(0);
    let ct = response
        .usage
        .as_ref()
        .and_then(|u| u.output_tokens)
        .unwrap_or(0);
    let cached = response
        .usage
        .as_ref()
        .and_then(|u| u.input_tokens_details.as_ref())
        .and_then(|d| d.cached_tokens)
        .unwrap_or(0);

    let input = last_user_content(messages, 500);
    let output_text = response.output_text();
    let output = truncate_str(&output_text, 500);
    let tool_calls: Vec<(String, String)> = response
        .function_calls()
        .iter()
        .map(|fc| (fc.name.clone(), fc.arguments.to_string()))
        .collect();

    crate::telemetry::record_llm_span(
        "oxide.responses.api",
        model,
        &input,
        &output,
        &tool_calls,
        &crate::telemetry::LlmUsage {
            prompt_tokens: pt,
            completion_tokens: ct,
            cached_tokens: cached,
            response_model: response.model.clone(),
        },
    );
}

#[cfg(not(feature = "telemetry"))]
fn record_otel_usage(_response: &Response, _model: &str, _messages: &[Message]) {}

/// Most recent User or Tool message content, truncated, for telemetry.
#[cfg(feature = "telemetry")]
fn last_user_content(messages: &[Message], max_len: usize) -> String {
    messages
        .iter()
        .rev()
        .find(|m| matches!(m.role, Role::User | Role::Tool))
        .map(|m| truncate_str(&m.content, max_len))
        .unwrap_or_default()
}

#[cfg(feature = "telemetry")]
fn truncate_str(s: &str, max_len: usize) -> String {
    use crate::str_ext::StrExt;
    let t = s.trunc(max_len);
    if t.len() < s.len() {
        format!("{t}...")
    } else {
        s.to_string()
    }
}
/// LlmClient backed by openai-oxide (Responses API).
///
/// With the `oxide-ws` feature: call `connect_ws()` to upgrade to WebSocket mode.
/// All subsequent calls go over a persistent wss:// connection (roughly 20-25% lower latency).
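///
/// A hedged sketch of enabling WS mode (assumes a constructed `client`; the
/// actual latency win depends on the deployment):
///
/// ```ignore
/// let client = OxideClient::from_config(&config)?;
/// #[cfg(feature = "oxide-ws")]
/// client.connect_ws().await?; // lazy: the socket opens on the first request
/// ```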
pub struct OxideClient {
    client: OpenAI,
    pub(crate) model: String,
    pub(crate) temperature: Option<f64>,
    pub(crate) max_tokens: Option<u32>,
    /// `text.verbosity` for Responses API ("low" | "medium" | "high").
    /// `None` = let the API default apply.
    pub(crate) verbosity: Option<String>,
    /// WebSocket session (when oxide-ws feature is enabled and connected).
    #[cfg(feature = "oxide-ws")]
    ws: tokio::sync::Mutex<Option<openai_oxide::websocket::WsSession>>,
    /// Lazy WS: true = connect on first request, false = HTTP only.
    #[cfg(feature = "oxide-ws")]
    ws_enabled: std::sync::atomic::AtomicBool,
}

impl OxideClient {
    /// Create from LlmConfig.
    pub fn from_config(config: &LlmConfig) -> Result<Self, SgrError> {
        let api_key = config
            .api_key
            .clone()
            .or_else(|| std::env::var("OPENAI_API_KEY").ok())
            .unwrap_or_else(|| {
                if config.base_url.is_some() {
                    "dummy_key".into()
                } else {
                    "".into()
                }
            });

        if api_key.is_empty() {
            return Err(SgrError::Schema("No API key for oxide client".into()));
        }

        let mut client_config = ClientConfig::new(&api_key);
        if let Some(ref url) = config.base_url {
            client_config = client_config.base_url(url.clone());
        }
        config.apply_headers(&mut client_config);

        Ok(Self {
            client: OpenAI::with_config(client_config),
            model: config.model.clone(),
            temperature: Some(config.temp),
            max_tokens: config.max_tokens,
            verbosity: config.verbosity.clone(),
            #[cfg(feature = "oxide-ws")]
            ws: tokio::sync::Mutex::new(None),
            #[cfg(feature = "oxide-ws")]
            ws_enabled: std::sync::atomic::AtomicBool::new(false),
        })
    }

    /// Enable WebSocket mode — lazy connect on first request.
    ///
    /// Does NOT open a connection immediately. The WS connection is established
    /// on the first `send_request_auto()` call, eliminating idle timeout issues.
    /// Falls back to HTTP automatically if WS fails.
    ///
    /// Requires `oxide-ws` feature.
    #[cfg(feature = "oxide-ws")]
    pub async fn connect_ws(&self) -> Result<(), SgrError> {
        self.ws_enabled
            .store(true, std::sync::atomic::Ordering::Relaxed);
        tracing::info!(model = %self.model, "oxide WebSocket enabled (lazy connect)");
        Ok(())
    }

    /// Send request — lazy WS connect + send, falls back to HTTP on any WS error.
    async fn send_request_auto(
        &self,
        request: ResponseCreateRequest,
    ) -> Result<Response, SgrError> {
        #[cfg(feature = "oxide-ws")]
        if self.ws_enabled.load(std::sync::atomic::Ordering::Relaxed) {
            let mut ws_guard = self.ws.lock().await;

            // Lazy connect
            if ws_guard.is_none() {
                match self.client.ws_session().await {
                    Ok(session) => {
                        tracing::info!(model = %self.model, "oxide WS connected (lazy)");
                        *ws_guard = Some(session);
                    }
                    Err(e) => {
                        tracing::warn!("oxide WS connect failed, using HTTP: {e}");
                        self.ws_enabled
                            .store(false, std::sync::atomic::Ordering::Relaxed);
                    }
                }
            }

            if let Some(ref mut session) = *ws_guard {
                match session.send(request.clone()).await {
                    Ok(response) => return Ok(response),
                    Err(e) => {
                        tracing::warn!("oxide WS send failed, falling back to HTTP: {e}");
                        *ws_guard = None;
                    }
                }
            }
        }

        // HTTP fallback
        self.client
            .responses()
            .create(request)
            .await
            .map_err(|e| SgrError::Api {
                status: 0,
                body: e.to_string(),
            })
    }

    /// Build a ResponseCreateRequest from messages + optional schema + optional chaining.
    ///
    /// - `previous_response_id` is None: full history as Messages format
    /// - `previous_response_id` is Some: Items format with function_call_output
    ///   (required for chaining after tool calls via Responses API)
    /// - `schema`: optional structured output json_schema config
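    ///
    /// For illustration (hedged: shapes inferred from the construction code
    /// below, not from the crate's own docs), the two input formats differ like so:
    ///
    /// ```text
    /// // Messages format (stateless, full history):
    /// [{ "role": "user", "content": "..." }, ...]
    ///
    /// // Items format (chained via previous_response_id):
    /// [{ "type": "message", "role": "user", "content": "..." },
    ///  { "type": "function_call_output", "call_id": "call_1", "output": "..." }]
    /// ```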
    pub(crate) fn build_request(
        &self,
        messages: &[Message],
        schema: Option<&Value>,
        previous_response_id: Option<&str>,
    ) -> ResponseCreateRequest {
        if previous_response_id.is_some() {
            // Items format: messages + function_call_output items.
            // HTTP API accepts Messages format (without type), but WS API requires it.
            // Using Items consistently ensures both HTTP and WS work.
            return self.build_request_items(messages, previous_response_id);
        }

        // Messages format: standard request with optional structured output
        let mut input_items = Vec::new();

        for msg in messages {
            match msg.role {
                Role::System => {
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::System,
                        content: Value::String(msg.content.clone()),
                    });
                }
                Role::User => {
                    let content = if msg.images.is_empty() {
                        Value::String(msg.content.clone())
                    } else {
                        serde_json::to_value(multimodal::responses_parts(&msg.content, &msg.images))
                            .unwrap_or_else(|_| Value::String(msg.content.clone()))
                    };
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::User,
                        content,
                    });
                }
                Role::Assistant => {
                    // Include tool call info so structured_call context shows
                    // what action was taken.
                    let mut content = msg.content.clone();
                    if !msg.tool_calls.is_empty() {
                        for tc in &msg.tool_calls {
                            let args = tc.arguments.to_string();
                            let preview = if args.len() > 200 {
                                use crate::str_ext::StrExt;
                                args.trunc(200)
                            } else {
                                &args
                            };
                            content.push_str(&format!("\n→ {}({})", tc.name, preview));
                        }
                    }
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::Assistant,
                        content: Value::String(content),
                    });
                }
                Role::Tool => {
                    // Clean format — no "[Tool result for ...]" prefix.
                    // The assistant message above already has the action name.
                    input_items.push(ResponseInputItem {
                        role: openai_oxide::types::common::Role::User,
                        content: Value::String(msg.content.clone()),
                    });
                }
            }
        }

        let mut req = ResponseCreateRequest::new(&self.model);

        // Set input — prefer simple text when single user message (fewer tokens)
        if input_items.len() == 1 && input_items[0].role == openai_oxide::types::common::Role::User
        {
            if let Some(text) = input_items[0].content.as_str() {
                req = req.input(text);
            } else {
                req.input = Some(ResponseInput::Messages(input_items));
            }
        } else if !input_items.is_empty() {
            req.input = Some(ResponseInput::Messages(input_items));
        }

        // Temperature — skip default to reduce payload
        if let Some(temp) = self.temperature
            && (temp - 1.0).abs() > f64::EPSILON
        {
            req = req.temperature(temp);
        }

        // Max tokens
        if let Some(max) = self.max_tokens {
            req = req.max_output_tokens(max as i64);
        }

        // Structured output via json_schema (and/or verbosity passthrough)
        match (schema, self.verbosity.clone()) {
            (Some(schema_val), v) => {
                req = req.text(ResponseTextConfig {
                    format: Some(ResponseTextFormat::JsonSchema {
                        name: "sgr_response".into(),
                        description: None,
                        schema: Some(schema_val.clone()),
                        strict: Some(true),
                    }),
                    verbosity: v,
                });
            }
            (None, Some(v)) => {
                req = req.text(ResponseTextConfig {
                    format: None,
                    verbosity: Some(v),
                });
            }
            (None, None) => {}
        }

        req
    }

    /// Build Items-format request for stateful chaining with previous_response_id.
    fn build_request_items(
        &self,
        messages: &[Message],
        previous_response_id: Option<&str>,
    ) -> ResponseCreateRequest {
        use openai_oxide::types::responses::ResponseInput;

        let mut items: Vec<Value> = Vec::new();

        for msg in messages {
            match msg.role {
                Role::Tool => {
                    if let Some(ref call_id) = msg.tool_call_id {
                        items.push(serde_json::json!({
                            "type": "function_call_output",
                            "call_id": call_id,
                            "output": msg.content
                        }));
                    }
                }
                Role::System => {
                    items.push(serde_json::json!({
                        "type": "message",
                        "role": "system",
                        "content": msg.content
                    }));
                }
                Role::User => {
                    let content = if msg.images.is_empty() {
                        serde_json::json!(msg.content)
                    } else {
                        serde_json::to_value(multimodal::responses_parts(&msg.content, &msg.images))
                            .unwrap_or_else(|_| serde_json::json!(msg.content))
                    };
                    items.push(serde_json::json!({
                        "type": "message",
                        "role": "user",
                        "content": content,
                    }));
                }
                Role::Assistant => {
                    items.push(serde_json::json!({
                        "type": "message",
                        "role": "assistant",
                        "content": msg.content
                    }));
                }
            }
        }

        let mut req = ResponseCreateRequest::new(&self.model);
        if !items.is_empty() {
            req.input = Some(ResponseInput::Items(items));
        }

        // Temperature
        if let Some(temp) = self.temperature
            && (temp - 1.0).abs() > f64::EPSILON
        {
            req = req.temperature(temp);
        }
        if let Some(max) = self.max_tokens {
            req = req.max_output_tokens(max as i64);
        }

        if let Some(v) = self.verbosity.clone() {
            req = req.text(ResponseTextConfig {
                format: None,
                verbosity: Some(v),
            });
        }

        if let Some(prev_id) = previous_response_id {
            req = req.previous_response_id(prev_id);
        }

        req
    }

    /// Function calling with explicit previous_response_id.
    /// Returns tool calls + new response_id for chaining.
    ///
    /// Always sets `store(true)` so responses can be referenced by subsequent calls.
    /// When `previous_response_id` is provided, only delta messages need to be sent
    /// (server has full history from previous stored response).
    ///
    /// Tool messages (role=Tool with tool_call_id) are converted to Responses API
    /// `function_call_output` items — required for chaining with previous_response_id.
    ///
    /// This method does NOT use the Mutex — all state is explicit via parameters/return.
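    ///
    /// A sketch of the intended call pattern (hypothetical caller, names
    /// illustrative; tool execution elided):
    ///
    /// ```ignore
    /// let mut prev: Option<String> = None;
    /// loop {
    ///     let (calls, id) = client
    ///         .tools_call_stateful(&delta_messages, &tools, prev.as_deref())
    ///         .await?;
    ///     prev = id;
    ///     if calls.is_empty() { break; }
    ///     // Execute each call, then push Role::Tool messages carrying the
    ///     // matching tool_call_id so they become function_call_output items.
    /// }
    /// ```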
    async fn tools_call_stateful_impl(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
        previous_response_id: Option<&str>,
    ) -> Result<(Vec<ToolCall>, Option<String>), SgrError> {
        let mut req = self.build_request(messages, None, previous_response_id);
        // Always store so next call can chain via previous_response_id
        req = req.store(true);

        // Convert ToolDefs to ResponseTools with strict mode.
        // strict: true guarantees LLM output matches schema exactly (no parse errors).
        // oxide ensure_strict() handles: additionalProperties, all-required,
        // nullable→anyOf, allOf inlining, oneOf→anyOf.
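        //
        // Illustrative before/after (hedged: the exact rewrite depends on the
        // oxide version; this mirrors OpenAI's strict-schema requirements):
        //   before: {"type":"object","properties":{"q":{"type":"string"}}}
        //   after:  {"type":"object","properties":{"q":{"type":"string"}},
        //            "required":["q"],"additionalProperties":false}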
        let response_tools: Vec<ResponseTool> = tools
            .iter()
            .map(|t| {
                let mut params = t.parameters.clone();
                openai_oxide::parsing::ensure_strict(&mut params);
                ResponseTool::Function {
                    name: t.name.clone(),
                    description: if t.description.is_empty() {
                        None
                    } else {
                        Some(t.description.clone())
                    },
                    parameters: Some(params),
                    strict: Some(true),
                }
            })
            .collect();
        req = req.tools(response_tools);

        let response = self.send_request_auto(req).await?;

        let response_id = response.id.clone();
        // No Mutex save — caller owns the response_id
        record_otel_usage(&response, &self.model, messages);

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);

        let chained = previous_response_id.is_some();
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };

        tracing::info!(
            model = %response.model,
            response_id = %response_id,
            input_tokens,
            cached_tokens,
            cache_pct,
            chained,
            "oxide.tools_call_stateful"
        );

        if cached_tokens > 0 {
            eprintln!(
                "    💰 {}in/{}out (cached: {}, {}%)",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0),
                cached_tokens,
                cache_pct
            );
        } else {
            eprintln!(
                "    💰 {}in/{}out",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0)
            );
        }

        Self::check_truncation(&response)?;
        Ok((Self::extract_tool_calls(&response), Some(response_id)))
    }

    /// Check if response was truncated due to max_output_tokens.
    /// Returns Err(MaxOutputTokens) if truncated, Ok(()) otherwise.
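    ///
    /// The shape this guards against (hedged: fields as read by the code
    /// below; other `incomplete` reasons pass through as Ok):
    ///
    /// ```text
    /// { "status": "incomplete",
    ///   "incomplete_details": { "reason": "max_output_tokens" }, ... }
    /// ```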
    fn check_truncation(response: &Response) -> Result<(), SgrError> {
        let is_incomplete = response
            .status
            .as_deref()
            .is_some_and(|s| s == "incomplete");
        let is_max_tokens = response
            .incomplete_details
            .as_ref()
            .and_then(|d| d.reason.as_deref())
            .is_some_and(|r| r == "max_output_tokens");

        if is_incomplete && is_max_tokens {
            return Err(SgrError::MaxOutputTokens {
                partial_content: response.output_text(),
            });
        }
        Ok(())
    }

    /// Extract tool calls from Responses API output items.
    fn extract_tool_calls(response: &Response) -> Vec<ToolCall> {
        response
            .function_calls()
            .into_iter()
            .map(|fc| ToolCall {
                id: fc.call_id,
                name: fc.name,
                arguments: fc.arguments,
            })
            .collect()
    }
}

#[async_trait::async_trait]
impl LlmClient for OxideClient {
    async fn structured_call(
        &self,
        messages: &[Message],
        schema: &Value,
    ) -> Result<(Option<Value>, Vec<ToolCall>, String), SgrError> {
        // Make schema OpenAI-strict — UNLESS it's already strict
        // (build_action_schema produces pre-strict schemas that ensure_strict would break)
        let strict_schema =
            if schema.get("additionalProperties").and_then(|v| v.as_bool()) == Some(false) {
                // Already strict-compatible (e.g., from build_action_schema)
                schema.clone()
            } else {
                let mut s = schema.clone();
                openai_oxide::parsing::ensure_strict(&mut s);
                s
            };

        // Stateless — build request with full message history, no chaining.
        // store(true) enables server-side prompt caching for stable prefix.
        let mut req = self.build_request(messages, Some(&strict_schema), None);
        req = req.store(true);

        let span = tracing::info_span!(
            "oxide.responses.create",
            model = %self.model,
            method = "structured_call",
        );
        let _enter = span.enter();

        // Debug: dump schema on first call
        if std::env::var("SGR_DEBUG_SCHEMA").is_ok()
            && let Some(ref text_cfg) = req.text
        {
            eprintln!(
                "[sgr] Schema: {}",
                serde_json::to_string(text_cfg).unwrap_or_default()
            );
        }

        let response = self.send_request_auto(req).await?;

        // No Mutex save — structured_call is stateless
        record_otel_usage(&response, &self.model, messages);

        Self::check_truncation(&response)?;

        let raw_text = response.output_text();
        if std::env::var("SGR_DEBUG").is_ok() {
            eprintln!("[sgr] Raw response: {}", {
                use crate::str_ext::StrExt;
                raw_text.trunc(500)
            });
        }
        let tool_calls = Self::extract_tool_calls(&response);
        let parsed = serde_json::from_str::<Value>(&raw_text).ok();

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };

        {
            let output_tokens = response
                .usage
                .as_ref()
                .and_then(|u| u.output_tokens)
                .unwrap_or(0);
            if cached_tokens > 0 {
                eprintln!(
                    "    💰 {}in/{}out (cached: {}, {}%)",
                    input_tokens, output_tokens, cached_tokens, cache_pct
                );
            } else {
                eprintln!("    💰 {}in/{}out", input_tokens, output_tokens);
            }
        }

        Ok((parsed, tool_calls, raw_text))
    }

    async fn tools_call(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
    ) -> Result<Vec<ToolCall>, SgrError> {
        // Stateless — no previous_response_id, full message history.
        // store(true) enables server-side prompt caching: OpenAI auto-caches
        // the stable prefix (system prompt + tools) for requests >1024 tokens.
        let mut req = self.build_request(messages, None, None);
        req = req.store(true);

        // Convert ToolDefs to ResponseTools — no strict mode (faster server-side)
        let response_tools: Vec<ResponseTool> = tools
            .iter()
            .map(|t| ResponseTool::Function {
                name: t.name.clone(),
                description: if t.description.is_empty() {
                    None
                } else {
                    Some(t.description.clone())
                },
                parameters: Some(t.parameters.clone()),
                strict: None, // AI-NOTE: strict=true breaks tools with optional params
            })
            .collect();
        req = req.tools(response_tools);

        // Force model to always call a tool — prevents text-only responses
        // that lose answer content (tools_call only returns Vec<ToolCall>).
        req = req.tool_choice(openai_oxide::types::responses::ResponseToolChoice::Mode(
            "required".into(),
        ));
        // AI-NOTE: explicit parallel_tool_calls=true for OpenAI. Anthropic models on OpenRouter
        // reject this param (404); they use `disable_parallel_tool_use` natively, not exposed here.
        if !self.model.contains("anthropic/") && !self.model.starts_with("claude") {
            req = req.parallel_tool_calls(true);
        }

        let response = self.send_request_auto(req).await?;

        record_otel_usage(&response, &self.model, messages);
        Self::check_truncation(&response)?;

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };

        if cached_tokens > 0 {
            eprintln!(
                "    💰 {}in/{}out (cached: {}, {}%)",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0),
                cached_tokens,
                cache_pct
            );
        } else {
            eprintln!(
                "    💰 {}in/{}out",
                input_tokens,
                response
                    .usage
                    .as_ref()
                    .and_then(|u| u.output_tokens)
                    .unwrap_or(0)
            );
        }

        let calls = Self::extract_tool_calls(&response);
        Ok(calls)
    }

    async fn tools_call_stateful(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
        previous_response_id: Option<&str>,
    ) -> Result<(Vec<ToolCall>, Option<String>), SgrError> {
        self.tools_call_stateful_impl(messages, tools, previous_response_id)
            .await
    }

    /// tool_choice=auto so model can emit reasoning text ALONGSIDE tool calls in one response.
    /// Returns (tool_calls, reasoning_text). Used by single-phase agent to get 1 LLM call/step.
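    ///
    /// A hedged sketch of how a single-phase step might consume this
    /// (hypothetical caller; `render_step` and `dispatch` are illustrative,
    /// not part of this crate):
    ///
    /// ```ignore
    /// let (calls, reasoning) = client.tools_call_with_text(&messages, &tools).await?;
    /// render_step(&reasoning);   // show the model's reasoning text
    /// for call in calls {        // then act on the tool calls
    ///     dispatch(call).await?;
    /// }
    /// ```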
    async fn tools_call_with_text(
        &self,
        messages: &[Message],
        tools: &[ToolDef],
    ) -> Result<(Vec<ToolCall>, String), SgrError> {
        let mut req = self.build_request(messages, None, None);
        req = req.store(true);

        let response_tools: Vec<ResponseTool> = tools
            .iter()
            .map(|t| ResponseTool::Function {
                name: t.name.clone(),
                description: if t.description.is_empty() {
                    None
                } else {
                    Some(t.description.clone())
                },
                parameters: Some(t.parameters.clone()),
                strict: None,
            })
            .collect();
        req = req.tools(response_tools);
        // AI-NOTE: tool_choice=auto (not required) — model can return text+tools in same response.
        // This is the key for single-phase: reasoning in text, action in tool calls, 1 LLM call.
        req = req.tool_choice(openai_oxide::types::responses::ResponseToolChoice::Mode(
            "auto".into(),
        ));
        req = req.parallel_tool_calls(true);

        let response = self.send_request_auto(req).await?;

        record_otel_usage(&response, &self.model, messages);
        Self::check_truncation(&response)?;

        let input_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens)
            .unwrap_or(0);
        let cached_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.input_tokens_details.as_ref())
            .and_then(|d| d.cached_tokens)
            .unwrap_or(0);
        let cache_pct = if input_tokens > 0 {
            (cached_tokens * 100) / input_tokens
        } else {
            0
        };
        let output_tokens = response
            .usage
            .as_ref()
            .and_then(|u| u.output_tokens)
            .unwrap_or(0);
        if cached_tokens > 0 {
            eprintln!(
                "    💰 {}in/{}out (cached: {}, {}%)",
                input_tokens, output_tokens, cached_tokens, cache_pct
            );
        } else {
            eprintln!("    💰 {}in/{}out", input_tokens, output_tokens);
        }

        let text = response.output_text();
        let calls = Self::extract_tool_calls(&response);
        Ok((calls, text))
    }

    async fn complete(&self, messages: &[Message]) -> Result<String, SgrError> {
        let mut req = self.build_request(messages, None, None);
        req = req.store(true);

        let response = self.send_request_auto(req).await?;

        record_otel_usage(&response, &self.model, messages);
        Self::check_truncation(&response)?;

        let text = response.output_text();
        if text.is_empty() {
            return Err(SgrError::EmptyResponse);
        }

        tracing::info!(
            model = %response.model,
            response_id = %response.id,
            input_tokens = response.usage.as_ref().and_then(|u| u.input_tokens).unwrap_or(0),
            output_tokens = response.usage.as_ref().and_then(|u| u.output_tokens).unwrap_or(0),
            "oxide.complete"
        );

        Ok(text)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::ImagePart;

    #[test]
    fn oxide_client_from_config() {
        // Just test construction doesn't panic
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        assert_eq!(client.model, "gpt-5.4");
    }

    #[test]
    fn build_request_simple() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4").temperature(0.5);
        let client = OxideClient::from_config(&config).unwrap();
        let messages = vec![Message::system("Be helpful."), Message::user("Hello")];
        let req = client.build_request(&messages, None, None);
        assert_eq!(req.model, "gpt-5.4");
        assert!(req.instructions.is_none());
        assert!(req.input.is_some());
        assert_eq!(req.temperature, Some(0.5));
    }

    #[test]
    fn build_request_with_schema() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        let schema = serde_json::json!({
            "type": "object",
            "properties": {"answer": {"type": "string"}},
            "required": ["answer"]
        });
        let req = client.build_request(&[Message::user("Hi")], Some(&schema), None);
        assert!(req.text.is_some());
    }

    #[test]
    fn build_request_stateless_no_previous_response_id() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();

        let req = client.build_request(&[Message::user("Hi")], None, None);
        assert!(
            req.previous_response_id.is_none(),
            "build_request must be stateless when no explicit ID"
        );
    }

    #[test]
    fn build_request_explicit_chaining() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();

        // With previous_response_id — uses Items format for chaining
        let req = client.build_request(&[Message::user("Hi")], None, Some("resp_xyz"));
        assert_eq!(
            req.previous_response_id.as_deref(),
            Some("resp_xyz"),
            "build_request should chain with explicit previous_response_id"
        );
    }

    #[test]
    fn build_request_tool_outputs_chaining() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();

        // With previous_response_id — tool outputs as function_call_output items
        let messages = vec![Message::tool("call_1", "result data")];
        let req = client.build_request(&messages, None, Some("resp_123"));
        assert_eq!(req.previous_response_id.as_deref(), Some("resp_123"));

        // Without previous_response_id
        let req = client.build_request(&messages, None, None);
        assert!(
            req.previous_response_id.is_none(),
            "build_request must be stateless when no explicit ID"
        );
    }

    #[test]
    fn build_request_multimodal_user() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        let img = ImagePart {
            data: "AAAA".into(),
            mime_type: "image/jpeg".into(),
        };
        let messages = vec![Message::user_with_images("Describe this", vec![img])];
        let req = client.build_request(&messages, None, None);

        // Single user with images must serialize as content-parts array, not string
        let input = req.input.as_ref().expect("input missing");
        let serialized = serde_json::to_value(input).unwrap();
        let s = serde_json::to_string(&serialized).unwrap();
        assert!(s.contains("input_text"), "missing input_text part: {s}");
        assert!(s.contains("input_image"), "missing input_image part: {s}");
        assert!(
            s.contains("data:image/jpeg;base64,AAAA"),
            "missing data URL: {s}"
        );
    }

    #[test]
    fn build_request_items_multimodal_user() {
        let config = LlmConfig::with_key("sk-test", "gpt-5.4");
        let client = OxideClient::from_config(&config).unwrap();
        let img = ImagePart {
            data: "BBBB".into(),
            mime_type: "image/png".into(),
        };
        let messages = vec![Message::user_with_images("What's on screen?", vec![img])];
        // previous_response_id triggers items-format path
        let req = client.build_request(&messages, None, Some("resp_prev"));

        let input = req.input.as_ref().expect("input missing");
        let s = serde_json::to_string(input).unwrap();
        assert!(
            s.contains("input_text"),
            "items path missing input_text: {s}"
        );
        assert!(
            s.contains("input_image"),
            "items path missing input_image: {s}"
        );
        assert!(
            s.contains("data:image/png;base64,BBBB"),
            "items path missing data URL: {s}"
        );
    }
}