manx_cli/rag/llm.rs

//! Multi-provider LLM integration for answer synthesis
//!
//! Supports OpenAI GPT, Anthropic Claude, Groq, OpenRouter, HuggingFace, and custom endpoints
//! with automatic failover and comprehensive error handling.
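//!
//! A minimal usage sketch (the module path, the key value, and the source of
//! `results` are illustrative assumptions, not part of this file):
//!
//! ```rust,ignore
//! let mut config = LlmConfig::default();
//! config.groq_api_key = Some("gsk-example".to_string()); // hypothetical key
//!
//! let client = LlmClient::new(config)?;
//! if client.is_available() {
//!     // `results` would come from an earlier RAG search elsewhere in the crate.
//!     let response = client.synthesize_answer("How do I configure logging?", &results).await?;
//!     println!("{} (via {:?})", response.answer, response.provider_used);
//! }
//! ```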

use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};

use crate::rag::RagSearchResult;

/// Configuration for LLM integration supporting multiple providers
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmConfig {
    pub openai_api_key: Option<String>,
    pub anthropic_api_key: Option<String>,
    pub groq_api_key: Option<String>,
    pub openrouter_api_key: Option<String>,
    pub huggingface_api_key: Option<String>,
    pub custom_endpoint: Option<String>,
    pub preferred_provider: LlmProvider,
    pub fallback_providers: Vec<LlmProvider>,
    pub timeout_seconds: u64,
    pub max_tokens: u32,
    pub temperature: f32,
    pub model_name: Option<String>,
    pub streaming: bool,
}

impl Default for LlmConfig {
    fn default() -> Self {
        Self {
            openai_api_key: None,
            anthropic_api_key: None,
            groq_api_key: None,
            openrouter_api_key: None,
            huggingface_api_key: None,
            custom_endpoint: None,
            preferred_provider: LlmProvider::Auto,
            fallback_providers: vec![
                LlmProvider::OpenAI,
                LlmProvider::Anthropic,
                LlmProvider::Groq,
                LlmProvider::OpenRouter,
            ],
            timeout_seconds: 30,
            max_tokens: 1000,
            temperature: 0.1,
            model_name: None,
            streaming: false,
        }
    }
}

/// Available LLM providers
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum LlmProvider {
    Auto,
    OpenAI,
    Anthropic,
    Groq,
    OpenRouter,
    HuggingFace,
    Custom,
}

/// LLM response with comprehensive metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmResponse {
    pub answer: String,
    pub sources_used: Vec<String>,
    pub confidence: Option<f32>,
    pub provider_used: LlmProvider,
    pub model_used: String,
    pub tokens_used: Option<u32>,
    pub response_time_ms: u64,
    pub finish_reason: Option<String>,
    pub citations: Vec<Citation>,
}

/// Citation information linking to source documents
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Citation {
    pub source_id: String,
    pub source_title: String,
    pub source_url: Option<String>,
    pub relevance_score: f32,
    pub excerpt: String,
}

/// Multi-provider LLM client with automatic failover
pub struct LlmClient {
    pub(crate) config: LlmConfig,
    pub(crate) http_client: reqwest::Client,
}

impl LlmClient {
    /// Create a new LLM client with configuration
    pub fn new(config: LlmConfig) -> Result<Self> {
        let http_client = reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(config.timeout_seconds))
            .build()?;

        Ok(Self {
            config,
            http_client,
        })
    }

    /// Check if any LLM provider is available
    pub fn is_available(&self) -> bool {
        self.has_openai_key()
            || self.has_anthropic_key()
            || self.has_groq_key()
            || self.has_openrouter_key()
            || self.has_huggingface_key()
            || self.config.custom_endpoint.is_some()
    }

    /// Check availability of specific providers
    pub fn has_openai_key(&self) -> bool {
        self.config
            .openai_api_key
            .as_ref()
            .is_some_and(|key| !key.is_empty())
    }

    pub fn has_anthropic_key(&self) -> bool {
        self.config
            .anthropic_api_key
            .as_ref()
            .is_some_and(|key| !key.is_empty())
    }

    pub fn has_groq_key(&self) -> bool {
        self.config
            .groq_api_key
            .as_ref()
            .is_some_and(|key| !key.is_empty())
    }

    pub fn has_openrouter_key(&self) -> bool {
        self.config
            .openrouter_api_key
            .as_ref()
            .is_some_and(|key| !key.is_empty())
    }

    pub fn has_huggingface_key(&self) -> bool {
        self.config
            .huggingface_api_key
            .as_ref()
            .is_some_and(|key| !key.is_empty())
    }

    /// Get the best available provider based on configuration and API key availability
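    ///
    /// A small sketch of the selection order (illustrative key value; with
    /// `preferred_provider` left at `Auto`, the first configured fallback wins):
    ///
    /// ```rust,ignore
    /// let mut config = LlmConfig::default(); // preferred_provider == LlmProvider::Auto
    /// config.groq_api_key = Some("gsk-example".to_string());
    /// let client = LlmClient::new(config).unwrap();
    /// assert_eq!(client.get_best_provider(), Some(LlmProvider::Groq));
    /// ```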
    pub fn get_best_provider(&self) -> Option<LlmProvider> {
        if self.config.preferred_provider != LlmProvider::Auto {
            // Check if preferred provider is available
            if self.is_provider_available(&self.config.preferred_provider) {
                return Some(self.config.preferred_provider.clone());
            }
        }

        // Try fallback providers in order
        for provider in &self.config.fallback_providers {
            if self.is_provider_available(provider) {
                return Some(provider.clone());
            }
        }

        None
    }

    /// Check if a specific provider is available
    pub fn is_provider_available(&self, provider: &LlmProvider) -> bool {
        match provider {
            LlmProvider::OpenAI => self.has_openai_key(),
            LlmProvider::Anthropic => self.has_anthropic_key(),
            LlmProvider::Groq => self.has_groq_key(),
            LlmProvider::OpenRouter => self.has_openrouter_key(),
            LlmProvider::HuggingFace => self.has_huggingface_key(),
            LlmProvider::Custom => self.config.custom_endpoint.is_some(),
            LlmProvider::Auto => false, // Auto is not a real provider
        }
    }

    /// Synthesize an answer from search results using the best available provider
    pub async fn synthesize_answer(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let provider = self
            .get_best_provider()
            .ok_or_else(|| anyhow!("No LLM provider available"))?;

        let start_time = std::time::Instant::now();

        let response = match provider {
            LlmProvider::OpenAI => self.synthesize_with_openai(query, results).await,
            LlmProvider::Anthropic => self.synthesize_with_anthropic(query, results).await,
            LlmProvider::Groq => self.synthesize_with_groq(query, results).await,
            LlmProvider::OpenRouter => self.synthesize_with_openrouter(query, results).await,
            LlmProvider::HuggingFace => self.synthesize_with_huggingface(query, results).await,
            LlmProvider::Custom => self.synthesize_with_custom(query, results).await,
            LlmProvider::Auto => unreachable!(),
        };

        // If primary provider fails, try fallback providers
        match response {
            Ok(mut resp) => {
                resp.response_time_ms = start_time.elapsed().as_millis() as u64;
                Ok(resp)
            }
            Err(e) => {
                log::warn!("Primary provider {:?} failed: {}", provider, e);
                self.try_fallback_providers(query, results, &provider).await
            }
        }
    }

    /// Try fallback providers if primary fails
    async fn try_fallback_providers(
        &self,
        query: &str,
        results: &[RagSearchResult],
        failed_provider: &LlmProvider,
    ) -> Result<LlmResponse> {
        for provider in &self.config.fallback_providers {
            if provider != failed_provider && self.is_provider_available(provider) {
                log::info!("Trying fallback provider: {:?}", provider);

                let start_time = std::time::Instant::now();
                let response = match provider {
                    LlmProvider::OpenAI => self.synthesize_with_openai(query, results).await,
                    LlmProvider::Anthropic => self.synthesize_with_anthropic(query, results).await,
                    LlmProvider::Groq => self.synthesize_with_groq(query, results).await,
                    LlmProvider::OpenRouter => {
                        self.synthesize_with_openrouter(query, results).await
                    }
                    LlmProvider::HuggingFace => {
                        self.synthesize_with_huggingface(query, results).await
                    }
                    LlmProvider::Custom => self.synthesize_with_custom(query, results).await,
                    LlmProvider::Auto => continue,
                };

                if let Ok(mut resp) = response {
                    resp.response_time_ms = start_time.elapsed().as_millis() as u64;
                    return Ok(resp);
                }
            }
        }

        Err(anyhow!("All LLM providers failed"))
    }

    /// Get the appropriate model name for a provider
    fn get_model_name(&self, provider: &LlmProvider) -> String {
        if let Some(model) = &self.config.model_name {
            return model.clone();
        }

        match provider {
            LlmProvider::OpenAI => "gpt-4o-mini".to_string(),
            LlmProvider::Anthropic => "claude-3-haiku-20240307".to_string(),
            LlmProvider::Groq => "llama-3.1-8b-instant".to_string(),
            LlmProvider::OpenRouter => "openai/gpt-3.5-turbo".to_string(),
            LlmProvider::HuggingFace => "microsoft/DialoGPT-medium".to_string(),
            LlmProvider::Custom => "custom-model".to_string(),
            LlmProvider::Auto => "auto".to_string(),
        }
    }

    /// Create concise system prompt focused on clean, scannable output
    fn create_system_prompt(&self) -> String {
        r#"You are a concise technical documentation assistant. Provide clear, scannable answers based ONLY on the provided search results.

RESPONSE FORMAT:
1. **Quick Answer** (1-2 sentences max)
2. **Key Points** (bullet points, max 4 items)
3. **Code Example** (if available - keep it short and practical)

RULES:
- Be extremely concise and scannable
- Use bullet points and short paragraphs
- Only include essential information
- Cite sources as [Source N]
- Never add information not in the sources
- Focus on what developers need to know immediately

STYLE:
- Write for busy developers who want quick answers
- Use clear, simple language
- Keep code examples minimal but complete
- Prioritize readability over completeness"#.to_string()
    }

    /// Create user prompt with query and search results
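    ///
    /// The rendered prompt has roughly this shape (illustrative placeholders),
    /// with one `[Source N]` block per result and a closing instruction line:
    ///
    /// ```text
    /// Question: <query>
    ///
    /// Search Results:
    ///
    /// [Source 1] <title or "Untitled">
    /// URL: <source_path>
    /// Content: <first 1000 characters of the result>
    /// ```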
    fn create_user_prompt(&self, query: &str, results: &[RagSearchResult]) -> String {
        let mut prompt = format!("Question: {}\n\nSearch Results:\n\n", query);

        for (i, result) in results.iter().enumerate() {
            prompt.push_str(&format!(
                "[Source {}] {}\nURL: {}\nContent: {}\n\n",
                i + 1,
                result.title.as_ref().unwrap_or(&"Untitled".to_string()),
                result.source_path.to_string_lossy(),
                result.content.chars().take(1000).collect::<String>()
            ));
        }

        prompt.push_str("\nPlease provide a concise answer based on these search results.");
        prompt
    }

    /// Extract citations from LLM response
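    ///
    /// Illustrative behaviour of this internal helper (the `results` value is
    /// assumed here): only sources actually referenced as `[Source N]` in the
    /// answer text produce a `Citation`.
    ///
    /// ```rust,ignore
    /// let citations = client.extract_citations("See [Source 1] for details.", &results);
    /// assert_eq!(citations.len(), 1);
    /// assert_eq!(citations[0].source_id, results[0].id);
    /// ```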
    fn extract_citations(&self, response_text: &str, results: &[RagSearchResult]) -> Vec<Citation> {
        let mut citations = Vec::new();

        // Simple citation extraction - look for [Source N] patterns
        for (i, result) in results.iter().enumerate() {
            let source_ref = format!("[Source {}]", i + 1);
            if response_text.contains(&source_ref) {
                citations.push(Citation {
                    source_id: result.id.clone(),
                    source_title: result
                        .title
                        .clone()
                        .unwrap_or_else(|| "Untitled".to_string()),
                    source_url: Some(result.source_path.to_string_lossy().to_string()),
                    relevance_score: result.score,
                    excerpt: result.content.chars().take(200).collect(),
                });
            }
        }

        citations
    }

    /// OpenAI Chat Completions integration (the configured `streaming` flag is
    /// forwarded, but the response body is parsed as non-streaming JSON)
    async fn synthesize_with_openai(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .openai_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("OpenAI API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::OpenAI);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "stream": self.config.streaming
        });

        let response = self
            .http_client
            .post("https://api.openai.com/v1/chat/completions")
            .header("Authorization", format!("Bearer {}", api_key))
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("OpenAI API error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let answer = response_json["choices"][0]["message"]["content"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid OpenAI response format"))?
            .to_string();

        let usage = &response_json["usage"];
        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
        let finish_reason = response_json["choices"][0]["finish_reason"]
            .as_str()
            .map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.9), // OpenAI typically high confidence
            provider_used: LlmProvider::OpenAI,
            model_used: model,
            tokens_used,
            response_time_ms: 0, // Will be set by caller
            finish_reason,
            citations,
        })
    }

    /// Anthropic Claude integration via the Messages API
    async fn synthesize_with_anthropic(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .anthropic_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("Anthropic API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::Anthropic);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        let payload = serde_json::json!({
            "model": model,
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "system": system_prompt,
            "messages": [
                {
                    "role": "user",
                    "content": user_prompt
                }
            ]
        });

        let response = self
            .http_client
            .post("https://api.anthropic.com/v1/messages")
            .header("x-api-key", api_key)
            .header("content-type", "application/json")
            .header("anthropic-version", "2023-06-01")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("Anthropic API error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let answer = response_json["content"][0]["text"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid Anthropic response format"))?
            .to_string();

        let usage = &response_json["usage"];
        // Anthropic reports input and output tokens separately; only output tokens are recorded here.
        let tokens_used = usage["output_tokens"].as_u64().map(|t| t as u32);
        let finish_reason = response_json["stop_reason"].as_str().map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.85), // Claude typically good confidence
            provider_used: LlmProvider::Anthropic,
            model_used: model,
            tokens_used,
            response_time_ms: 0,
            finish_reason,
            citations,
        })
    }

    /// Groq integration for low-latency inference
    async fn synthesize_with_groq(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .groq_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("Groq API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::Groq);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "stream": false
        });

        let response = self
            .http_client
            .post("https://api.groq.com/openai/v1/chat/completions")
            .header("Authorization", format!("Bearer {}", api_key))
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let status = response.status();
            let error_text = response.text().await?;
            log::error!(
                "Groq API error - Status: {}, Response: {}",
                status,
                error_text
            );
            return Err(anyhow!("Groq API error ({}): {}", status, error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let answer = response_json["choices"][0]["message"]["content"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid Groq response format"))?
            .to_string();

        let usage = &response_json["usage"];
        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
        let finish_reason = response_json["choices"][0]["finish_reason"]
            .as_str()
            .map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.8), // Groq usually good quality
            provider_used: LlmProvider::Groq,
            model_used: model,
            tokens_used,
            response_time_ms: 0,
            finish_reason,
            citations,
        })
    }

    /// OpenRouter integration: one gateway that routes to many underlying model providers
    async fn synthesize_with_openrouter(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .openrouter_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("OpenRouter API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::OpenRouter);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "stream": self.config.streaming
        });

        let response = self
            .http_client
            .post("https://openrouter.ai/api/v1/chat/completions")
            .header("Authorization", format!("Bearer {}", api_key))
            .header("Content-Type", "application/json")
            .header("HTTP-Referer", "https://github.com/neur0map/manx")
            .header("X-Title", "Manx Documentation Finder")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("OpenRouter API error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let answer = response_json["choices"][0]["message"]["content"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid OpenRouter response format"))?
            .to_string();

        let usage = &response_json["usage"];
        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
        let finish_reason = response_json["choices"][0]["finish_reason"]
            .as_str()
            .map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.82), // Varies by underlying model
            provider_used: LlmProvider::OpenRouter,
            model_used: model,
            tokens_used,
            response_time_ms: 0,
            finish_reason,
            citations,
        })
    }

    /// HuggingFace Router API for open-source models
    async fn synthesize_with_huggingface(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .huggingface_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("HuggingFace API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::HuggingFace);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        // Use OpenAI-compatible chat completions format
        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature
        });

        let response = self
            .http_client
            .post("https://router.huggingface.co/v1/chat/completions")
            .header("Authorization", format!("Bearer {}", api_key))
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("HuggingFace API error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let answer = if let Some(choices) = response_json["choices"].as_array() {
            if let Some(first_choice) = choices.first() {
                if let Some(message) = first_choice["message"].as_object() {
                    // Use get() so a missing "content" key cannot panic via Map indexing
                    message
                        .get("content")
                        .and_then(|content| content.as_str())
                        .unwrap_or("")
                        .to_string()
                } else {
                    return Err(anyhow!(
                        "Invalid HuggingFace response format: missing message"
                    ));
                }
            } else {
                return Err(anyhow!(
                    "Invalid HuggingFace response format: empty choices"
                ));
            }
        } else {
            return Err(anyhow!(
                "Invalid HuggingFace response format: missing choices"
            ));
        };

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.75), // Open source models vary
            provider_used: LlmProvider::HuggingFace,
            model_used: model,
            tokens_used: response_json["usage"]["total_tokens"]
                .as_u64()
                .map(|t| t as u32),
            response_time_ms: 0,
            finish_reason: response_json["choices"][0]["finish_reason"]
                .as_str()
                .map(|s| s.to_string()),
            citations,
        })
    }

    /// Custom endpoint integration for self-hosted models
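    ///
    /// The configured `custom_endpoint` is treated as the base URL of an
    /// OpenAI-compatible server; "/v1/chat/completions" is appended to it.
    /// A hypothetical configuration sketch (the URL is illustrative):
    ///
    /// ```rust,ignore
    /// let mut config = LlmConfig::default();
    /// config.custom_endpoint = Some("http://localhost:8080".to_string());
    /// config.preferred_provider = LlmProvider::Custom;
    /// let client = LlmClient::new(config)?;
    /// ```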
    async fn synthesize_with_custom(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let endpoint = self
            .config
            .custom_endpoint
            .as_ref()
            .ok_or_else(|| anyhow!("Custom endpoint not configured"))?;

        let model = self.get_model_name(&LlmProvider::Custom);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        // Use OpenAI-compatible format for custom endpoints
        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "stream": self.config.streaming
        });

        let response = self
            .http_client
            .post(format!("{}/v1/chat/completions", endpoint))
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("Custom endpoint error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let answer = response_json["choices"][0]["message"]["content"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid custom endpoint response format"))?
            .to_string();

        let usage = &response_json["usage"];
        let tokens_used = usage
            .get("total_tokens")
            .and_then(|t| t.as_u64())
            .map(|t| t as u32);
        let finish_reason = response_json["choices"][0]
            .get("finish_reason")
            .and_then(|r| r.as_str())
            .map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.8), // Assume reasonable confidence for custom
            provider_used: LlmProvider::Custom,
            model_used: model,
            tokens_used,
            response_time_ms: 0,
            finish_reason,
            citations,
        })
    }
}
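
// Minimal sanity-check sketch for the provider-selection and default-model logic above.
// The key values used here are placeholders and are never sent over the network.
#[cfg(test)]
mod tests {
    use super::*;

    fn config_with_groq_key() -> LlmConfig {
        LlmConfig {
            groq_api_key: Some("gsk-example".to_string()), // hypothetical key
            ..LlmConfig::default()
        }
    }

    #[test]
    fn provider_availability_follows_configured_keys() {
        let client = LlmClient::new(config_with_groq_key()).unwrap();
        assert!(client.is_available());
        assert!(client.is_provider_available(&LlmProvider::Groq));
        assert!(!client.is_provider_available(&LlmProvider::OpenAI));
        // `Auto` is a selection mode, not a concrete provider.
        assert!(!client.is_provider_available(&LlmProvider::Auto));
    }

    #[test]
    fn best_provider_falls_back_in_declared_order() {
        // With `preferred_provider` left at `Auto`, the first available fallback wins.
        let client = LlmClient::new(config_with_groq_key()).unwrap();
        assert_eq!(client.get_best_provider(), Some(LlmProvider::Groq));

        // With no keys configured at all, no provider is selected.
        let empty_client = LlmClient::new(LlmConfig::default()).unwrap();
        assert_eq!(empty_client.get_best_provider(), None);
    }

    #[test]
    fn model_name_override_takes_precedence_over_defaults() {
        let client = LlmClient::new(config_with_groq_key()).unwrap();
        assert_eq!(client.get_model_name(&LlmProvider::OpenAI), "gpt-4o-mini");

        let overridden = LlmClient::new(LlmConfig {
            model_name: Some("my-model".to_string()),
            ..LlmConfig::default()
        })
        .unwrap();
        assert_eq!(overridden.get_model_name(&LlmProvider::Groq), "my-model");
    }
}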