// manx_cli/rag/llm.rs

1//! Multi-provider LLM integration for answer synthesis
2//!
3//! Supports OpenAI GPT, Anthropic Claude, Groq, OpenRouter, HuggingFace, and custom endpoints
4//! with automatic failover and comprehensive error handling.
5
6use anyhow::{anyhow, Result};
7use serde::{Deserialize, Serialize};
8
9use crate::rag::RagSearchResult;
10
/// Configuration for LLM integration supporting multiple providers
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmConfig {
    // Per-provider API keys. A provider counts as available only when its
    // key is set AND non-empty (see `LlmClient::has_*_key`).
    pub openai_api_key: Option<String>,
    pub anthropic_api_key: Option<String>,
    pub groq_api_key: Option<String>,
    pub openrouter_api_key: Option<String>,
    pub huggingface_api_key: Option<String>,
    // Base URL for a self-hosted OpenAI-compatible endpoint; the client
    // appends `/v1/chat/completions` to it.
    pub custom_endpoint: Option<String>,
    // Provider selection: `Auto` means "probe `fallback_providers` in order".
    pub preferred_provider: LlmProvider,
    pub fallback_providers: Vec<LlmProvider>,
    // Applied to the shared HTTP client for every request.
    pub timeout_seconds: u64,
    // Generation settings forwarded verbatim to each provider API.
    pub max_tokens: u32,
    pub temperature: f32,
    // When set, overrides the per-provider default model name.
    pub model_name: Option<String>,
    // Requested for OpenAI/OpenRouter/custom payloads; Groq always sends
    // `stream: false` regardless of this flag.
    pub streaming: bool,
}
28
impl Default for LlmConfig {
    /// Defaults: no credentials, automatic provider selection, and
    /// conservative generation settings suited to factual synthesis.
    fn default() -> Self {
        Self {
            // No keys or custom endpoint out of the box.
            openai_api_key: None,
            anthropic_api_key: None,
            groq_api_key: None,
            openrouter_api_key: None,
            huggingface_api_key: None,
            custom_endpoint: None,
            // Auto: pick the first fallback provider with credentials.
            preferred_provider: LlmProvider::Auto,
            fallback_providers: vec![
                LlmProvider::OpenAI,
                LlmProvider::Anthropic,
                LlmProvider::Groq,
                LlmProvider::OpenRouter,
            ],
            timeout_seconds: 30,
            max_tokens: 1000,
            // Low temperature keeps answers close to the retrieved sources.
            temperature: 0.1,
            model_name: None,
            streaming: false,
        }
    }
}
53
/// Available LLM providers
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum LlmProvider {
    /// Selection policy, not a backend: resolve to the first available
    /// fallback provider. Never returned by `get_best_provider`.
    Auto,
    /// OpenAI chat-completions API.
    OpenAI,
    /// Anthropic Messages API.
    Anthropic,
    /// Groq's OpenAI-compatible fast-inference API.
    Groq,
    /// OpenRouter multi-model gateway (OpenAI-compatible).
    OpenRouter,
    /// HuggingFace router (OpenAI-compatible).
    HuggingFace,
    /// Self-hosted OpenAI-compatible endpoint from `custom_endpoint`.
    Custom,
}
65
/// LLM response with comprehensive metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmResponse {
    /// Synthesized answer text (thinking-tag preambles already stripped).
    pub answer: String,
    /// IDs of every search result passed to the model (not only cited ones).
    pub sources_used: Vec<String>,
    /// Hard-coded per-provider confidence estimate, not model-reported.
    pub confidence: Option<f32>,
    pub provider_used: LlmProvider,
    pub model_used: String,
    /// Token usage as reported by the provider, when present in the response.
    pub tokens_used: Option<u32>,
    /// Wall-clock latency; filled in by the caller after the request returns.
    pub response_time_ms: u64,
    /// Provider-reported stop/finish reason, when present.
    pub finish_reason: Option<String>,
    /// Citations for results actually referenced as `[Source N]` in the answer.
    pub citations: Vec<Citation>,
}
79
/// Citation information linking to source documents
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Citation {
    /// ID of the originating `RagSearchResult`.
    pub source_id: String,
    /// Result title, or "Untitled" when the result has none.
    pub source_title: String,
    /// Lossy string form of the result's source path.
    pub source_url: Option<String>,
    /// Retrieval score of the cited result.
    pub relevance_score: f32,
    /// First 200 chars of the result content.
    pub excerpt: String,
}
89
/// Multi-provider LLM client with automatic failover
#[derive(Clone)]
pub struct LlmClient {
    // Provider credentials, selection policy, and generation settings.
    pub(crate) config: LlmConfig,
    // Shared reqwest client; carries the configured request timeout.
    pub(crate) http_client: reqwest::Client,
}
96
97impl LlmClient {
    /// Create a new LLM client with configuration
    ///
    /// # Errors
    /// Returns an error if the underlying HTTP client cannot be built
    /// (e.g. TLS backend initialization failure).
    pub fn new(config: LlmConfig) -> Result<Self> {
        // One shared client for all providers; the configured timeout
        // applies to every request it sends.
        let http_client = reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(config.timeout_seconds))
            .build()?;

        Ok(Self {
            config,
            http_client,
        })
    }
109
110    /// Check if any LLM provider is available
111    pub fn is_available(&self) -> bool {
112        self.has_openai_key()
113            || self.has_anthropic_key()
114            || self.has_groq_key()
115            || self.has_openrouter_key()
116            || self.has_huggingface_key()
117            || self.config.custom_endpoint.is_some()
118    }
119
120    /// Check availability of specific providers
121    pub fn has_openai_key(&self) -> bool {
122        self.config
123            .openai_api_key
124            .as_ref()
125            .is_some_and(|key| !key.is_empty())
126    }
127
128    pub fn has_anthropic_key(&self) -> bool {
129        self.config
130            .anthropic_api_key
131            .as_ref()
132            .is_some_and(|key| !key.is_empty())
133    }
134
135    pub fn has_groq_key(&self) -> bool {
136        self.config
137            .groq_api_key
138            .as_ref()
139            .is_some_and(|key| !key.is_empty())
140    }
141
142    pub fn has_openrouter_key(&self) -> bool {
143        self.config
144            .openrouter_api_key
145            .as_ref()
146            .is_some_and(|key| !key.is_empty())
147    }
148
149    pub fn has_huggingface_key(&self) -> bool {
150        self.config
151            .huggingface_api_key
152            .as_ref()
153            .is_some_and(|key| !key.is_empty())
154    }
155
156    /// Get the best available provider based on configuration and API key availability
157    pub fn get_best_provider(&self) -> Option<LlmProvider> {
158        if self.config.preferred_provider != LlmProvider::Auto {
159            // Check if preferred provider is available
160            if self.is_provider_available(&self.config.preferred_provider) {
161                return Some(self.config.preferred_provider.clone());
162            }
163        }
164
165        // Try fallback providers in order
166        for provider in &self.config.fallback_providers {
167            if self.is_provider_available(provider) {
168                return Some(provider.clone());
169            }
170        }
171
172        None
173    }
174
    /// Check if a specific provider is available
    ///
    /// Availability means a non-empty API key (or, for `Custom`, a
    /// configured endpoint). `Auto` is a selection policy rather than a
    /// backend, so it always reports unavailable.
    pub fn is_provider_available(&self, provider: &LlmProvider) -> bool {
        match provider {
            LlmProvider::OpenAI => self.has_openai_key(),
            LlmProvider::Anthropic => self.has_anthropic_key(),
            LlmProvider::Groq => self.has_groq_key(),
            LlmProvider::OpenRouter => self.has_openrouter_key(),
            LlmProvider::HuggingFace => self.has_huggingface_key(),
            LlmProvider::Custom => self.config.custom_endpoint.is_some(),
            LlmProvider::Auto => false, // Auto is not a real provider
        }
    }
187
    /// Synthesize an answer from search results using the best available provider
    ///
    /// Dispatches to the provider chosen by `get_best_provider`; if that
    /// call fails, fallback providers are tried in order. The successful
    /// response gets its wall-clock latency filled in here.
    ///
    /// # Errors
    /// Returns an error when no provider is configured, or when the
    /// primary and every fallback provider fail.
    pub async fn synthesize_answer(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let provider = self
            .get_best_provider()
            .ok_or_else(|| anyhow!("No LLM provider available"))?;

        let start_time = std::time::Instant::now();

        let response = match provider {
            LlmProvider::OpenAI => self.synthesize_with_openai(query, results).await,
            LlmProvider::Anthropic => self.synthesize_with_anthropic(query, results).await,
            LlmProvider::Groq => self.synthesize_with_groq(query, results).await,
            LlmProvider::OpenRouter => self.synthesize_with_openrouter(query, results).await,
            LlmProvider::HuggingFace => self.synthesize_with_huggingface(query, results).await,
            LlmProvider::Custom => self.synthesize_with_custom(query, results).await,
            // `get_best_provider` can never return Auto (it reports
            // unavailable), so this arm is genuinely unreachable.
            LlmProvider::Auto => unreachable!(),
        };

        // If primary provider fails, try fallback providers
        match response {
            Ok(mut resp) => {
                // Providers leave response_time_ms at 0; record it here.
                resp.response_time_ms = start_time.elapsed().as_millis() as u64;
                Ok(resp)
            }
            Err(e) => {
                log::warn!("Primary provider {:?} failed: {}", provider, e);
                self.try_fallback_providers(query, results, &provider).await
            }
        }
    }
222
223    /// Try fallback providers if primary fails
224    async fn try_fallback_providers(
225        &self,
226        query: &str,
227        results: &[RagSearchResult],
228        failed_provider: &LlmProvider,
229    ) -> Result<LlmResponse> {
230        for provider in &self.config.fallback_providers {
231            if provider != failed_provider && self.is_provider_available(provider) {
232                log::info!("Trying fallback provider: {:?}", provider);
233
234                let start_time = std::time::Instant::now();
235                let response = match provider {
236                    LlmProvider::OpenAI => self.synthesize_with_openai(query, results).await,
237                    LlmProvider::Anthropic => self.synthesize_with_anthropic(query, results).await,
238                    LlmProvider::Groq => self.synthesize_with_groq(query, results).await,
239                    LlmProvider::OpenRouter => {
240                        self.synthesize_with_openrouter(query, results).await
241                    }
242                    LlmProvider::HuggingFace => {
243                        self.synthesize_with_huggingface(query, results).await
244                    }
245                    LlmProvider::Custom => self.synthesize_with_custom(query, results).await,
246                    LlmProvider::Auto => continue,
247                };
248
249                if let Ok(mut resp) = response {
250                    resp.response_time_ms = start_time.elapsed().as_millis() as u64;
251                    return Ok(resp);
252                }
253            }
254        }
255
256        Err(anyhow!("All LLM providers failed"))
257    }
258
259    /// Get the appropriate model name for a provider
260    fn get_model_name(&self, provider: &LlmProvider) -> String {
261        if let Some(model) = &self.config.model_name {
262            return model.clone();
263        }
264
265        match provider {
266            LlmProvider::OpenAI => "gpt-4o-mini".to_string(),
267            LlmProvider::Anthropic => "claude-3-haiku-20240307".to_string(),
268            LlmProvider::Groq => "llama-3.1-8b-instant".to_string(),
269            LlmProvider::OpenRouter => "openai/gpt-3.5-turbo".to_string(),
270            LlmProvider::HuggingFace => "microsoft/DialoGPT-medium".to_string(),
271            LlmProvider::Custom => "custom-model".to_string(),
272            LlmProvider::Auto => "auto".to_string(),
273        }
274    }
275
    /// Create concise system prompt focused on clean, scannable output
    ///
    /// The prompt text is a fixed template shared by every provider; it
    /// instructs the model to answer only from the supplied sources and
    /// to cite them as `[Source N]` (which `extract_citations` parses).
    fn create_system_prompt(&self) -> String {
        r#"You are a concise technical documentation assistant. Provide clear, scannable answers based ONLY on the provided search results.

RESPONSE FORMAT:
1. **Quick Answer** (1-2 sentences max)
2. **Key Points** (bullet points, max 4 items)  
3. **Code Example** (if available - keep it short and practical)

RULES:
- Be extremely concise and scannable
- Use bullet points and short paragraphs
- Only include essential information
- Cite sources as [Source N] 
- Never add information not in the sources
- Focus on what developers need to know immediately

STYLE:
- Write for busy developers who want quick answers
- Use clear, simple language
- Keep code examples minimal but complete
- Prioritize readability over completeness"#.to_string()
    }
299
300    /// Create user prompt with query and search results
301    fn create_user_prompt(&self, query: &str, results: &[RagSearchResult]) -> String {
302        let mut prompt = format!("Question: {}\n\nSearch Results:\n\n", query);
303
304        for (i, result) in results.iter().enumerate() {
305            prompt.push_str(&format!(
306                "[Source {}] {}\nURL: {}\nContent: {}\n\n",
307                i + 1,
308                result.title.as_ref().unwrap_or(&"Untitled".to_string()),
309                result.source_path.to_string_lossy(),
310                result.content.chars().take(1000).collect::<String>()
311            ));
312        }
313
314        prompt.push_str("\nPlease provide a comprehensive answer based on these search results.");
315        prompt
316    }
317
318    /// Extract the actual answer from responses that may contain thinking content
319    fn extract_final_answer(&self, response_text: &str) -> String {
320        // Handle models with thinking capabilities - check for both <thinking> and <think> tags
321        if response_text.contains("<thinking>") && response_text.contains("</thinking>") {
322            // Find the end of the thinking section
323            if let Some(thinking_end) = response_text.find("</thinking>") {
324                let after_thinking = &response_text[thinking_end + "</thinking>".len()..];
325                return after_thinking.trim().to_string();
326            }
327        }
328
329        // Handle models that use <think> tags instead of <thinking>
330        if response_text.contains("<think>") && response_text.contains("</think>") {
331            // Find the end of the think section
332            if let Some(think_end) = response_text.find("</think>") {
333                let after_think = &response_text[think_end + "</think>".len()..];
334                return after_think.trim().to_string();
335            }
336        }
337
338        // Handle models that might use other thinking patterns
339        // Some models use patterns like "Let me think about this..." followed by the actual answer
340        if response_text.starts_with("Let me think") || response_text.starts_with("I need to think")
341        {
342            // Look for common transition phrases that indicate the start of the actual answer
343            let transition_phrases = [
344                "Here's my answer:",
345                "My answer is:",
346                "To answer your question:",
347                "Based on the search results:",
348                "The answer is:",
349                "\n\n**", // Common formatting transition
350                "\n\nQuick Answer:",
351                "\n\n##", // Markdown heading transition
352            ];
353
354            for phrase in &transition_phrases {
355                if let Some(pos) = response_text.find(phrase) {
356                    let answer_start = if phrase.starts_with('\n') {
357                        pos + 2 // Skip the newlines
358                    } else {
359                        pos + phrase.len()
360                    };
361                    return response_text[answer_start..].trim().to_string();
362                }
363            }
364        }
365
366        // For other models or no thinking pattern detected, return the full response
367        response_text.to_string()
368    }
369
370    /// Extract citations from LLM response
371    fn extract_citations(&self, response_text: &str, results: &[RagSearchResult]) -> Vec<Citation> {
372        let mut citations = Vec::new();
373
374        // Simple citation extraction - look for [Source N] patterns
375        for (i, result) in results.iter().enumerate() {
376            let source_ref = format!("[Source {}]", i + 1);
377            if response_text.contains(&source_ref) {
378                citations.push(Citation {
379                    source_id: result.id.clone(),
380                    source_title: result
381                        .title
382                        .clone()
383                        .unwrap_or_else(|| "Untitled".to_string()),
384                    source_url: Some(result.source_path.to_string_lossy().to_string()),
385                    relevance_score: result.score,
386                    excerpt: result.content.chars().take(200).collect(),
387                });
388            }
389        }
390
391        citations
392    }
393
394    /// OpenAI GPT integration with streaming support
395    async fn synthesize_with_openai(
396        &self,
397        query: &str,
398        results: &[RagSearchResult],
399    ) -> Result<LlmResponse> {
400        let api_key = self
401            .config
402            .openai_api_key
403            .as_ref()
404            .ok_or_else(|| anyhow!("OpenAI API key not configured"))?;
405
406        let model = self.get_model_name(&LlmProvider::OpenAI);
407        let system_prompt = self.create_system_prompt();
408        let user_prompt = self.create_user_prompt(query, results);
409
410        let payload = serde_json::json!({
411            "model": model,
412            "messages": [
413                {
414                    "role": "system",
415                    "content": system_prompt
416                },
417                {
418                    "role": "user",
419                    "content": user_prompt
420                }
421            ],
422            "max_tokens": self.config.max_tokens,
423            "temperature": self.config.temperature,
424            "stream": self.config.streaming
425        });
426
427        let response = self
428            .http_client
429            .post("https://api.openai.com/v1/chat/completions")
430            .header("Authorization", format!("Bearer {}", api_key))
431            .header("Content-Type", "application/json")
432            .json(&payload)
433            .send()
434            .await?;
435
436        if !response.status().is_success() {
437            let error_text = response.text().await?;
438            return Err(anyhow!("OpenAI API error: {}", error_text));
439        }
440
441        let response_json: serde_json::Value = response.json().await?;
442
443        let raw_answer = response_json["choices"][0]["message"]["content"]
444            .as_str()
445            .ok_or_else(|| anyhow!("Invalid OpenAI response format"))?;
446        let answer = self.extract_final_answer(raw_answer);
447
448        let usage = &response_json["usage"];
449        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
450        let finish_reason = response_json["choices"][0]["finish_reason"]
451            .as_str()
452            .map(|s| s.to_string());
453
454        let citations = self.extract_citations(&answer, results);
455
456        Ok(LlmResponse {
457            answer,
458            sources_used: results.iter().map(|r| r.id.clone()).collect(),
459            confidence: Some(0.9), // OpenAI typically high confidence
460            provider_used: LlmProvider::OpenAI,
461            model_used: model,
462            tokens_used,
463            response_time_ms: 0, // Will be set by caller
464            finish_reason,
465            citations,
466        })
467    }
468
    /// Anthropic Claude integration with function calling support
    ///
    /// Uses the Messages API: the system prompt goes in the top-level
    /// `system` field rather than a system message. `response_time_ms`
    /// is left at 0 for the caller to fill in.
    ///
    /// # Errors
    /// Fails when the API key is missing, the request errors, the API
    /// returns a non-success status, or the response body is malformed.
    async fn synthesize_with_anthropic(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .anthropic_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("Anthropic API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::Anthropic);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        let payload = serde_json::json!({
            "model": model,
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "system": system_prompt,
            "messages": [
                {
                    "role": "user",
                    "content": user_prompt
                }
            ]
        });

        let response = self
            .http_client
            .post("https://api.anthropic.com/v1/messages")
            // Anthropic authenticates via x-api-key, not a Bearer token.
            .header("x-api-key", api_key)
            .header("content-type", "application/json")
            // Pinned API version required by the Messages endpoint.
            .header("anthropic-version", "2023-06-01")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("Anthropic API error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        // Messages API returns a content array; the text lives in the
        // first block.
        let raw_answer = response_json["content"][0]["text"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid Anthropic response format"))?;
        let answer = self.extract_final_answer(raw_answer);

        let usage = &response_json["usage"];
        // NOTE(review): only output tokens are counted here, unlike the
        // OpenAI-style handlers which use total_tokens — confirm intended.
        let tokens_used = usage["output_tokens"].as_u64().map(|t| t as u32);
        let finish_reason = response_json["stop_reason"].as_str().map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.85), // Claude typically good confidence
            provider_used: LlmProvider::Anthropic,
            model_used: model,
            tokens_used,
            response_time_ms: 0,
            finish_reason,
            citations,
        })
    }
538
    /// Groq fast inference integration for ultra-fast responses
    ///
    /// Uses Groq's OpenAI-compatible chat-completions endpoint. Streaming
    /// is always disabled here regardless of `config.streaming`.
    /// `response_time_ms` is left at 0 for the caller to fill in.
    ///
    /// # Errors
    /// Fails when the API key is missing, the request errors, the API
    /// returns a non-success status, or the response body is malformed.
    async fn synthesize_with_groq(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .groq_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("Groq API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::Groq);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            // Streaming intentionally off for Groq; the full answer is
            // buffered regardless of the global streaming flag.
            "stream": false
        });

        let response = self
            .http_client
            .post("https://api.groq.com/openai/v1/chat/completions")
            .header("Authorization", format!("Bearer {}", api_key))
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            // Status must be read before .text() consumes the response.
            let status = response.status();
            let error_text = response.text().await?;
            log::error!(
                "Groq API error - Status: {}, Response: {}",
                status,
                error_text
            );
            return Err(anyhow!("Groq API error ({}): {}", status, error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let raw_answer = response_json["choices"][0]["message"]["content"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid Groq response format"))?;
        // Strip any <thinking>/<think> preamble from reasoning models.
        let answer = self.extract_final_answer(raw_answer);

        let usage = &response_json["usage"];
        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
        let finish_reason = response_json["choices"][0]["finish_reason"]
            .as_str()
            .map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.8), // Groq usually good quality
            provider_used: LlmProvider::Groq,
            model_used: model,
            tokens_used,
            response_time_ms: 0,
            finish_reason,
            citations,
        })
    }
619
    /// OpenRouter multi-model gateway for access to multiple providers
    ///
    /// OpenAI-compatible request; the extra headers identify this app to
    /// OpenRouter. `response_time_ms` is left at 0 for the caller.
    ///
    /// # Errors
    /// Fails when the API key is missing, the request errors, the API
    /// returns a non-success status, or the response body is malformed.
    async fn synthesize_with_openrouter(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .openrouter_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("OpenRouter API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::OpenRouter);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "stream": self.config.streaming
        });

        let response = self
            .http_client
            .post("https://openrouter.ai/api/v1/chat/completions")
            .header("Authorization", format!("Bearer {}", api_key))
            .header("Content-Type", "application/json")
            // App-identification headers expected by OpenRouter.
            .header("HTTP-Referer", "https://github.com/neur0map/manx")
            .header("X-Title", "Manx Documentation Finder")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("OpenRouter API error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let raw_answer = response_json["choices"][0]["message"]["content"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid OpenRouter response format"))?;
        // Strip any <thinking>/<think> preamble from reasoning models.
        let answer = self.extract_final_answer(raw_answer);

        let usage = &response_json["usage"];
        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
        let finish_reason = response_json["choices"][0]["finish_reason"]
            .as_str()
            .map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.82), // Varies by underlying model
            provider_used: LlmProvider::OpenRouter,
            model_used: model,
            tokens_used,
            response_time_ms: 0,
            finish_reason,
            citations,
        })
    }
696
697    /// HuggingFace Router API for open-source models
698    async fn synthesize_with_huggingface(
699        &self,
700        query: &str,
701        results: &[RagSearchResult],
702    ) -> Result<LlmResponse> {
703        let api_key = self
704            .config
705            .huggingface_api_key
706            .as_ref()
707            .ok_or_else(|| anyhow!("HuggingFace API key not configured"))?;
708
709        let model = self.get_model_name(&LlmProvider::HuggingFace);
710        let system_prompt = self.create_system_prompt();
711        let user_prompt = self.create_user_prompt(query, results);
712
713        // Use OpenAI-compatible chat completions format
714        let payload = serde_json::json!({
715            "model": model,
716            "messages": [
717                {"role": "system", "content": system_prompt},
718                {"role": "user", "content": user_prompt}
719            ],
720            "max_tokens": self.config.max_tokens,
721            "temperature": self.config.temperature
722        });
723
724        let response = self
725            .http_client
726            .post("https://router.huggingface.co/v1/chat/completions")
727            .header("Authorization", format!("Bearer {}", api_key))
728            .header("Content-Type", "application/json")
729            .json(&payload)
730            .send()
731            .await?;
732
733        if !response.status().is_success() {
734            let error_text = response.text().await?;
735            return Err(anyhow!("HuggingFace API error: {}", error_text));
736        }
737
738        let response_json: serde_json::Value = response.json().await?;
739
740        let raw_answer = if let Some(choices) = response_json["choices"].as_array() {
741            if let Some(first_choice) = choices.first() {
742                if let Some(message) = first_choice["message"].as_object() {
743                    message["content"].as_str().unwrap_or("")
744                } else {
745                    return Err(anyhow!(
746                        "Invalid HuggingFace response format: missing message"
747                    ));
748                }
749            } else {
750                return Err(anyhow!(
751                    "Invalid HuggingFace response format: empty choices"
752                ));
753            }
754        } else {
755            return Err(anyhow!(
756                "Invalid HuggingFace response format: missing choices"
757            ));
758        };
759
760        let answer = self.extract_final_answer(raw_answer);
761
762        let citations = self.extract_citations(&answer, results);
763
764        Ok(LlmResponse {
765            answer,
766            sources_used: results.iter().map(|r| r.id.clone()).collect(),
767            confidence: Some(0.75), // Open source models vary
768            provider_used: LlmProvider::HuggingFace,
769            model_used: model,
770            tokens_used: response_json["usage"]["total_tokens"]
771                .as_u64()
772                .map(|t| t as u32),
773            response_time_ms: 0,
774            finish_reason: response_json["choices"][0]["finish_reason"]
775                .as_str()
776                .map(|s| s.to_string()),
777            citations,
778        })
779    }
780
    /// Custom endpoint integration for self-hosted models
    ///
    /// Targets `{endpoint}/v1/chat/completions` using the OpenAI-compatible
    /// request shape. `response_time_ms` is left at 0 for the caller.
    ///
    /// NOTE(review): no Authorization header is sent to custom endpoints —
    /// confirm self-hosted deployments are expected to be unauthenticated.
    ///
    /// # Errors
    /// Fails when no endpoint is configured, the request errors, the
    /// endpoint returns a non-success status, or the body is malformed.
    async fn synthesize_with_custom(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let endpoint = self
            .config
            .custom_endpoint
            .as_ref()
            .ok_or_else(|| anyhow!("Custom endpoint not configured"))?;

        let model = self.get_model_name(&LlmProvider::Custom);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        // Use OpenAI-compatible format for custom endpoints
        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "stream": self.config.streaming
        });

        let response = self
            .http_client
            .post(format!("{}/v1/chat/completions", endpoint))
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("Custom endpoint error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let raw_answer = response_json["choices"][0]["message"]["content"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid custom endpoint response format"))?;
        // Strip any <thinking>/<think> preamble from reasoning models.
        let answer = self.extract_final_answer(raw_answer);

        // Defensive `.get()` access: self-hosted servers may omit usage
        // and finish_reason fields entirely.
        let usage = &response_json["usage"];
        let tokens_used = usage
            .get("total_tokens")
            .and_then(|t| t.as_u64())
            .map(|t| t as u32);
        let finish_reason = response_json["choices"][0]
            .get("finish_reason")
            .and_then(|r| r.as_str())
            .map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.8), // Assume reasonable confidence for custom
            provider_used: LlmProvider::Custom,
            model_used: model,
            tokens_used,
            response_time_ms: 0,
            finish_reason,
            citations,
        })
    }
859}
860
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a client with default configuration for exercising
    /// `extract_final_answer` in isolation.
    fn test_client() -> LlmClient {
        LlmClient::new(LlmConfig::default()).unwrap()
    }

    #[test]
    fn test_extract_final_answer_with_thinking_tags() {
        // A response wrapped in <thinking>…</thinking>, as emitted by some
        // reasoning models; only the final answer should survive extraction.
        let raw = r#"<thinking>
Let me analyze this query about Rust error handling.

The user is asking about Result types and how to handle errors properly.
I should explain the basics of Result<T, E> and common patterns.
</thinking>

**Quick Answer**
Rust uses `Result<T, E>` for error handling, where `T` is the success type and `E` is the error type.

**Key Points**
- Use `?` operator for error propagation
- `unwrap()` panics on error, avoid in production
- `expect()` provides custom panic message
- Pattern match with `match` for comprehensive handling"#;

        let cleaned = test_client().extract_final_answer(raw);

        // The reasoning block is gone, the answer content is intact.
        assert!(!cleaned.contains("<thinking>"));
        assert!(!cleaned.contains("</thinking>"));
        assert!(cleaned.contains("**Quick Answer**"));
        assert!(cleaned.contains("Result<T, E>"));
    }

    #[test]
    fn test_extract_final_answer_with_think_tags() {
        // Same scenario with the shorter <think>…</think> tag variant.
        let raw = r#"<think>
This question is about JavaScript async/await patterns.

The user wants to understand how to handle asynchronous operations.
I should provide clear examples and best practices.
</think>

**Quick Answer**
Use `async/await` for handling asynchronous operations in JavaScript.

**Key Points**
- `async` functions return Promises
- `await` pauses execution until Promise resolves
- Use try/catch for error handling
- Avoid callback hell with Promise chains"#;

        let cleaned = test_client().extract_final_answer(raw);

        assert!(!cleaned.contains("<think>"));
        assert!(!cleaned.contains("</think>"));
        assert!(cleaned.contains("**Quick Answer**"));
        assert!(cleaned.contains("async/await"));
    }

    #[test]
    fn test_extract_final_answer_without_thinking() {
        // A response with no reasoning markup must pass through unchanged.
        let input = r#"**Quick Answer**
This is a normal response without thinking tags.

**Key Points**
- Point 1
- Point 2"#;

        let cleaned = test_client().extract_final_answer(input);

        assert_eq!(cleaned, input);
    }

    #[test]
    fn test_extract_final_answer_with_thinking_prefix() {
        // Untagged "thinking out loud" preamble before the structured answer;
        // extraction should drop the preamble and keep the answer sections.
        let raw = r#"Let me think about this question carefully...

I need to consider the different aspects of the query.

Based on the search results:

**Quick Answer**
Here is the actual answer after thinking.

**Key Points**
- Important point 1
- Important point 2"#;

        let cleaned = test_client().extract_final_answer(raw);

        assert!(!cleaned.contains("Let me think"));
        assert!(cleaned.contains("**Quick Answer**"));
        assert!(cleaned.contains("Here is the actual answer"));
    }
}