// manx_cli/rag/llm.rs

1//! Multi-provider LLM integration for answer synthesis
2//!
3//! Supports OpenAI GPT, Anthropic Claude, Groq, OpenRouter, HuggingFace, and custom endpoints
4//! with automatic failover and comprehensive error handling.
5
6use anyhow::{anyhow, Result};
7use serde::{Deserialize, Serialize};
8
9use crate::rag::RagSearchResult;
10
/// Configuration for LLM integration supporting multiple providers
///
/// All credentials are optional; a provider counts as available only when its
/// key (or `custom_endpoint`) is present and non-empty (see `LlmClient::has_*_key`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmConfig {
    // Per-provider API keys. `None` or an empty string disables the provider.
    pub openai_api_key: Option<String>,
    pub anthropic_api_key: Option<String>,
    pub groq_api_key: Option<String>,
    pub openrouter_api_key: Option<String>,
    pub huggingface_api_key: Option<String>,
    // Base URL of an OpenAI-compatible self-hosted endpoint; requests to it
    // carry no auth header (see `synthesize_with_custom`).
    pub custom_endpoint: Option<String>,
    // Provider tried first; `Auto` defers entirely to `fallback_providers`.
    pub preferred_provider: LlmProvider,
    // Providers tried in order when the preferred one is unavailable or fails.
    pub fallback_providers: Vec<LlmProvider>,
    // Per-request HTTP timeout applied to the shared reqwest client.
    pub timeout_seconds: u64,
    // Completion budget and sampling temperature forwarded to every provider.
    pub max_tokens: u32,
    pub temperature: f32,
    // Explicit model override for ALL providers; `None` selects a
    // per-provider default (see `get_model_name`).
    pub model_name: Option<String>,
    // Ask for streaming responses where the integration forwards it.
    pub streaming: bool,
}
28
impl Default for LlmConfig {
    /// Conservative defaults: no credentials, automatic provider selection
    /// over the four hosted providers, a 30s timeout, and a low temperature
    /// (0.1) suited to factual documentation answers.
    fn default() -> Self {
        Self {
            openai_api_key: None,
            anthropic_api_key: None,
            groq_api_key: None,
            openrouter_api_key: None,
            huggingface_api_key: None,
            custom_endpoint: None,
            preferred_provider: LlmProvider::Auto,
            // Order matters: this is the priority used by `get_best_provider`
            // and `try_fallback_providers`.
            fallback_providers: vec![
                LlmProvider::OpenAI,
                LlmProvider::Anthropic,
                LlmProvider::Groq,
                LlmProvider::OpenRouter,
            ],
            timeout_seconds: 30,
            max_tokens: 1000,
            temperature: 0.1,
            model_name: None,
            streaming: false,
        }
    }
}
53
/// Available LLM providers
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum LlmProvider {
    // Selection policy, not a callable provider: pick the first available
    // entry from `fallback_providers`. `is_provider_available` returns false
    // for it and dispatch treats it as an error.
    Auto,
    OpenAI,
    Anthropic,
    Groq,
    OpenRouter,
    HuggingFace,
    // Self-hosted OpenAI-compatible endpoint (`LlmConfig::custom_endpoint`).
    Custom,
}
65
/// LLM response with comprehensive metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmResponse {
    // Synthesized answer text, after thinking-tag stripping
    // (see `extract_final_answer`).
    pub answer: String,
    // IDs of ALL search results passed to the provider (not just cited ones).
    pub sources_used: Vec<String>,
    // Hard-coded per-provider heuristic, not reported by the API.
    pub confidence: Option<f32>,
    pub provider_used: LlmProvider,
    pub model_used: String,
    // Token count as reported by the provider's `usage` object, when present.
    pub tokens_used: Option<u32>,
    // Wall-clock request duration; filled in by `synthesize_answer`, the
    // per-provider methods leave it at 0.
    pub response_time_ms: u64,
    pub finish_reason: Option<String>,
    // Results actually referenced as `[Source N]` in the answer text.
    pub citations: Vec<Citation>,
}
79
/// Citation information linking to source documents
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Citation {
    pub source_id: String,
    // Falls back to "Untitled" when the result has no title.
    pub source_title: String,
    // Stringified source path of the result (not necessarily a URL).
    pub source_url: Option<String>,
    // Relevance score copied from the originating search result.
    pub relevance_score: f32,
    // First 200 chars of the source content.
    pub excerpt: String,
}
89
/// Multi-provider LLM client with automatic failover
///
/// Holds the configuration plus one shared reqwest client (with the
/// configured timeout) reused across all provider requests.
pub struct LlmClient {
    pub(crate) config: LlmConfig,
    pub(crate) http_client: reqwest::Client,
}
95
96impl LlmClient {
97    /// Create a new LLM client with configuration
98    pub fn new(config: LlmConfig) -> Result<Self> {
99        let http_client = reqwest::Client::builder()
100            .timeout(std::time::Duration::from_secs(config.timeout_seconds))
101            .build()?;
102
103        Ok(Self {
104            config,
105            http_client,
106        })
107    }
108
109    /// Check if any LLM provider is available
110    pub fn is_available(&self) -> bool {
111        self.has_openai_key()
112            || self.has_anthropic_key()
113            || self.has_groq_key()
114            || self.has_openrouter_key()
115            || self.has_huggingface_key()
116            || self.config.custom_endpoint.is_some()
117    }
118
119    /// Check availability of specific providers
120    pub fn has_openai_key(&self) -> bool {
121        self.config
122            .openai_api_key
123            .as_ref()
124            .is_some_and(|key| !key.is_empty())
125    }
126
127    pub fn has_anthropic_key(&self) -> bool {
128        self.config
129            .anthropic_api_key
130            .as_ref()
131            .is_some_and(|key| !key.is_empty())
132    }
133
134    pub fn has_groq_key(&self) -> bool {
135        self.config
136            .groq_api_key
137            .as_ref()
138            .is_some_and(|key| !key.is_empty())
139    }
140
141    pub fn has_openrouter_key(&self) -> bool {
142        self.config
143            .openrouter_api_key
144            .as_ref()
145            .is_some_and(|key| !key.is_empty())
146    }
147
148    pub fn has_huggingface_key(&self) -> bool {
149        self.config
150            .huggingface_api_key
151            .as_ref()
152            .is_some_and(|key| !key.is_empty())
153    }
154
155    /// Get the best available provider based on configuration and API key availability
156    pub fn get_best_provider(&self) -> Option<LlmProvider> {
157        if self.config.preferred_provider != LlmProvider::Auto {
158            // Check if preferred provider is available
159            if self.is_provider_available(&self.config.preferred_provider) {
160                return Some(self.config.preferred_provider.clone());
161            }
162        }
163
164        // Try fallback providers in order
165        for provider in &self.config.fallback_providers {
166            if self.is_provider_available(provider) {
167                return Some(provider.clone());
168            }
169        }
170
171        None
172    }
173
174    /// Check if a specific provider is available
175    pub fn is_provider_available(&self, provider: &LlmProvider) -> bool {
176        match provider {
177            LlmProvider::OpenAI => self.has_openai_key(),
178            LlmProvider::Anthropic => self.has_anthropic_key(),
179            LlmProvider::Groq => self.has_groq_key(),
180            LlmProvider::OpenRouter => self.has_openrouter_key(),
181            LlmProvider::HuggingFace => self.has_huggingface_key(),
182            LlmProvider::Custom => self.config.custom_endpoint.is_some(),
183            LlmProvider::Auto => false, // Auto is not a real provider
184        }
185    }
186
187    /// Synthesize an answer from search results using the best available provider
188    pub async fn synthesize_answer(
189        &self,
190        query: &str,
191        results: &[RagSearchResult],
192    ) -> Result<LlmResponse> {
193        let provider = self
194            .get_best_provider()
195            .ok_or_else(|| anyhow!("No LLM provider available"))?;
196
197        let start_time = std::time::Instant::now();
198
199        let response = match provider {
200            LlmProvider::OpenAI => self.synthesize_with_openai(query, results).await,
201            LlmProvider::Anthropic => self.synthesize_with_anthropic(query, results).await,
202            LlmProvider::Groq => self.synthesize_with_groq(query, results).await,
203            LlmProvider::OpenRouter => self.synthesize_with_openrouter(query, results).await,
204            LlmProvider::HuggingFace => self.synthesize_with_huggingface(query, results).await,
205            LlmProvider::Custom => self.synthesize_with_custom(query, results).await,
206            LlmProvider::Auto => unreachable!(),
207        };
208
209        // If primary provider fails, try fallback providers
210        match response {
211            Ok(mut resp) => {
212                resp.response_time_ms = start_time.elapsed().as_millis() as u64;
213                Ok(resp)
214            }
215            Err(e) => {
216                log::warn!("Primary provider {:?} failed: {}", provider, e);
217                self.try_fallback_providers(query, results, &provider).await
218            }
219        }
220    }
221
222    /// Try fallback providers if primary fails
223    async fn try_fallback_providers(
224        &self,
225        query: &str,
226        results: &[RagSearchResult],
227        failed_provider: &LlmProvider,
228    ) -> Result<LlmResponse> {
229        for provider in &self.config.fallback_providers {
230            if provider != failed_provider && self.is_provider_available(provider) {
231                log::info!("Trying fallback provider: {:?}", provider);
232
233                let start_time = std::time::Instant::now();
234                let response = match provider {
235                    LlmProvider::OpenAI => self.synthesize_with_openai(query, results).await,
236                    LlmProvider::Anthropic => self.synthesize_with_anthropic(query, results).await,
237                    LlmProvider::Groq => self.synthesize_with_groq(query, results).await,
238                    LlmProvider::OpenRouter => {
239                        self.synthesize_with_openrouter(query, results).await
240                    }
241                    LlmProvider::HuggingFace => {
242                        self.synthesize_with_huggingface(query, results).await
243                    }
244                    LlmProvider::Custom => self.synthesize_with_custom(query, results).await,
245                    LlmProvider::Auto => continue,
246                };
247
248                if let Ok(mut resp) = response {
249                    resp.response_time_ms = start_time.elapsed().as_millis() as u64;
250                    return Ok(resp);
251                }
252            }
253        }
254
255        Err(anyhow!("All LLM providers failed"))
256    }
257
258    /// Get the appropriate model name for a provider
259    fn get_model_name(&self, provider: &LlmProvider) -> String {
260        if let Some(model) = &self.config.model_name {
261            return model.clone();
262        }
263
264        match provider {
265            LlmProvider::OpenAI => "gpt-4o-mini".to_string(),
266            LlmProvider::Anthropic => "claude-3-haiku-20240307".to_string(),
267            LlmProvider::Groq => "llama-3.1-8b-instant".to_string(),
268            LlmProvider::OpenRouter => "openai/gpt-3.5-turbo".to_string(),
269            LlmProvider::HuggingFace => "microsoft/DialoGPT-medium".to_string(),
270            LlmProvider::Custom => "custom-model".to_string(),
271            LlmProvider::Auto => "auto".to_string(),
272        }
273    }
274
    /// Create concise system prompt focused on clean, scannable output
    ///
    /// Sent verbatim as the system message for every provider. The
    /// `[Source N]` citation convention stated here must stay in sync with
    /// `create_user_prompt` and `extract_citations`.
    fn create_system_prompt(&self) -> String {
        // NOTE: raw string literal — all whitespace (including trailing
        // spaces) is part of the prompt and is preserved exactly.
        r#"You are a concise technical documentation assistant. Provide clear, scannable answers based ONLY on the provided search results.

RESPONSE FORMAT:
1. **Quick Answer** (1-2 sentences max)
2. **Key Points** (bullet points, max 4 items)  
3. **Code Example** (if available - keep it short and practical)

RULES:
- Be extremely concise and scannable
- Use bullet points and short paragraphs
- Only include essential information
- Cite sources as [Source N] 
- Never add information not in the sources
- Focus on what developers need to know immediately

STYLE:
- Write for busy developers who want quick answers
- Use clear, simple language
- Keep code examples minimal but complete
- Prioritize readability over completeness"#.to_string()
    }
298
299    /// Create user prompt with query and search results
300    fn create_user_prompt(&self, query: &str, results: &[RagSearchResult]) -> String {
301        let mut prompt = format!("Question: {}\n\nSearch Results:\n\n", query);
302
303        for (i, result) in results.iter().enumerate() {
304            prompt.push_str(&format!(
305                "[Source {}] {}\nURL: {}\nContent: {}\n\n",
306                i + 1,
307                result.title.as_ref().unwrap_or(&"Untitled".to_string()),
308                result.source_path.to_string_lossy(),
309                result.content.chars().take(1000).collect::<String>()
310            ));
311        }
312
313        prompt.push_str("\nPlease provide a comprehensive answer based on these search results.");
314        prompt
315    }
316
317    /// Extract the actual answer from responses that may contain thinking content
318    fn extract_final_answer(&self, response_text: &str) -> String {
319        // Handle models with thinking capabilities - check for both <thinking> and <think> tags
320        if response_text.contains("<thinking>") && response_text.contains("</thinking>") {
321            // Find the end of the thinking section
322            if let Some(thinking_end) = response_text.find("</thinking>") {
323                let after_thinking = &response_text[thinking_end + "</thinking>".len()..];
324                return after_thinking.trim().to_string();
325            }
326        }
327
328        // Handle models that use <think> tags instead of <thinking>
329        if response_text.contains("<think>") && response_text.contains("</think>") {
330            // Find the end of the think section
331            if let Some(think_end) = response_text.find("</think>") {
332                let after_think = &response_text[think_end + "</think>".len()..];
333                return after_think.trim().to_string();
334            }
335        }
336
337        // Handle models that might use other thinking patterns
338        // Some models use patterns like "Let me think about this..." followed by the actual answer
339        if response_text.starts_with("Let me think") || response_text.starts_with("I need to think")
340        {
341            // Look for common transition phrases that indicate the start of the actual answer
342            let transition_phrases = [
343                "Here's my answer:",
344                "My answer is:",
345                "To answer your question:",
346                "Based on the search results:",
347                "The answer is:",
348                "\n\n**", // Common formatting transition
349                "\n\nQuick Answer:",
350                "\n\n##", // Markdown heading transition
351            ];
352
353            for phrase in &transition_phrases {
354                if let Some(pos) = response_text.find(phrase) {
355                    let answer_start = if phrase.starts_with('\n') {
356                        pos + 2 // Skip the newlines
357                    } else {
358                        pos + phrase.len()
359                    };
360                    return response_text[answer_start..].trim().to_string();
361                }
362            }
363        }
364
365        // For other models or no thinking pattern detected, return the full response
366        response_text.to_string()
367    }
368
369    /// Extract citations from LLM response
370    fn extract_citations(&self, response_text: &str, results: &[RagSearchResult]) -> Vec<Citation> {
371        let mut citations = Vec::new();
372
373        // Simple citation extraction - look for [Source N] patterns
374        for (i, result) in results.iter().enumerate() {
375            let source_ref = format!("[Source {}]", i + 1);
376            if response_text.contains(&source_ref) {
377                citations.push(Citation {
378                    source_id: result.id.clone(),
379                    source_title: result
380                        .title
381                        .clone()
382                        .unwrap_or_else(|| "Untitled".to_string()),
383                    source_url: Some(result.source_path.to_string_lossy().to_string()),
384                    relevance_score: result.score,
385                    excerpt: result.content.chars().take(200).collect(),
386                });
387            }
388        }
389
390        citations
391    }
392
393    /// OpenAI GPT integration with streaming support
394    async fn synthesize_with_openai(
395        &self,
396        query: &str,
397        results: &[RagSearchResult],
398    ) -> Result<LlmResponse> {
399        let api_key = self
400            .config
401            .openai_api_key
402            .as_ref()
403            .ok_or_else(|| anyhow!("OpenAI API key not configured"))?;
404
405        let model = self.get_model_name(&LlmProvider::OpenAI);
406        let system_prompt = self.create_system_prompt();
407        let user_prompt = self.create_user_prompt(query, results);
408
409        let payload = serde_json::json!({
410            "model": model,
411            "messages": [
412                {
413                    "role": "system",
414                    "content": system_prompt
415                },
416                {
417                    "role": "user",
418                    "content": user_prompt
419                }
420            ],
421            "max_tokens": self.config.max_tokens,
422            "temperature": self.config.temperature,
423            "stream": self.config.streaming
424        });
425
426        let response = self
427            .http_client
428            .post("https://api.openai.com/v1/chat/completions")
429            .header("Authorization", format!("Bearer {}", api_key))
430            .header("Content-Type", "application/json")
431            .json(&payload)
432            .send()
433            .await?;
434
435        if !response.status().is_success() {
436            let error_text = response.text().await?;
437            return Err(anyhow!("OpenAI API error: {}", error_text));
438        }
439
440        let response_json: serde_json::Value = response.json().await?;
441
442        let raw_answer = response_json["choices"][0]["message"]["content"]
443            .as_str()
444            .ok_or_else(|| anyhow!("Invalid OpenAI response format"))?;
445        let answer = self.extract_final_answer(raw_answer);
446
447        let usage = &response_json["usage"];
448        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
449        let finish_reason = response_json["choices"][0]["finish_reason"]
450            .as_str()
451            .map(|s| s.to_string());
452
453        let citations = self.extract_citations(&answer, results);
454
455        Ok(LlmResponse {
456            answer,
457            sources_used: results.iter().map(|r| r.id.clone()).collect(),
458            confidence: Some(0.9), // OpenAI typically high confidence
459            provider_used: LlmProvider::OpenAI,
460            model_used: model,
461            tokens_used,
462            response_time_ms: 0, // Will be set by caller
463            finish_reason,
464            citations,
465        })
466    }
467
    /// Anthropic Claude integration with function calling support
    ///
    /// Uses the Messages API: the system prompt travels in the top-level
    /// `system` field rather than inside the `messages` array.
    async fn synthesize_with_anthropic(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .anthropic_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("Anthropic API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::Anthropic);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        let payload = serde_json::json!({
            "model": model,
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "system": system_prompt,
            "messages": [
                {
                    "role": "user",
                    "content": user_prompt
                }
            ]
        });

        let response = self
            .http_client
            .post("https://api.anthropic.com/v1/messages")
            // Anthropic authenticates via `x-api-key` (not a Bearer token)
            // and requires a pinned API version header.
            .header("x-api-key", api_key)
            .header("content-type", "application/json")
            .header("anthropic-version", "2023-06-01")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("Anthropic API error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        // Messages API returns content as an array of blocks; the first
        // block's `text` is taken as the answer.
        let raw_answer = response_json["content"][0]["text"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid Anthropic response format"))?;
        let answer = self.extract_final_answer(raw_answer);

        let usage = &response_json["usage"];
        // NOTE(review): only `output_tokens` is counted, while the
        // OpenAI-style providers report `total_tokens` — confirm intended.
        let tokens_used = usage["output_tokens"].as_u64().map(|t| t as u32);
        let finish_reason = response_json["stop_reason"].as_str().map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.85), // Claude typically good confidence
            provider_used: LlmProvider::Anthropic,
            model_used: model,
            tokens_used,
            response_time_ms: 0, // Stamped by the caller
            finish_reason,
            citations,
        })
    }
537
    /// Groq fast inference integration for ultra-fast responses
    ///
    /// Groq exposes an OpenAI-compatible chat-completions API under its own
    /// host; the request/response shapes mirror `synthesize_with_openai`.
    async fn synthesize_with_groq(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .groq_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("Groq API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::Groq);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        // NOTE(review): `stream` is hard-coded to false here, unlike the
        // other providers which forward `config.streaming` — confirm intended.
        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "stream": false
        });

        let response = self
            .http_client
            .post("https://api.groq.com/openai/v1/chat/completions")
            .header("Authorization", format!("Bearer {}", api_key))
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            // Capture the status before `text()` consumes the response.
            let status = response.status();
            let error_text = response.text().await?;
            log::error!(
                "Groq API error - Status: {}, Response: {}",
                status,
                error_text
            );
            return Err(anyhow!("Groq API error ({}): {}", status, error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let raw_answer = response_json["choices"][0]["message"]["content"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid Groq response format"))?;
        let answer = self.extract_final_answer(raw_answer);

        let usage = &response_json["usage"];
        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
        let finish_reason = response_json["choices"][0]["finish_reason"]
            .as_str()
            .map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.8), // Groq usually good quality
            provider_used: LlmProvider::Groq,
            model_used: model,
            tokens_used,
            response_time_ms: 0, // Stamped by the caller
            finish_reason,
            citations,
        })
    }
618
    /// OpenRouter multi-model gateway for access to multiple providers
    ///
    /// OpenAI-compatible request; model IDs are namespaced
    /// (e.g. "openai/gpt-3.5-turbo").
    async fn synthesize_with_openrouter(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let api_key = self
            .config
            .openrouter_api_key
            .as_ref()
            .ok_or_else(|| anyhow!("OpenRouter API key not configured"))?;

        let model = self.get_model_name(&LlmProvider::OpenRouter);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "stream": self.config.streaming
        });

        let response = self
            .http_client
            .post("https://openrouter.ai/api/v1/chat/completions")
            .header("Authorization", format!("Bearer {}", api_key))
            .header("Content-Type", "application/json")
            // App-attribution headers used by OpenRouter for rankings/analytics.
            .header("HTTP-Referer", "https://github.com/neur0map/manx")
            .header("X-Title", "Manx Documentation Finder")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("OpenRouter API error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let raw_answer = response_json["choices"][0]["message"]["content"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid OpenRouter response format"))?;
        let answer = self.extract_final_answer(raw_answer);

        let usage = &response_json["usage"];
        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
        let finish_reason = response_json["choices"][0]["finish_reason"]
            .as_str()
            .map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.82), // Varies by underlying model
            provider_used: LlmProvider::OpenRouter,
            model_used: model,
            tokens_used,
            response_time_ms: 0, // Stamped by the caller
            finish_reason,
            citations,
        })
    }
695
696    /// HuggingFace Router API for open-source models
697    async fn synthesize_with_huggingface(
698        &self,
699        query: &str,
700        results: &[RagSearchResult],
701    ) -> Result<LlmResponse> {
702        let api_key = self
703            .config
704            .huggingface_api_key
705            .as_ref()
706            .ok_or_else(|| anyhow!("HuggingFace API key not configured"))?;
707
708        let model = self.get_model_name(&LlmProvider::HuggingFace);
709        let system_prompt = self.create_system_prompt();
710        let user_prompt = self.create_user_prompt(query, results);
711
712        // Use OpenAI-compatible chat completions format
713        let payload = serde_json::json!({
714            "model": model,
715            "messages": [
716                {"role": "system", "content": system_prompt},
717                {"role": "user", "content": user_prompt}
718            ],
719            "max_tokens": self.config.max_tokens,
720            "temperature": self.config.temperature
721        });
722
723        let response = self
724            .http_client
725            .post("https://router.huggingface.co/v1/chat/completions")
726            .header("Authorization", format!("Bearer {}", api_key))
727            .header("Content-Type", "application/json")
728            .json(&payload)
729            .send()
730            .await?;
731
732        if !response.status().is_success() {
733            let error_text = response.text().await?;
734            return Err(anyhow!("HuggingFace API error: {}", error_text));
735        }
736
737        let response_json: serde_json::Value = response.json().await?;
738
739        let raw_answer = if let Some(choices) = response_json["choices"].as_array() {
740            if let Some(first_choice) = choices.first() {
741                if let Some(message) = first_choice["message"].as_object() {
742                    message["content"].as_str().unwrap_or("")
743                } else {
744                    return Err(anyhow!(
745                        "Invalid HuggingFace response format: missing message"
746                    ));
747                }
748            } else {
749                return Err(anyhow!(
750                    "Invalid HuggingFace response format: empty choices"
751                ));
752            }
753        } else {
754            return Err(anyhow!(
755                "Invalid HuggingFace response format: missing choices"
756            ));
757        };
758
759        let answer = self.extract_final_answer(raw_answer);
760
761        let citations = self.extract_citations(&answer, results);
762
763        Ok(LlmResponse {
764            answer,
765            sources_used: results.iter().map(|r| r.id.clone()).collect(),
766            confidence: Some(0.75), // Open source models vary
767            provider_used: LlmProvider::HuggingFace,
768            model_used: model,
769            tokens_used: response_json["usage"]["total_tokens"]
770                .as_u64()
771                .map(|t| t as u32),
772            response_time_ms: 0,
773            finish_reason: response_json["choices"][0]["finish_reason"]
774                .as_str()
775                .map(|s| s.to_string()),
776            citations,
777        })
778    }
779
    /// Custom endpoint integration for self-hosted models
    ///
    /// Assumes an OpenAI-compatible `/v1/chat/completions` route appended to
    /// `config.custom_endpoint`. No auth header is sent — the endpoint is
    /// expected to be private/local.
    async fn synthesize_with_custom(
        &self,
        query: &str,
        results: &[RagSearchResult],
    ) -> Result<LlmResponse> {
        let endpoint = self
            .config
            .custom_endpoint
            .as_ref()
            .ok_or_else(|| anyhow!("Custom endpoint not configured"))?;

        let model = self.get_model_name(&LlmProvider::Custom);
        let system_prompt = self.create_system_prompt();
        let user_prompt = self.create_user_prompt(query, results);

        // Use OpenAI-compatible format for custom endpoints
        let payload = serde_json::json!({
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                }
            ],
            "max_tokens": self.config.max_tokens,
            "temperature": self.config.temperature,
            "stream": self.config.streaming
        });

        let response = self
            .http_client
            .post(format!("{}/v1/chat/completions", endpoint))
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await?;

        if !response.status().is_success() {
            let error_text = response.text().await?;
            return Err(anyhow!("Custom endpoint error: {}", error_text));
        }

        let response_json: serde_json::Value = response.json().await?;

        let raw_answer = response_json["choices"][0]["message"]["content"]
            .as_str()
            .ok_or_else(|| anyhow!("Invalid custom endpoint response format"))?;
        let answer = self.extract_final_answer(raw_answer);

        // Usage/finish_reason are optional here: self-hosted servers don't
        // always report them, hence the defensive `get(...)` lookups.
        let usage = &response_json["usage"];
        let tokens_used = usage
            .get("total_tokens")
            .and_then(|t| t.as_u64())
            .map(|t| t as u32);
        let finish_reason = response_json["choices"][0]
            .get("finish_reason")
            .and_then(|r| r.as_str())
            .map(|s| s.to_string());

        let citations = self.extract_citations(&answer, results);

        Ok(LlmResponse {
            answer,
            sources_used: results.iter().map(|r| r.id.clone()).collect(),
            confidence: Some(0.8), // Assume reasonable confidence for custom
            provider_used: LlmProvider::Custom,
            model_used: model,
            tokens_used,
            response_time_ms: 0, // Stamped by the caller
            finish_reason,
            citations,
        })
    }
858}
859
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a client from default configuration; `extract_final_answer`
    /// is pure string processing, so no network access is involved.
    fn test_client() -> LlmClient {
        LlmClient::new(LlmConfig::default()).unwrap()
    }

    #[test]
    fn test_extract_final_answer_with_thinking_tags() {
        // A reasoning block wrapped in <thinking>…</thinking> must be removed
        // while the visible answer survives intact.
        let input = r#"<thinking>
Let me analyze this query about Rust error handling.

The user is asking about Result types and how to handle errors properly.
I should explain the basics of Result<T, E> and common patterns.
</thinking>

**Quick Answer**
Rust uses `Result<T, E>` for error handling, where `T` is the success type and `E` is the error type.

**Key Points**
- Use `?` operator for error propagation
- `unwrap()` panics on error, avoid in production
- `expect()` provides custom panic message
- Pattern match with `match` for comprehensive handling"#;

        let output = test_client().extract_final_answer(input);

        assert!(!output.contains("<thinking>"));
        assert!(!output.contains("</thinking>"));
        assert!(output.contains("**Quick Answer**"));
        assert!(output.contains("Result<T, E>"));
    }

    #[test]
    fn test_extract_final_answer_with_think_tags() {
        // Same as above but for the shorter <think>…</think> tag variant.
        let input = r#"<think>
This question is about JavaScript async/await patterns.

The user wants to understand how to handle asynchronous operations.
I should provide clear examples and best practices.
</think>

**Quick Answer**
Use `async/await` for handling asynchronous operations in JavaScript.

**Key Points**
- `async` functions return Promises
- `await` pauses execution until Promise resolves
- Use try/catch for error handling
- Avoid callback hell with Promise chains"#;

        let output = test_client().extract_final_answer(input);

        assert!(!output.contains("<think>"));
        assert!(!output.contains("</think>"));
        assert!(output.contains("**Quick Answer**"));
        assert!(output.contains("async/await"));
    }

    #[test]
    fn test_extract_final_answer_without_thinking() {
        // A response with no reasoning markers must pass through unchanged.
        let input = r#"**Quick Answer**
This is a normal response without thinking tags.

**Key Points**
- Point 1
- Point 2"#;

        let output = test_client().extract_final_answer(input);

        assert_eq!(output, input);
    }

    #[test]
    fn test_extract_final_answer_with_thinking_prefix() {
        // Untagged "thinking out loud" preamble before the structured answer
        // must also be stripped.
        let input = r#"Let me think about this question carefully...

I need to consider the different aspects of the query.

Based on the search results:

**Quick Answer**
Here is the actual answer after thinking.

**Key Points**
- Important point 1
- Important point 2"#;

        let output = test_client().extract_final_answer(input);

        assert!(!output.contains("Let me think"));
        assert!(output.contains("**Quick Answer**"));
        assert!(output.contains("Here is the actual answer"));
    }
}