// manx_cli/rag/llm.rs
1//! Multi-provider LLM integration for answer synthesis
2//!
3//! Supports OpenAI GPT, Anthropic Claude, Groq, OpenRouter, HuggingFace, Z.AI, and custom endpoints
4//! with automatic failover and comprehensive error handling.
5
6use anyhow::{anyhow, Result};
7use serde::{Deserialize, Serialize};
8
9use crate::rag::RagSearchResult;
10
/// Configuration for LLM integration supporting multiple providers
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmConfig {
    /// OpenAI API key; `None` (or an empty string) disables the provider.
    pub openai_api_key: Option<String>,
    /// Anthropic API key; `None` (or an empty string) disables the provider.
    pub anthropic_api_key: Option<String>,
    /// Groq API key; `None` (or an empty string) disables the provider.
    pub groq_api_key: Option<String>,
    /// OpenRouter API key; `None` (or an empty string) disables the provider.
    pub openrouter_api_key: Option<String>,
    /// HuggingFace API key; `None` (or an empty string) disables the provider.
    pub huggingface_api_key: Option<String>,
    /// Z.AI API key; `None` (or an empty string) disables the provider.
    pub zai_api_key: Option<String>,
    // NOTE(review): presumably the base URL of an OpenAI-compatible endpoint —
    // confirm against `synthesize_with_custom` (defined elsewhere).
    pub custom_endpoint: Option<String>,
    /// Provider tried first; `LlmProvider::Auto` defers to the fallback list.
    pub preferred_provider: LlmProvider,
    /// Providers tried in order when the preferred one is unavailable or fails.
    pub fallback_providers: Vec<LlmProvider>,
    /// HTTP request timeout applied to the shared client, in seconds.
    pub timeout_seconds: u64,
    /// Maximum tokens requested per completion.
    pub max_tokens: u32,
    /// Sampling temperature forwarded to the provider.
    pub temperature: f32,
    /// Explicit model override; `None` selects a per-provider default
    /// (see `get_model_name`).
    pub model_name: Option<String>,
    /// Whether to request streaming responses (forwarded where supported).
    pub streaming: bool,
}
29
impl Default for LlmConfig {
    /// Conservative defaults: no credentials, automatic provider selection,
    /// and short, low-temperature completions.
    fn default() -> Self {
        Self {
            // No provider credentials are assumed; every key starts unset.
            openai_api_key: None,
            anthropic_api_key: None,
            groq_api_key: None,
            openrouter_api_key: None,
            huggingface_api_key: None,
            zai_api_key: None,
            custom_endpoint: None,
            // `Auto` defers to the first available fallback provider.
            preferred_provider: LlmProvider::Auto,
            // NOTE(review): HuggingFace and Custom are absent from the default
            // fallback chain — confirm that is intentional.
            fallback_providers: vec![
                LlmProvider::OpenAI,
                LlmProvider::Anthropic,
                LlmProvider::Groq,
                LlmProvider::OpenRouter,
                LlmProvider::Zai,
            ],
            timeout_seconds: 30,
            max_tokens: 1000,
            // Low temperature biases output toward deterministic answers.
            temperature: 0.1,
            // `None` lets `get_model_name` pick a per-provider default.
            model_name: None,
            streaming: false,
        }
    }
}
56
/// Available LLM providers
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum LlmProvider {
    /// Not a concrete backend: selects the best available provider at call time.
    Auto,
    /// OpenAI chat-completions API.
    OpenAI,
    /// Anthropic Messages API.
    Anthropic,
    /// Groq OpenAI-compatible API.
    Groq,
    /// OpenRouter multi-model gateway.
    OpenRouter,
    /// HuggingFace router (OpenAI-compatible endpoint).
    HuggingFace,
    /// Z.AI GLM coding endpoint.
    Zai,
    /// User-supplied endpoint from `LlmConfig::custom_endpoint`.
    Custom,
}
69
/// LLM response with comprehensive metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmResponse {
    /// Synthesized answer text (thinking preambles already stripped).
    pub answer: String,
    /// IDs of every search result handed to the model (not only cited ones).
    pub sources_used: Vec<String>,
    /// Static per-provider confidence heuristic — not a model-reported value.
    pub confidence: Option<f32>,
    /// Provider that actually produced this answer.
    pub provider_used: LlmProvider,
    /// Concrete model identifier used for the request.
    pub model_used: String,
    /// Token usage reported by the provider (total or output tokens,
    /// depending on the provider), when available.
    pub tokens_used: Option<u32>,
    /// Wall-clock latency in milliseconds, stamped by the caller.
    pub response_time_ms: u64,
    /// Provider-reported finish/stop reason, when present.
    pub finish_reason: Option<String>,
    /// Citations for `[Source N]` markers found in the answer.
    pub citations: Vec<Citation>,
}
83
/// Citation information linking to source documents
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Citation {
    /// ID of the cited search result.
    pub source_id: String,
    /// Result title, or `"Untitled"` when the result had none.
    pub source_title: String,
    /// Lossy string form of the result's source path.
    pub source_url: Option<String>,
    /// Relevance score copied from the search result.
    pub relevance_score: f32,
    /// First 200 characters of the cited content.
    pub excerpt: String,
}
93
/// Multi-provider LLM client with automatic failover
#[derive(Clone)]
pub struct LlmClient {
    /// Provider credentials, model selection, and request tuning.
    pub(crate) config: LlmConfig,
    /// Shared HTTP client, built with `config.timeout_seconds` as its timeout.
    pub(crate) http_client: reqwest::Client,
}
100
101impl LlmClient {
102    /// Create a new LLM client with configuration
103    pub fn new(config: LlmConfig) -> Result<Self> {
104        let http_client = reqwest::Client::builder()
105            .timeout(std::time::Duration::from_secs(config.timeout_seconds))
106            .build()?;
107
108        Ok(Self {
109            config,
110            http_client,
111        })
112    }
113
114    /// Check if any LLM provider is available
115    pub fn is_available(&self) -> bool {
116        self.has_openai_key()
117            || self.has_anthropic_key()
118            || self.has_groq_key()
119            || self.has_openrouter_key()
120            || self.has_huggingface_key()
121            || self.has_zai_key()
122            || self.config.custom_endpoint.is_some()
123    }
124
125    /// Check availability of specific providers
126    pub fn has_openai_key(&self) -> bool {
127        self.config
128            .openai_api_key
129            .as_ref()
130            .is_some_and(|key| !key.is_empty())
131    }
132
133    pub fn has_anthropic_key(&self) -> bool {
134        self.config
135            .anthropic_api_key
136            .as_ref()
137            .is_some_and(|key| !key.is_empty())
138    }
139
140    pub fn has_groq_key(&self) -> bool {
141        self.config
142            .groq_api_key
143            .as_ref()
144            .is_some_and(|key| !key.is_empty())
145    }
146
147    pub fn has_openrouter_key(&self) -> bool {
148        self.config
149            .openrouter_api_key
150            .as_ref()
151            .is_some_and(|key| !key.is_empty())
152    }
153
154    pub fn has_huggingface_key(&self) -> bool {
155        self.config
156            .huggingface_api_key
157            .as_ref()
158            .is_some_and(|key| !key.is_empty())
159    }
160
161    pub fn has_zai_key(&self) -> bool {
162        self.config
163            .zai_api_key
164            .as_ref()
165            .is_some_and(|key| !key.is_empty())
166    }
167
168    /// Get the best available provider based on configuration and API key availability
169    pub fn get_best_provider(&self) -> Option<LlmProvider> {
170        if self.config.preferred_provider != LlmProvider::Auto {
171            // Check if preferred provider is available
172            if self.is_provider_available(&self.config.preferred_provider) {
173                return Some(self.config.preferred_provider.clone());
174            }
175        }
176
177        // Try fallback providers in order
178        for provider in &self.config.fallback_providers {
179            if self.is_provider_available(provider) {
180                return Some(provider.clone());
181            }
182        }
183
184        None
185    }
186
187    /// Check if a specific provider is available
188    pub fn is_provider_available(&self, provider: &LlmProvider) -> bool {
189        match provider {
190            LlmProvider::OpenAI => self.has_openai_key(),
191            LlmProvider::Anthropic => self.has_anthropic_key(),
192            LlmProvider::Groq => self.has_groq_key(),
193            LlmProvider::OpenRouter => self.has_openrouter_key(),
194            LlmProvider::HuggingFace => self.has_huggingface_key(),
195            LlmProvider::Zai => self.has_zai_key(),
196            LlmProvider::Custom => self.config.custom_endpoint.is_some(),
197            LlmProvider::Auto => false, // Auto is not a real provider
198        }
199    }
200
201    /// Synthesize an answer from search results using the best available provider
202    pub async fn synthesize_answer(
203        &self,
204        query: &str,
205        results: &[RagSearchResult],
206    ) -> Result<LlmResponse> {
207        let provider = self
208            .get_best_provider()
209            .ok_or_else(|| anyhow!("No LLM provider available"))?;
210
211        let start_time = std::time::Instant::now();
212
213        let response = match provider {
214            LlmProvider::OpenAI => self.synthesize_with_openai(query, results).await,
215            LlmProvider::Anthropic => self.synthesize_with_anthropic(query, results).await,
216            LlmProvider::Groq => self.synthesize_with_groq(query, results).await,
217            LlmProvider::OpenRouter => self.synthesize_with_openrouter(query, results).await,
218            LlmProvider::HuggingFace => self.synthesize_with_huggingface(query, results).await,
219            LlmProvider::Zai => self.synthesize_with_zai(query, results).await,
220            LlmProvider::Custom => self.synthesize_with_custom(query, results).await,
221            LlmProvider::Auto => unreachable!(),
222        };
223
224        // If primary provider fails, try fallback providers
225        match response {
226            Ok(mut resp) => {
227                resp.response_time_ms = start_time.elapsed().as_millis() as u64;
228                Ok(resp)
229            }
230            Err(e) => {
231                log::warn!("Primary provider {:?} failed: {}", provider, e);
232                self.try_fallback_providers(query, results, &provider).await
233            }
234        }
235    }
236
237    /// Try fallback providers if primary fails
238    async fn try_fallback_providers(
239        &self,
240        query: &str,
241        results: &[RagSearchResult],
242        failed_provider: &LlmProvider,
243    ) -> Result<LlmResponse> {
244        for provider in &self.config.fallback_providers {
245            if provider != failed_provider && self.is_provider_available(provider) {
246                log::info!("Trying fallback provider: {:?}", provider);
247
248                let start_time = std::time::Instant::now();
249                let response = match provider {
250                    LlmProvider::OpenAI => self.synthesize_with_openai(query, results).await,
251                    LlmProvider::Anthropic => self.synthesize_with_anthropic(query, results).await,
252                    LlmProvider::Groq => self.synthesize_with_groq(query, results).await,
253                    LlmProvider::OpenRouter => {
254                        self.synthesize_with_openrouter(query, results).await
255                    }
256                    LlmProvider::HuggingFace => {
257                        self.synthesize_with_huggingface(query, results).await
258                    }
259                    LlmProvider::Zai => self.synthesize_with_zai(query, results).await,
260                    LlmProvider::Custom => self.synthesize_with_custom(query, results).await,
261                    LlmProvider::Auto => continue,
262                };
263
264                if let Ok(mut resp) = response {
265                    resp.response_time_ms = start_time.elapsed().as_millis() as u64;
266                    return Ok(resp);
267                }
268            }
269        }
270
271        Err(anyhow!("All LLM providers failed"))
272    }
273
274    /// Get the appropriate model name for a provider
275    fn get_model_name(&self, provider: &LlmProvider) -> String {
276        if let Some(model) = &self.config.model_name {
277            return model.clone();
278        }
279
280        match provider {
281            LlmProvider::OpenAI => "gpt-4o-mini".to_string(),
282            LlmProvider::Anthropic => "claude-3-haiku-20240307".to_string(),
283            LlmProvider::Groq => "llama-3.1-8b-instant".to_string(),
284            LlmProvider::OpenRouter => "openai/gpt-3.5-turbo".to_string(),
285            LlmProvider::HuggingFace => "microsoft/DialoGPT-medium".to_string(),
286            LlmProvider::Zai => "glm-4.7".to_string(),
287            LlmProvider::Custom => "custom-model".to_string(),
288            LlmProvider::Auto => "auto".to_string(),
289        }
290    }
291
    /// Create concise system prompt focused on clean, scannable output
    ///
    /// Returns the fixed system message shared by every provider. It
    /// instructs the model to answer only from the supplied sources and to
    /// cite them as `[Source N]` — the marker `extract_citations` scans for.
    fn create_system_prompt(&self) -> String {
        r#"You are a concise technical documentation assistant. Provide clear, scannable answers based ONLY on the provided search results.

RESPONSE FORMAT:
1. **Quick Answer** (1-2 sentences max)
2. **Key Points** (bullet points, max 4 items)  
3. **Code Example** (if available - keep it short and practical)

RULES:
- Be extremely concise and scannable
- Use bullet points and short paragraphs
- Only include essential information
- Cite sources as [Source N] 
- Never add information not in the sources
- Focus on what developers need to know immediately

STYLE:
- Write for busy developers who want quick answers
- Use clear, simple language
- Keep code examples minimal but complete
- Prioritize readability over completeness"#.to_string()
    }
315
316    /// Create user prompt with query and search results
317    fn create_user_prompt(&self, query: &str, results: &[RagSearchResult]) -> String {
318        let mut prompt = format!("Question: {}\n\nSearch Results:\n\n", query);
319
320        for (i, result) in results.iter().enumerate() {
321            prompt.push_str(&format!(
322                "[Source {}] {}\nURL: {}\nContent: {}\n\n",
323                i + 1,
324                result.title.as_ref().unwrap_or(&"Untitled".to_string()),
325                result.source_path.to_string_lossy(),
326                result.content.chars().take(1000).collect::<String>()
327            ));
328        }
329
330        prompt.push_str("\nPlease provide a comprehensive answer based on these search results.");
331        prompt
332    }
333
    /// Extract the actual answer from responses that may contain thinking content (instance)
    ///
    /// Thin instance-method convenience over `Self::extract_final_answer_text`.
    fn extract_final_answer(&self, response_text: &str) -> String {
        Self::extract_final_answer_text(response_text)
    }
338
339    /// Extract the actual answer from responses that may contain thinking content (static)
340    pub(crate) fn extract_final_answer_text(response_text: &str) -> String {
341        // Handle models with thinking capabilities - check for both <thinking> and <think> tags
342        if response_text.contains("<thinking>") && response_text.contains("</thinking>") {
343            // Find the end of the thinking section
344            if let Some(thinking_end) = response_text.find("</thinking>") {
345                let after_thinking = &response_text[thinking_end + "</thinking>".len()..];
346                return after_thinking.trim().to_string();
347            }
348        }
349
350        // Handle models that use <think> tags instead of <thinking>
351        if response_text.contains("<think>") && response_text.contains("</think>") {
352            // Find the end of the think section
353            if let Some(think_end) = response_text.find("</think>") {
354                let after_think = &response_text[think_end + "</think>".len()..];
355                return after_think.trim().to_string();
356            }
357        }
358
359        // Handle models that might use other thinking patterns
360        // Some models use patterns like "Let me think about this..." followed by the actual answer
361        if response_text.starts_with("Let me think") || response_text.starts_with("I need to think")
362        {
363            // Look for common transition phrases that indicate the start of the actual answer
364            let transition_phrases = [
365                "Here's my answer:",
366                "My answer is:",
367                "To answer your question:",
368                "Based on the search results:",
369                "The answer is:",
370                "\n\n**", // Common formatting transition
371                "\n\nQuick Answer:",
372                "\n\n##", // Markdown heading transition
373            ];
374
375            for phrase in &transition_phrases {
376                if let Some(pos) = response_text.find(phrase) {
377                    let answer_start = if phrase.starts_with('\n') {
378                        pos + 2 // Skip the newlines
379                    } else {
380                        pos + phrase.len()
381                    };
382                    return response_text[answer_start..].trim().to_string();
383                }
384            }
385        }
386
387        // For other models or no thinking pattern detected, return the full response
388        response_text.to_string()
389    }
390
391    /// Extract citations from LLM response
392    fn extract_citations(&self, response_text: &str, results: &[RagSearchResult]) -> Vec<Citation> {
393        let mut citations = Vec::new();
394
395        // Simple citation extraction - look for [Source N] patterns
396        for (i, result) in results.iter().enumerate() {
397            let source_ref = format!("[Source {}]", i + 1);
398            if response_text.contains(&source_ref) {
399                citations.push(Citation {
400                    source_id: result.id.clone(),
401                    source_title: result
402                        .title
403                        .clone()
404                        .unwrap_or_else(|| "Untitled".to_string()),
405                    source_url: Some(result.source_path.to_string_lossy().to_string()),
406                    relevance_score: result.score,
407                    excerpt: result.content.chars().take(200).collect(),
408                });
409            }
410        }
411
412        citations
413    }
414
415    /// OpenAI GPT integration with streaming support
416    async fn synthesize_with_openai(
417        &self,
418        query: &str,
419        results: &[RagSearchResult],
420    ) -> Result<LlmResponse> {
421        let api_key = self
422            .config
423            .openai_api_key
424            .as_ref()
425            .ok_or_else(|| anyhow!("OpenAI API key not configured"))?;
426
427        let model = self.get_model_name(&LlmProvider::OpenAI);
428        let system_prompt = self.create_system_prompt();
429        let user_prompt = self.create_user_prompt(query, results);
430
431        let payload = serde_json::json!({
432            "model": model,
433            "messages": [
434                {
435                    "role": "system",
436                    "content": system_prompt
437                },
438                {
439                    "role": "user",
440                    "content": user_prompt
441                }
442            ],
443            "max_tokens": self.config.max_tokens,
444            "temperature": self.config.temperature,
445            "stream": self.config.streaming
446        });
447
448        let response = self
449            .http_client
450            .post("https://api.openai.com/v1/chat/completions")
451            .header("Authorization", format!("Bearer {}", api_key))
452            .header("Content-Type", "application/json")
453            .json(&payload)
454            .send()
455            .await?;
456
457        if !response.status().is_success() {
458            let error_text = response.text().await?;
459            return Err(anyhow!("OpenAI API error: {}", error_text));
460        }
461
462        let response_json: serde_json::Value = response.json().await?;
463
464        let raw_answer = response_json["choices"][0]["message"]["content"]
465            .as_str()
466            .ok_or_else(|| anyhow!("Invalid OpenAI response format"))?;
467        let answer = self.extract_final_answer(raw_answer);
468
469        let usage = &response_json["usage"];
470        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
471        let finish_reason = response_json["choices"][0]["finish_reason"]
472            .as_str()
473            .map(|s| s.to_string());
474
475        let citations = self.extract_citations(&answer, results);
476
477        Ok(LlmResponse {
478            answer,
479            sources_used: results.iter().map(|r| r.id.clone()).collect(),
480            confidence: Some(0.9), // OpenAI typically high confidence
481            provider_used: LlmProvider::OpenAI,
482            model_used: model,
483            tokens_used,
484            response_time_ms: 0, // Will be set by caller
485            finish_reason,
486            citations,
487        })
488    }
489
490    /// Anthropic Claude integration with function calling support
491    async fn synthesize_with_anthropic(
492        &self,
493        query: &str,
494        results: &[RagSearchResult],
495    ) -> Result<LlmResponse> {
496        let api_key = self
497            .config
498            .anthropic_api_key
499            .as_ref()
500            .ok_or_else(|| anyhow!("Anthropic API key not configured"))?;
501
502        let model = self.get_model_name(&LlmProvider::Anthropic);
503        let system_prompt = self.create_system_prompt();
504        let user_prompt = self.create_user_prompt(query, results);
505
506        let payload = serde_json::json!({
507            "model": model,
508            "max_tokens": self.config.max_tokens,
509            "temperature": self.config.temperature,
510            "system": system_prompt,
511            "messages": [
512                {
513                    "role": "user",
514                    "content": user_prompt
515                }
516            ]
517        });
518
519        let response = self
520            .http_client
521            .post("https://api.anthropic.com/v1/messages")
522            .header("x-api-key", api_key)
523            .header("content-type", "application/json")
524            .header("anthropic-version", "2023-06-01")
525            .json(&payload)
526            .send()
527            .await?;
528
529        if !response.status().is_success() {
530            let error_text = response.text().await?;
531            return Err(anyhow!("Anthropic API error: {}", error_text));
532        }
533
534        let response_json: serde_json::Value = response.json().await?;
535
536        let raw_answer = response_json["content"][0]["text"]
537            .as_str()
538            .ok_or_else(|| anyhow!("Invalid Anthropic response format"))?;
539        let answer = self.extract_final_answer(raw_answer);
540
541        let usage = &response_json["usage"];
542        let tokens_used = usage["output_tokens"].as_u64().map(|t| t as u32);
543        let finish_reason = response_json["stop_reason"].as_str().map(|s| s.to_string());
544
545        let citations = self.extract_citations(&answer, results);
546
547        Ok(LlmResponse {
548            answer,
549            sources_used: results.iter().map(|r| r.id.clone()).collect(),
550            confidence: Some(0.85), // Claude typically good confidence
551            provider_used: LlmProvider::Anthropic,
552            model_used: model,
553            tokens_used,
554            response_time_ms: 0,
555            finish_reason,
556            citations,
557        })
558    }
559
560    /// Groq fast inference integration for ultra-fast responses
561    async fn synthesize_with_groq(
562        &self,
563        query: &str,
564        results: &[RagSearchResult],
565    ) -> Result<LlmResponse> {
566        let api_key = self
567            .config
568            .groq_api_key
569            .as_ref()
570            .ok_or_else(|| anyhow!("Groq API key not configured"))?;
571
572        let model = self.get_model_name(&LlmProvider::Groq);
573        let system_prompt = self.create_system_prompt();
574        let user_prompt = self.create_user_prompt(query, results);
575
576        let payload = serde_json::json!({
577            "model": model,
578            "messages": [
579                {
580                    "role": "system",
581                    "content": system_prompt
582                },
583                {
584                    "role": "user",
585                    "content": user_prompt
586                }
587            ],
588            "max_tokens": self.config.max_tokens,
589            "temperature": self.config.temperature,
590            "stream": false
591        });
592
593        let response = self
594            .http_client
595            .post("https://api.groq.com/openai/v1/chat/completions")
596            .header("Authorization", format!("Bearer {}", api_key))
597            .header("Content-Type", "application/json")
598            .json(&payload)
599            .send()
600            .await?;
601
602        if !response.status().is_success() {
603            let status = response.status();
604            let error_text = response.text().await?;
605            log::error!(
606                "Groq API error - Status: {}, Response: {}",
607                status,
608                error_text
609            );
610            return Err(anyhow!("Groq API error ({}): {}", status, error_text));
611        }
612
613        let response_json: serde_json::Value = response.json().await?;
614
615        let raw_answer = response_json["choices"][0]["message"]["content"]
616            .as_str()
617            .ok_or_else(|| anyhow!("Invalid Groq response format"))?;
618        let answer = self.extract_final_answer(raw_answer);
619
620        let usage = &response_json["usage"];
621        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
622        let finish_reason = response_json["choices"][0]["finish_reason"]
623            .as_str()
624            .map(|s| s.to_string());
625
626        let citations = self.extract_citations(&answer, results);
627
628        Ok(LlmResponse {
629            answer,
630            sources_used: results.iter().map(|r| r.id.clone()).collect(),
631            confidence: Some(0.8), // Groq usually good quality
632            provider_used: LlmProvider::Groq,
633            model_used: model,
634            tokens_used,
635            response_time_ms: 0,
636            finish_reason,
637            citations,
638        })
639    }
640
641    /// OpenRouter multi-model gateway for access to multiple providers
642    async fn synthesize_with_openrouter(
643        &self,
644        query: &str,
645        results: &[RagSearchResult],
646    ) -> Result<LlmResponse> {
647        let api_key = self
648            .config
649            .openrouter_api_key
650            .as_ref()
651            .ok_or_else(|| anyhow!("OpenRouter API key not configured"))?;
652
653        let model = self.get_model_name(&LlmProvider::OpenRouter);
654        let system_prompt = self.create_system_prompt();
655        let user_prompt = self.create_user_prompt(query, results);
656
657        let payload = serde_json::json!({
658            "model": model,
659            "messages": [
660                {
661                    "role": "system",
662                    "content": system_prompt
663                },
664                {
665                    "role": "user",
666                    "content": user_prompt
667                }
668            ],
669            "max_tokens": self.config.max_tokens,
670            "temperature": self.config.temperature,
671            "stream": self.config.streaming
672        });
673
674        let response = self
675            .http_client
676            .post("https://openrouter.ai/api/v1/chat/completions")
677            .header("Authorization", format!("Bearer {}", api_key))
678            .header("Content-Type", "application/json")
679            .header("HTTP-Referer", "https://github.com/neur0map/manx")
680            .header("X-Title", "Manx Documentation Finder")
681            .json(&payload)
682            .send()
683            .await?;
684
685        if !response.status().is_success() {
686            let error_text = response.text().await?;
687            return Err(anyhow!("OpenRouter API error: {}", error_text));
688        }
689
690        let response_json: serde_json::Value = response.json().await?;
691
692        let raw_answer = response_json["choices"][0]["message"]["content"]
693            .as_str()
694            .ok_or_else(|| anyhow!("Invalid OpenRouter response format"))?;
695        let answer = self.extract_final_answer(raw_answer);
696
697        let usage = &response_json["usage"];
698        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
699        let finish_reason = response_json["choices"][0]["finish_reason"]
700            .as_str()
701            .map(|s| s.to_string());
702
703        let citations = self.extract_citations(&answer, results);
704
705        Ok(LlmResponse {
706            answer,
707            sources_used: results.iter().map(|r| r.id.clone()).collect(),
708            confidence: Some(0.82), // Varies by underlying model
709            provider_used: LlmProvider::OpenRouter,
710            model_used: model,
711            tokens_used,
712            response_time_ms: 0,
713            finish_reason,
714            citations,
715        })
716    }
717
718    /// HuggingFace Router API for open-source models
719    async fn synthesize_with_huggingface(
720        &self,
721        query: &str,
722        results: &[RagSearchResult],
723    ) -> Result<LlmResponse> {
724        let api_key = self
725            .config
726            .huggingface_api_key
727            .as_ref()
728            .ok_or_else(|| anyhow!("HuggingFace API key not configured"))?;
729
730        let model = self.get_model_name(&LlmProvider::HuggingFace);
731        let system_prompt = self.create_system_prompt();
732        let user_prompt = self.create_user_prompt(query, results);
733
734        // Use OpenAI-compatible chat completions format
735        let payload = serde_json::json!({
736            "model": model,
737            "messages": [
738                {"role": "system", "content": system_prompt},
739                {"role": "user", "content": user_prompt}
740            ],
741            "max_tokens": self.config.max_tokens,
742            "temperature": self.config.temperature
743        });
744
745        let response = self
746            .http_client
747            .post("https://router.huggingface.co/v1/chat/completions")
748            .header("Authorization", format!("Bearer {}", api_key))
749            .header("Content-Type", "application/json")
750            .json(&payload)
751            .send()
752            .await?;
753
754        if !response.status().is_success() {
755            let error_text = response.text().await?;
756            return Err(anyhow!("HuggingFace API error: {}", error_text));
757        }
758
759        let response_json: serde_json::Value = response.json().await?;
760
761        let raw_answer = if let Some(choices) = response_json["choices"].as_array() {
762            if let Some(first_choice) = choices.first() {
763                if let Some(message) = first_choice["message"].as_object() {
764                    message["content"].as_str().unwrap_or("")
765                } else {
766                    return Err(anyhow!(
767                        "Invalid HuggingFace response format: missing message"
768                    ));
769                }
770            } else {
771                return Err(anyhow!(
772                    "Invalid HuggingFace response format: empty choices"
773                ));
774            }
775        } else {
776            return Err(anyhow!(
777                "Invalid HuggingFace response format: missing choices"
778            ));
779        };
780
781        let answer = self.extract_final_answer(raw_answer);
782
783        let citations = self.extract_citations(&answer, results);
784
785        Ok(LlmResponse {
786            answer,
787            sources_used: results.iter().map(|r| r.id.clone()).collect(),
788            confidence: Some(0.75), // Open source models vary
789            provider_used: LlmProvider::HuggingFace,
790            model_used: model,
791            tokens_used: response_json["usage"]["total_tokens"]
792                .as_u64()
793                .map(|t| t as u32),
794            response_time_ms: 0,
795            finish_reason: response_json["choices"][0]["finish_reason"]
796                .as_str()
797                .map(|s| s.to_string()),
798            citations,
799        })
800    }
801
802    /// Z.AI GLM Coding Plan integration
803    async fn synthesize_with_zai(
804        &self,
805        query: &str,
806        results: &[RagSearchResult],
807    ) -> Result<LlmResponse> {
808        let api_key = self
809            .config
810            .zai_api_key
811            .as_ref()
812            .ok_or_else(|| anyhow!("Z.AI API key not configured"))?;
813
814        let model = self.get_model_name(&LlmProvider::Zai);
815        let system_prompt = self.create_system_prompt();
816        let user_prompt = self.create_user_prompt(query, results);
817
818        let payload = serde_json::json!({
819            "model": model.to_uppercase(), // Z.AI expects uppercase model names
820            "messages": [
821                {
822                    "role": "system",
823                    "content": system_prompt
824                },
825                {
826                    "role": "user",
827                    "content": user_prompt
828                }
829            ],
830            "max_tokens": self.config.max_tokens,
831            "temperature": self.config.temperature,
832            "stream": false
833        });
834
835        let response = self
836            .http_client
837            .post("https://api.z.ai/api/coding/paas/v4/chat/completions")
838            .header("Authorization", format!("Bearer {}", api_key))
839            .header("Content-Type", "application/json")
840            .json(&payload)
841            .send()
842            .await?;
843
844        if !response.status().is_success() {
845            let error_text = response.text().await?;
846            return Err(anyhow!("Z.AI API error: {}", error_text));
847        }
848
849        let response_json: serde_json::Value = response.json().await?;
850
851        let raw_answer = response_json["choices"][0]["message"]["content"]
852            .as_str()
853            .ok_or_else(|| anyhow!("Invalid Z.AI response format"))?;
854        let answer = self.extract_final_answer(raw_answer);
855
856        let usage = &response_json["usage"];
857        let tokens_used = usage["total_tokens"].as_u64().map(|t| t as u32);
858        let finish_reason = response_json["choices"][0]["finish_reason"]
859            .as_str()
860            .map(|s| s.to_string());
861
862        let citations = self.extract_citations(&answer, results);
863
864        Ok(LlmResponse {
865            answer,
866            sources_used: results.iter().map(|r| r.id.clone()).collect(),
867            confidence: Some(0.88), // GLM-4.7 is high quality
868            provider_used: LlmProvider::Zai,
869            model_used: model,
870            tokens_used,
871            response_time_ms: 0,
872            finish_reason,
873            citations,
874        })
875    }
876
877    /// Custom endpoint integration for self-hosted models
878    async fn synthesize_with_custom(
879        &self,
880        query: &str,
881        results: &[RagSearchResult],
882    ) -> Result<LlmResponse> {
883        let endpoint = self
884            .config
885            .custom_endpoint
886            .as_ref()
887            .ok_or_else(|| anyhow!("Custom endpoint not configured"))?;
888
889        let model = self.get_model_name(&LlmProvider::Custom);
890        let system_prompt = self.create_system_prompt();
891        let user_prompt = self.create_user_prompt(query, results);
892
893        // Use OpenAI-compatible format for custom endpoints
894        let payload = serde_json::json!({
895            "model": model,
896            "messages": [
897                {
898                    "role": "system",
899                    "content": system_prompt
900                },
901                {
902                    "role": "user",
903                    "content": user_prompt
904                }
905            ],
906            "max_tokens": self.config.max_tokens,
907            "temperature": self.config.temperature,
908            "stream": self.config.streaming
909        });
910
911        let response = self
912            .http_client
913            .post(format!("{}/v1/chat/completions", endpoint))
914            .header("Content-Type", "application/json")
915            .json(&payload)
916            .send()
917            .await?;
918
919        if !response.status().is_success() {
920            let error_text = response.text().await?;
921            return Err(anyhow!("Custom endpoint error: {}", error_text));
922        }
923
924        let response_json: serde_json::Value = response.json().await?;
925
926        let raw_answer = response_json["choices"][0]["message"]["content"]
927            .as_str()
928            .ok_or_else(|| anyhow!("Invalid custom endpoint response format"))?;
929        let answer = self.extract_final_answer(raw_answer);
930
931        let usage = &response_json["usage"];
932        let tokens_used = usage
933            .get("total_tokens")
934            .and_then(|t| t.as_u64())
935            .map(|t| t as u32);
936        let finish_reason = response_json["choices"][0]
937            .get("finish_reason")
938            .and_then(|r| r.as_str())
939            .map(|s| s.to_string());
940
941        let citations = self.extract_citations(&answer, results);
942
943        Ok(LlmResponse {
944            answer,
945            sources_used: results.iter().map(|r| r.id.clone()).collect(),
946            confidence: Some(0.8), // Assume reasonable confidence for custom
947            provider_used: LlmProvider::Custom,
948            model_used: model,
949            tokens_used,
950            response_time_ms: 0,
951            finish_reason,
952            citations,
953        })
954    }
955}
956
#[cfg(test)]
mod tests {
    use super::*;

    // Reasoning models may wrap their chain-of-thought in <thinking> blocks;
    // extraction must drop the block and keep the visible answer intact.
    #[test]
    fn test_extract_final_answer_with_thinking_tags() {
        let raw = r#"<thinking>
Let me analyze this query about Rust error handling.

The user is asking about Result types and how to handle errors properly.
I should explain the basics of Result<T, E> and common patterns.
</thinking>

**Quick Answer**
Rust uses `Result<T, E>` for error handling, where `T` is the success type and `E` is the error type.

**Key Points**
- Use `?` operator for error propagation
- `unwrap()` panics on error, avoid in production
- `expect()` provides custom panic message
- Pattern match with `match` for comprehensive handling"#;

        let cleaned = LlmClient::extract_final_answer_text(raw);

        for tag in &["<thinking>", "</thinking>"] {
            assert!(!cleaned.contains(tag), "tag {} should be stripped", tag);
        }
        assert!(cleaned.contains("**Quick Answer**"));
        assert!(cleaned.contains("Result<T, E>"));
    }

    // Same contract for the shorter <think> tag variant.
    #[test]
    fn test_extract_final_answer_with_think_tags() {
        let raw = r#"<think>
This question is about JavaScript async/await patterns.

The user wants to understand how to handle asynchronous operations.
I should provide clear examples and best practices.
</think>

**Quick Answer**
Use `async/await` for handling asynchronous operations in JavaScript.

**Key Points**
- `async` functions return Promises
- `await` pauses execution until Promise resolves
- Use try/catch for error handling
- Avoid callback hell with Promise chains"#;

        let cleaned = LlmClient::extract_final_answer_text(raw);

        for tag in &["<think>", "</think>"] {
            assert!(!cleaned.contains(tag), "tag {} should be stripped", tag);
        }
        assert!(cleaned.contains("**Quick Answer**"));
        assert!(cleaned.contains("async/await"));
    }

    // A response with no reasoning markup must pass through unchanged.
    #[test]
    fn test_extract_final_answer_without_thinking() {
        let raw = r#"**Quick Answer**
This is a normal response without thinking tags.

**Key Points**
- Point 1
- Point 2"#;

        assert_eq!(LlmClient::extract_final_answer_text(raw), raw);
    }

    // Free-form "Let me think..." preambles (no tags) must also be removed.
    #[test]
    fn test_extract_final_answer_with_thinking_prefix() {
        let raw = r#"Let me think about this question carefully...

I need to consider the different aspects of the query.

Based on the search results:

**Quick Answer**
Here is the actual answer after thinking.

**Key Points**
- Important point 1
- Important point 2"#;

        let cleaned = LlmClient::extract_final_answer_text(raw);

        assert!(!cleaned.contains("Let me think"));
        assert!(cleaned.contains("**Quick Answer**"));
        assert!(cleaned.contains("Here is the actual answer"));
    }
}