manx_cli/web_search/
mod.rs

1//! Intelligent documentation search with DuckDuckGo + semantic embeddings
2//!
3//! This module provides official-first documentation search that:
4//! - Prioritizes official documentation sites by default
5//! - Falls back to trusted community sources with clear notifications
6//! - Uses semantic embeddings for relevance filtering
7//! - Optionally uses LLM for authenticity verification and summarization
8//! - Maintains privacy with anonymous DuckDuckGo searches
9
10use anyhow::{anyhow, Result};
11use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13
14pub mod llm_verifier;
15pub mod official_sources;
16pub mod result_processor;
17pub mod search_engine;
18
/// Configuration for documentation search
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebSearchConfig {
    /// Master switch; `DocumentationSearchSystem::new` refuses to construct when false.
    pub enabled: bool,
    /// Maximum number of results requested per DuckDuckGo query.
    pub max_results: usize,
    /// Similarity cut-off handed to the embedding-based result processor.
    pub similarity_threshold: f32,
    /// Per-request timeout for the search engine, in seconds.
    pub search_timeout_seconds: u64,
    /// User-Agent header sent with search requests.
    pub user_agent: String,
    pub min_official_results: usize, // Minimum official results before fallback
}
29
30impl Default for WebSearchConfig {
31    fn default() -> Self {
32        Self {
33            enabled: true,
34            max_results: 8,
35            similarity_threshold: 0.6,
36            search_timeout_seconds: 10,
37            user_agent: "Manx/0.3.5 Documentation Finder (+https://github.com/neur0map/manx)"
38                .to_string(),
39            min_official_results: 3,
40        }
41    }
42}
43
/// Raw search result from DuckDuckGo
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawSearchResult {
    /// Result title as returned by the search engine.
    pub title: String,
    /// Full URL of the hit; used as the de-duplication key when merging searches.
    pub url: String,
    /// Short text excerpt returned by the engine.
    pub snippet: String,
    /// Domain the result came from; checked against the official-source lists.
    pub source_domain: String,
    /// Result timestamp, when the engine provides one.
    pub timestamp: Option<DateTime<Utc>>,
}
53
/// Processed search result with relevance scoring
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessedSearchResult {
    /// Result title carried over from the raw hit.
    pub title: String,
    /// Full URL; also reused as a synthetic source path for LLM synthesis.
    pub url: String,
    /// Excerpt used for scoring and summary generation.
    pub snippet: String,
    /// Domain of the result, as classified by `OfficialSourceManager`.
    pub source_domain: String,
    /// True when the domain is recognized as an official documentation source.
    pub is_official: bool,
    pub source_tier: u8, // 1=Official docs, 2=Official repos, 3=Trusted community, 4=General
    /// Relevance to the query (semantic embedding or text-match based).
    pub similarity_score: f32,
    pub final_score: f32, // Combined similarity + official boost
    /// Result timestamp, when available.
    pub timestamp: Option<DateTime<Utc>>,
}
67
/// Final documentation search response
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentationSearchResponse {
    /// The original user query.
    pub query: String,
    /// Human-readable summary: LLM-synthesized when available, otherwise
    /// assembled from the top result snippets.
    pub summary: String,
    /// Ranked, quality-filtered, de-duplicated results.
    pub results: Vec<ProcessedSearchResult>,
    /// Number of official-source results after processing.
    pub official_results_count: usize,
    /// True when the general (non-site-restricted) fallback search was run.
    pub used_fallback: bool,
    /// Raw result count before quality filtering and de-duplication.
    pub total_found: usize,
    /// Wall-clock duration of the whole search, in milliseconds.
    pub search_time_ms: u64,
    /// Unique source domains represented in `results`.
    pub sources: Vec<String>,
    /// True when LLM verification ran and returned a verdict.
    pub used_llm_verification: bool,
    /// LLM authenticity verdict, when verification ran.
    pub verification_passed: Option<bool>,
}
82
/// LLM verification response for search authenticity
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerificationResult {
    /// LLM's authenticity verdict for the result set; surfaced to callers
    /// as `DocumentationSearchResponse::verification_passed`.
    pub is_authentic: bool,
    /// Confidence in the verdict (scale defined by the verifier — see `llm_verifier`).
    pub confidence: f32,
    /// Free-text explanation produced by the LLM.
    pub reasoning: String,
    pub suggested_refinement: Option<String>, // If search should be refined
}
91
/// Documentation search system
///
/// Orchestrates DuckDuckGo queries, official-source ranking, optional
/// semantic-embedding relevance filtering, and optional LLM verification.
pub struct DocumentationSearchSystem {
    /// Search behavior settings.
    config: WebSearchConfig,
    /// Optional embedding model; `None` means plain text matching is used.
    embedding_model: Option<crate::rag::embeddings::EmbeddingModel>,
    /// Optional LLM client for result verification and summarization.
    llm_client: Option<crate::rag::llm::LlmClient>,
    /// Classifier for official vs. community domains and source tiers.
    official_sources: official_sources::OfficialSourceManager,
}
99
100impl DocumentationSearchSystem {
101    /// Create new documentation search system
102    pub async fn new(
103        config: WebSearchConfig,
104        llm_config: Option<crate::rag::llm::LlmConfig>,
105    ) -> Result<Self> {
106        if !config.enabled {
107            return Err(anyhow!("Documentation search is disabled"));
108        }
109
110        // Initialize semantic embeddings for similarity scoring
111        let embedding_model = match crate::rag::embeddings::EmbeddingModel::new().await {
112            Ok(model) => {
113                log::info!("Semantic embeddings initialized for search");
114                Some(model)
115            }
116            Err(e) => {
117                log::warn!(
118                    "Semantic embeddings unavailable, using text matching: {}",
119                    e
120                );
121                None
122            }
123        };
124
125        // Initialize LLM client if configured
126        let llm_client = if let Some(llm_cfg) = llm_config {
127            match crate::rag::llm::LlmClient::new(llm_cfg) {
128                Ok(client) => {
129                    log::info!("LLM client initialized for result verification");
130                    Some(client)
131                }
132                Err(e) => {
133                    log::warn!("LLM client unavailable: {}", e);
134                    None
135                }
136            }
137        } else {
138            None
139        };
140
141        let official_sources = official_sources::OfficialSourceManager::new();
142
143        Ok(Self {
144            config,
145            embedding_model,
146            llm_client,
147            official_sources,
148        })
149    }
150
151    /// Search for documentation with official-first strategy
152    pub async fn search(&mut self, query: &str) -> Result<DocumentationSearchResponse> {
153        let start_time = std::time::Instant::now();
154
155        log::info!("🔍 Searching official documentation for: {}", query);
156
157        // Step 1: Search official documentation sites first
158        let official_query = self.official_sources.build_official_query(query);
159        let mut all_results = search_engine::search_duckduckgo(
160            &official_query,
161            self.config.max_results,
162            &self.config.user_agent,
163            self.config.search_timeout_seconds,
164        )
165        .await?;
166
167        let mut used_fallback = false;
168
169        // Step 2: Check if we have enough official results
170        let official_results_count = all_results
171            .iter()
172            .filter(|r| self.official_sources.is_official_domain(&r.source_domain))
173            .count();
174
175        // Step 3: Fallback to general search if insufficient official results
176        if official_results_count < self.config.min_official_results {
177            log::info!(
178                "⚠️ Only {} official results found, expanding search...",
179                official_results_count
180            );
181            used_fallback = true;
182
183            // Search without site restrictions
184            let fallback_results = search_engine::search_duckduckgo(
185                query,
186                self.config.max_results,
187                &self.config.user_agent,
188                self.config.search_timeout_seconds,
189            )
190            .await?;
191
192            // Merge results, avoiding duplicates
193            for result in fallback_results {
194                if !all_results.iter().any(|r| r.url == result.url) {
195                    all_results.push(result);
196                }
197            }
198        }
199
200        if all_results.is_empty() {
201            return Ok(DocumentationSearchResponse {
202                query: query.to_string(),
203                summary: "No relevant documentation found".to_string(),
204                results: vec![],
205                official_results_count: 0,
206                used_fallback: false,
207                total_found: 0,
208                search_time_ms: start_time.elapsed().as_millis() as u64,
209                sources: vec![],
210                used_llm_verification: false,
211                verification_passed: None,
212            });
213        }
214
215        // Step 4: Process results with semantic filtering and official source ranking
216        let mut processed_results = if let Some(ref embedding_model) = self.embedding_model {
217            result_processor::process_with_embeddings(
218                query,
219                &all_results,
220                embedding_model,
221                &self.official_sources,
222                self.config.similarity_threshold,
223            )
224            .await?
225        } else {
226            result_processor::process_without_embeddings(
227                query,
228                &all_results,
229                &self.official_sources,
230            )
231        };
232
233        // Step 4a: Enhance results with additional metadata
234        result_processor::enhance_results(&mut processed_results, &self.official_sources);
235
236        // Log tier information for debugging
237        for result in &processed_results {
238            let tier = self
239                .official_sources
240                .get_source_tier(&result.source_domain, &result.url);
241            log::debug!(
242                "Source: {} - Tier: {} - Score: {}",
243                result.source_domain,
244                self.official_sources.get_tier_description(&tier),
245                result.final_score
246            );
247        }
248
249        // Step 4b: Filter out low-quality results
250        processed_results = result_processor::filter_quality_results(processed_results, 30);
251
252        // Step 4c: Remove duplicates
253        let processed_results = result_processor::deduplicate_results(processed_results);
254
255        // Step 5: LLM verification if available
256        let verification_result = if let Some(ref llm_client) = self.llm_client {
257            if llm_client.is_available() {
258                log::info!("Verifying results with LLM");
259                match llm_verifier::verify_search_results(query, &processed_results, llm_client)
260                    .await
261                {
262                    Ok(verification) => Some(verification),
263                    Err(e) => {
264                        log::warn!("LLM verification failed: {}", e);
265                        None
266                    }
267                }
268            } else {
269                None
270            }
271        } else {
272            None
273        };
274
275        // Step 6: Generate summary
276        let summary = self.generate_summary(query, &processed_results).await?;
277
278        // Calculate final stats
279        let final_official_count = processed_results.iter().filter(|r| r.is_official).count();
280
281        let sources: Vec<String> = processed_results
282            .iter()
283            .map(|r| r.source_domain.clone())
284            .collect::<std::collections::HashSet<_>>()
285            .into_iter()
286            .collect();
287
288        let search_time = start_time.elapsed().as_millis() as u64;
289
290        Ok(DocumentationSearchResponse {
291            query: query.to_string(),
292            summary,
293            results: processed_results,
294            official_results_count: final_official_count,
295            used_fallback,
296            total_found: all_results.len(),
297            search_time_ms: search_time,
298            sources,
299            used_llm_verification: verification_result.is_some(),
300            verification_passed: verification_result.as_ref().map(|v| v.is_authentic),
301        })
302    }
303
304    /// Generate concise summary without AI fluff
305    async fn generate_summary(
306        &self,
307        query: &str,
308        results: &[ProcessedSearchResult],
309    ) -> Result<String> {
310        if results.is_empty() {
311            return Ok("No relevant documentation found".to_string());
312        }
313
314        // Use LLM for intelligent summarization if available
315        if let Some(ref llm_client) = self.llm_client {
316            if llm_client.is_available() {
317                let _context = results
318                    .iter()
319                    .take(3) // Top 3 most relevant
320                    .map(|r| {
321                        format!(
322                            "Source: {} ({})\nContent: {}",
323                            r.source_domain,
324                            if r.is_official {
325                                "Official"
326                            } else {
327                                "Community"
328                            },
329                            r.snippet
330                        )
331                    })
332                    .collect::<Vec<_>>()
333                    .join("\n\n");
334
335                // Create mock search results for LLM synthesis
336                let mock_results: Vec<crate::rag::RagSearchResult> = results
337                    .iter()
338                    .take(3)
339                    .map(|r| crate::rag::RagSearchResult {
340                        id: r.url.clone(),
341                        content: r.snippet.clone(),
342                        source_path: std::path::PathBuf::from(&r.url),
343                        source_type: if r.is_official {
344                            crate::rag::SourceType::Curated
345                        } else {
346                            crate::rag::SourceType::Remote
347                        },
348                        title: Some(r.title.clone()),
349                        section: None,
350                        score: r.final_score,
351                        metadata: crate::rag::DocumentMetadata {
352                            file_type: "web".to_string(),
353                            size: r.snippet.len() as u64,
354                            modified: r.timestamp.unwrap_or_else(chrono::Utc::now),
355                            tags: vec!["documentation".to_string()],
356                            language: Some("en".to_string()),
357                        },
358                    })
359                    .collect();
360
361                match llm_client.synthesize_answer(query, &mock_results).await {
362                    Ok(response) => return Ok(response.answer),
363                    Err(e) => log::warn!("LLM summarization failed, using fallback: {}", e),
364                }
365            }
366        }
367
368        // Fallback: Generate summary from top results
369        let official_count = results.iter().filter(|r| r.is_official).count();
370        let summary_prefix = if official_count > 0 {
371            format!("From {} official sources", official_count)
372        } else {
373            "From community sources".to_string()
374        };
375
376        let top_content = results
377            .iter()
378            .take(2)
379            .map(|r| r.snippet.split('.').next().unwrap_or(&r.snippet))
380            .collect::<Vec<_>>()
381            .join(". ");
382
383        Ok(format!("{}: {}", summary_prefix, top_content))
384    }
385
386    /// Check if system is ready for searches
387    pub fn is_available(&self) -> bool {
388        self.config.enabled
389    }
390
391    /// Get configuration
392    pub fn config(&self) -> &WebSearchConfig {
393        &self.config
394    }
395}