// manx_cli/web_search/mod.rs

1//! Intelligent documentation search with DuckDuckGo + semantic embeddings
2//!
3//! This module provides official-first documentation search that:
4//! - Prioritizes official documentation sites by default
5//! - Falls back to trusted community sources with clear notifications
6//! - Uses semantic embeddings for relevance filtering
7//! - Optionally uses LLM for authenticity verification and summarization
8//! - Maintains privacy with anonymous DuckDuckGo searches
9
10use anyhow::{anyhow, Result};
11use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13
14pub mod llm_verifier;
15pub mod official_sources;
16pub mod query_analyzer;
17pub mod result_processor;
18pub mod search_engine;
19
/// Configuration for documentation search
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebSearchConfig {
    /// Master switch; `DocumentationSearchSystem::new` refuses to build when false.
    pub enabled: bool,
    /// Maximum number of results requested per DuckDuckGo query.
    pub max_results: usize,
    /// Minimum embedding similarity for a result to be kept (0.0-1.0 range assumed).
    pub similarity_threshold: f32,
    /// Per-request timeout for the search engine, in seconds.
    pub search_timeout_seconds: u64,
    /// User-Agent header sent with search requests.
    pub user_agent: String,
    pub min_official_results: usize, // Minimum official results before fallback
}
30
31impl Default for WebSearchConfig {
32    fn default() -> Self {
33        Self {
34            enabled: true,
35            max_results: 8,
36            similarity_threshold: 0.6,
37            search_timeout_seconds: 10,
38            user_agent: "Manx/0.3.5 Documentation Finder (+https://github.com/neur0map/manx)"
39                .to_string(),
40            min_official_results: 3,
41        }
42    }
43}
44
/// Raw search result from DuckDuckGo
///
/// Unscored form as returned by `search_engine::search_duckduckgo`, before
/// relevance processing turns it into a `ProcessedSearchResult`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawSearchResult {
    pub title: String,
    pub url: String,
    pub snippet: String,
    /// Domain of `url`; used for official-source detection.
    pub source_domain: String,
    /// Publication timestamp when the engine reports one.
    pub timestamp: Option<DateTime<Utc>>,
}
54
/// Processed search result with relevance scoring
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessedSearchResult {
    pub title: String,
    pub url: String,
    pub snippet: String,
    /// Domain the result came from; used for tiering and source dedup.
    pub source_domain: String,
    /// True when the domain is recognized as an official documentation source.
    pub is_official: bool,
    pub source_tier: u8, // 1=Official docs, 2=Official repos, 3=Trusted community, 4=General
    /// Semantic similarity to the query (text-matching fallback when embeddings are off).
    pub similarity_score: f32,
    pub final_score: f32, // Combined similarity + official boost
    pub timestamp: Option<DateTime<Utc>>,
}
68
/// Final documentation search response
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentationSearchResponse {
    /// The original user query.
    pub query: String,
    /// Short synthesized (LLM) or extracted (fallback) summary of the top results.
    pub summary: String,
    pub results: Vec<ProcessedSearchResult>,
    /// Number of results recognized as official sources, counted after filtering.
    pub official_results_count: usize,
    /// True when the general-web fallback search was used.
    pub used_fallback: bool,
    /// Total raw results fetched before filtering/deduplication.
    pub total_found: usize,
    pub search_time_ms: u64,
    /// Unique source domains represented in `results` (order unspecified).
    pub sources: Vec<String>,
    /// Whether LLM verification ran for this response.
    pub used_llm_verification: bool,
    /// LLM authenticity verdict; `None` when verification did not run.
    pub verification_passed: Option<bool>,
}
83
/// LLM verification response for search authenticity
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerificationResult {
    /// Whether the LLM judged the results authentic for the query.
    pub is_authentic: bool,
    /// Model-reported confidence in the verdict.
    pub confidence: f32,
    /// Free-text rationale from the model.
    pub reasoning: String,
    pub suggested_refinement: Option<String>, // If search should be refined
}
92
/// Documentation search system
///
/// Orchestrates query analysis, DuckDuckGo search, result processing, and
/// optional LLM verification/summarization.
pub struct DocumentationSearchSystem {
    config: WebSearchConfig,
    /// Optional semantic model; when `None`, plain text matching is used.
    embedding_model: Option<crate::rag::embeddings::EmbeddingModel>,
    /// Optional LLM client for query enhancement, verification, and summaries.
    llm_client: Option<crate::rag::llm::LlmClient>,
    official_sources: official_sources::OfficialSourceManager,
    query_analyzer: query_analyzer::QueryAnalyzer,
}
101
102impl DocumentationSearchSystem {
103    /// Create new documentation search system
104    pub async fn new(
105        config: WebSearchConfig,
106        llm_config: Option<crate::rag::llm::LlmConfig>,
107    ) -> Result<Self> {
108        if !config.enabled {
109            return Err(anyhow!("Documentation search is disabled"));
110        }
111
112        // Initialize semantic embeddings for similarity scoring
113        let embedding_model = match crate::rag::embeddings::EmbeddingModel::new().await {
114            Ok(model) => {
115                log::info!("Semantic embeddings initialized for search");
116                Some(model)
117            }
118            Err(e) => {
119                log::warn!(
120                    "Semantic embeddings unavailable, using text matching: {}",
121                    e
122                );
123                None
124            }
125        };
126
127        // Initialize LLM client if configured
128        let llm_client = if let Some(llm_cfg) = llm_config {
129            match crate::rag::llm::LlmClient::new(llm_cfg) {
130                Ok(client) => {
131                    log::info!("LLM client initialized for result verification");
132                    Some(client)
133                }
134                Err(e) => {
135                    log::warn!("LLM client unavailable: {}", e);
136                    None
137                }
138            }
139        } else {
140            None
141        };
142
143        let official_sources = official_sources::OfficialSourceManager::new();
144        let query_analyzer = query_analyzer::QueryAnalyzer::new();
145
146        Ok(Self {
147            config,
148            embedding_model,
149            llm_client,
150            official_sources,
151            query_analyzer,
152        })
153    }
154
    /// Search for documentation with official-first strategy
    ///
    /// Pipeline: (0) analyze the query, (1) run a site-restricted DuckDuckGo
    /// search, (2-3) fall back to an unrestricted search when fewer than
    /// `min_official_results` official hits come back, (4) score, filter, and
    /// dedup the results, (5) optionally verify with the LLM, (6) summarize.
    ///
    /// # Errors
    /// Propagates failures from query analysis, the DuckDuckGo requests,
    /// embedding-based processing, and summary generation. LLM verification
    /// failures are logged and tolerated.
    pub async fn search(&mut self, query: &str) -> Result<DocumentationSearchResponse> {
        let start_time = std::time::Instant::now();

        log::info!("🔍 Searching official documentation for: {}", query);

        // Step 0: Analyze query intent to enhance search strategy
        let query_analysis = self
            .query_analyzer
            .analyze_query(query, self.llm_client.as_ref())
            .await?;
        log::info!(
            "🧠 Query analysis: {} -> {} (confidence: {:.1}%)",
            query_analysis.original_query,
            query_analysis.enhanced_query,
            query_analysis.confidence * 100.0
        );

        // Use enhanced query for better search results
        let search_query = &query_analysis.enhanced_query;

        // Step 1: Apply smart search strategy (only when LLM is available)
        let official_query = if self.llm_client.is_some() {
            match &query_analysis.search_strategy {
                query_analyzer::SearchStrategy::FrameworkSpecific { framework, sites } => {
                    log::info!("🎯 Using LLM-enhanced framework search for {}", framework);
                    self.build_technical_search_query(search_query, sites)
                }
                query_analyzer::SearchStrategy::OfficialDocsFirst { frameworks } => {
                    log::info!(
                        "📚 Using LLM-enhanced prioritized search for: {}",
                        frameworks.join(", ")
                    );
                    self.build_dev_focused_query(search_query, frameworks)
                }
                _ => {
                    if self.is_technical_query(&query_analysis) {
                        log::info!("🔧 Using LLM-enhanced technical search");
                        self.build_dev_focused_query(search_query, &[])
                    } else {
                        self.official_sources.build_official_query(search_query)
                    }
                }
            }
        } else {
            // Default behavior when no LLM - unchanged original functionality
            log::debug!("Using standard search (no LLM configured)");
            self.official_sources.build_official_query(search_query)
        };
        let mut all_results = search_engine::search_duckduckgo(
            &official_query,
            self.config.max_results,
            &self.config.user_agent,
            self.config.search_timeout_seconds,
        )
        .await?;

        let mut used_fallback = false;

        // Step 2: Check if we have enough official results
        let official_results_count = all_results
            .iter()
            .filter(|r| self.official_sources.is_official_domain(&r.source_domain))
            .count();

        // Step 3: Fallback to general search if insufficient official results
        if official_results_count < self.config.min_official_results {
            log::info!(
                "⚠️ Only {} official results found, expanding search...",
                official_results_count
            );
            used_fallback = true;

            // Search without site restrictions.
            // NOTE: deliberately uses the raw `query` here, not the enhanced one.
            let fallback_results = search_engine::search_duckduckgo(
                query,
                self.config.max_results,
                &self.config.user_agent,
                self.config.search_timeout_seconds,
            )
            .await?;

            // Merge results, avoiding duplicates (matched by URL)
            for result in fallback_results {
                if !all_results.iter().any(|r| r.url == result.url) {
                    all_results.push(result);
                }
            }
        }

        // Nothing found anywhere: return an empty but well-formed response.
        if all_results.is_empty() {
            return Ok(DocumentationSearchResponse {
                query: query.to_string(),
                summary: "No relevant documentation found".to_string(),
                results: vec![],
                official_results_count: 0,
                used_fallback: false,
                total_found: 0,
                search_time_ms: start_time.elapsed().as_millis() as u64,
                sources: vec![],
                used_llm_verification: false,
                verification_passed: None,
            });
        }

        // Step 4: Process results with enhanced semantic filtering and query analysis
        let mut processed_results = if let Some(ref embedding_model) = self.embedding_model {
            result_processor::process_with_embeddings_and_analysis(
                &query_analysis,
                &all_results,
                embedding_model,
                &self.official_sources,
                self.config.similarity_threshold,
            )
            .await?
        } else {
            // No embedding model: plain text-matching fallback.
            result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            )
        };

        // Step 4a: Enhance results with additional metadata
        result_processor::enhance_results(&mut processed_results, &self.official_sources);

        // Log tier information for debugging
        for result in &processed_results {
            let tier = self
                .official_sources
                .get_source_tier(&result.source_domain, &result.url);
            log::debug!(
                "Source: {} - Tier: {} - Score: {}",
                result.source_domain,
                self.official_sources.get_tier_description(&tier),
                result.final_score
            );
        }

        // Step 4b: Filter out non-technical domains (LLM-enhanced only)
        processed_results = result_processor::filter_non_technical_domains(
            processed_results,
            &query_analysis,
            self.llm_client.is_some(),
        );

        // Step 4c: Filter out low-quality results (quality floor of 30)
        processed_results = result_processor::filter_quality_results(processed_results, 30);

        // Step 4d: Remove duplicates
        let processed_results = result_processor::deduplicate_results(processed_results);

        // Step 5: LLM verification if available; failures are logged, not fatal
        let verification_result = if let Some(ref llm_client) = self.llm_client {
            if llm_client.is_available() {
                log::info!("Verifying results with LLM");
                match llm_verifier::verify_search_results(query, &processed_results, llm_client)
                    .await
                {
                    Ok(verification) => Some(verification),
                    Err(e) => {
                        log::warn!("LLM verification failed: {}", e);
                        None
                    }
                }
            } else {
                None
            }
        } else {
            None
        };

        // Step 6: Generate summary
        let summary = self.generate_summary(query, &processed_results).await?;

        // Calculate final stats
        let final_official_count = processed_results.iter().filter(|r| r.is_official).count();

        // Unique source domains (HashSet round-trip drops duplicates; order is
        // therefore unspecified)
        let sources: Vec<String> = processed_results
            .iter()
            .map(|r| r.source_domain.clone())
            .collect::<std::collections::HashSet<_>>()
            .into_iter()
            .collect();

        let search_time = start_time.elapsed().as_millis() as u64;

        Ok(DocumentationSearchResponse {
            query: query.to_string(),
            summary,
            results: processed_results,
            official_results_count: final_official_count,
            used_fallback,
            total_found: all_results.len(),
            search_time_ms: search_time,
            sources,
            used_llm_verification: verification_result.is_some(),
            verification_passed: verification_result.as_ref().map(|v| v.is_authentic),
        })
    }
355
356    /// Generate concise summary without AI fluff
357    async fn generate_summary(
358        &self,
359        query: &str,
360        results: &[ProcessedSearchResult],
361    ) -> Result<String> {
362        if results.is_empty() {
363            return Ok("No relevant documentation found".to_string());
364        }
365
366        // Use LLM for intelligent summarization if available
367        if let Some(ref llm_client) = self.llm_client {
368            if llm_client.is_available() {
369                let _context = results
370                    .iter()
371                    .take(3) // Top 3 most relevant
372                    .map(|r| {
373                        format!(
374                            "Source: {} ({})\nContent: {}",
375                            r.source_domain,
376                            if r.is_official {
377                                "Official"
378                            } else {
379                                "Community"
380                            },
381                            r.snippet
382                        )
383                    })
384                    .collect::<Vec<_>>()
385                    .join("\n\n");
386
387                // Create mock search results for LLM synthesis
388                let mock_results: Vec<crate::rag::RagSearchResult> = results
389                    .iter()
390                    .take(3)
391                    .map(|r| crate::rag::RagSearchResult {
392                        id: r.url.clone(),
393                        content: r.snippet.clone(),
394                        source_path: std::path::PathBuf::from(&r.url),
395                        source_type: if r.is_official {
396                            crate::rag::SourceType::Curated
397                        } else {
398                            crate::rag::SourceType::Remote
399                        },
400                        title: Some(r.title.clone()),
401                        section: None,
402                        score: r.final_score,
403                        metadata: crate::rag::DocumentMetadata {
404                            file_type: "web".to_string(),
405                            size: r.snippet.len() as u64,
406                            modified: r.timestamp.unwrap_or_else(chrono::Utc::now),
407                            tags: vec!["documentation".to_string()],
408                            language: Some("en".to_string()),
409                        },
410                    })
411                    .collect();
412
413                match llm_client.synthesize_answer(query, &mock_results).await {
414                    Ok(response) => return Ok(response.answer),
415                    Err(e) => log::warn!("LLM summarization failed, using fallback: {}", e),
416                }
417            }
418        }
419
420        // Fallback: Generate summary from top results
421        let official_count = results.iter().filter(|r| r.is_official).count();
422        let summary_prefix = if official_count > 0 {
423            format!("From {} official sources", official_count)
424        } else {
425            "From community sources".to_string()
426        };
427
428        let top_content = results
429            .iter()
430            .take(2)
431            .map(|r| r.snippet.split('.').next().unwrap_or(&r.snippet))
432            .collect::<Vec<_>>()
433            .join(". ");
434
435        Ok(format!("{}: {}", summary_prefix, top_content))
436    }
437
    /// Check if system is ready for searches
    ///
    /// Reflects only the `enabled` flag; optional back-ends (embeddings, LLM)
    /// are handled per-call with graceful fallbacks.
    pub fn is_available(&self) -> bool {
        self.config.enabled
    }
442
    /// Get configuration
    ///
    /// Borrows the active search configuration for inspection.
    pub fn config(&self) -> &WebSearchConfig {
        &self.config
    }
447
448    /// Build technical search query prioritizing dev docs, GitHub, StackOverflow
449    fn build_technical_search_query(&self, query: &str, framework_sites: &[String]) -> String {
450        let dev_domains = [
451            "github.com",
452            "stackoverflow.com",
453            "docs.rs",
454            "developer.mozilla.org",
455            "reactjs.org",
456            "nodejs.org",
457            "python.org",
458            "rust-lang.org",
459            "tauri.app",
460            "electronjs.org",
461            "dev.to",
462            "medium.com/@",
463        ];
464
465        // Combine framework-specific sites with general dev domains
466        let mut all_sites = framework_sites.to_vec();
467        all_sites.extend(dev_domains.iter().map(|s| s.to_string()));
468
469        // Remove duplicates
470        all_sites.sort();
471        all_sites.dedup();
472
473        // Build site-restricted query
474        let site_filters: String = all_sites
475            .iter()
476            .map(|site| format!("site:{}", site))
477            .collect::<Vec<_>>()
478            .join(" OR ");
479
480        format!("({}) {}", site_filters, query)
481    }
482
483    /// Build developer-focused query with technical domain prioritization
484    fn build_dev_focused_query(&self, query: &str, frameworks: &[String]) -> String {
485        let mut dev_query = query.to_string();
486
487        // Add framework-specific terms to boost relevance
488        for framework in frameworks {
489            if !dev_query.to_lowercase().contains(&framework.to_lowercase()) {
490                dev_query = format!("{} {}", framework, dev_query);
491            }
492        }
493
494        // Technical domains to prioritize
495        let tech_domains = [
496            "site:github.com",
497            "site:stackoverflow.com",
498            "site:docs.rs",
499            "site:developer.mozilla.org",
500            "site:dev.to",
501        ];
502
503        // Add technical domain boost (not exclusive, just prioritized)
504        format!("({}) OR {}", tech_domains.join(" OR "), dev_query)
505    }
506
507    /// Check if query is technical based on analysis
508    fn is_technical_query(&self, analysis: &query_analyzer::QueryAnalysis) -> bool {
509        // Technical indicators
510        !analysis.detected_frameworks.is_empty()
511            || analysis
512                .domain_context
513                .primary_domain
514                .contains("development")
515            || analysis
516                .domain_context
517                .primary_domain
518                .contains("programming")
519            || analysis.query_type == query_analyzer::QueryType::Reference
520            || analysis.original_query.to_lowercase().contains("api")
521            || analysis.original_query.to_lowercase().contains("code")
522            || analysis.original_query.to_lowercase().contains("library")
523            || analysis.original_query.to_lowercase().contains("function")
524            || analysis.original_query.to_lowercase().contains("method")
525            || analysis.original_query.to_lowercase().contains("component")
526    }
527}