manx_cli/web_search/mod.rs

1//! Intelligent documentation search with DuckDuckGo + semantic embeddings
2//!
3//! This module provides official-first documentation search that:
4//! - Prioritizes official documentation sites by default
5//! - Falls back to trusted community sources with clear notifications
6//! - Uses semantic embeddings for relevance filtering
7//! - Optionally uses LLM for authenticity verification and summarization
8//! - Maintains privacy with anonymous DuckDuckGo searches
9
10use anyhow::{anyhow, Result};
11use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13
14pub mod llm_verifier;
15pub mod official_sources;
16pub mod query_analyzer;
17pub mod result_processor;
18pub mod search_engine;
19
/// Configuration for documentation search
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebSearchConfig {
    /// Master switch; `DocumentationSearchSystem::new` refuses to construct when false.
    pub enabled: bool,
    /// Maximum number of results requested per DuckDuckGo query.
    pub max_results: usize,
    /// Minimum embedding similarity a result must reach to survive semantic filtering.
    pub similarity_threshold: f32,
    /// Per-request timeout passed to the search engine, in seconds.
    pub search_timeout_seconds: u64,
    /// User-Agent string sent with every search request.
    pub user_agent: String,
    pub min_official_results: usize, // Minimum official results before fallback
}
30
31impl Default for WebSearchConfig {
32    fn default() -> Self {
33        Self {
34            enabled: true,
35            max_results: 8,
36            similarity_threshold: 0.6,
37            search_timeout_seconds: 10,
38            user_agent: "Manx/0.3.5 Documentation Finder (+https://github.com/neur0map/manx)"
39                .to_string(),
40            min_official_results: 3,
41        }
42    }
43}
44
/// Raw search result from DuckDuckGo
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawSearchResult {
    /// Result title as returned by the search engine.
    pub title: String,
    /// Full URL of the result page.
    pub url: String,
    /// Short text excerpt accompanying the result.
    pub snippet: String,
    /// Domain the result came from; used for official-source classification.
    pub source_domain: String,
    /// Timestamp for the result, when the engine provides one.
    pub timestamp: Option<DateTime<Utc>>,
}
54
/// Processed search result with relevance scoring
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessedSearchResult {
    /// Result title carried over from the raw result.
    pub title: String,
    /// Full URL of the result page.
    pub url: String,
    /// Text excerpt carried over from the raw result.
    pub snippet: String,
    /// Domain the result came from.
    pub source_domain: String,
    /// True when the domain is recognized as an official documentation source.
    pub is_official: bool,
    pub source_tier: u8, // 1=Official docs, 2=Official repos, 3=Trusted community, 4=General
    /// Semantic (or text-match) similarity between the query and this result.
    pub similarity_score: f32,
    pub final_score: f32, // Combined similarity + official boost
    /// Timestamp for the result, when available.
    pub timestamp: Option<DateTime<Utc>>,
}
68
/// Final documentation search response
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentationSearchResponse {
    /// Original query string as supplied by the caller.
    pub query: String,
    /// Generated summary (LLM synthesis when available, snippet stitch otherwise).
    pub summary: String,
    /// Filtered, scored, de-duplicated results.
    pub results: Vec<ProcessedSearchResult>,
    /// How many of `results` come from official sources.
    pub official_results_count: usize,
    /// True when the unrestricted fallback search had to be used.
    pub used_fallback: bool,
    /// Count of raw merged search hits (before filtering), not `results.len()`.
    pub total_found: usize,
    /// Wall-clock duration of the whole search, in milliseconds.
    pub search_time_ms: u64,
    /// Unique source domains represented in `results` (unordered).
    pub sources: Vec<String>,
    /// True when LLM verification was attempted and succeeded in producing a verdict.
    pub used_llm_verification: bool,
    /// Verdict of LLM verification; `None` when verification did not run or failed.
    pub verification_passed: Option<bool>,
}
83
/// LLM verification response for search authenticity
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerificationResult {
    /// Whether the LLM judged the result set authentic for the query.
    pub is_authentic: bool,
    /// Confidence the LLM reported for its verdict.
    pub confidence: f32,
    /// Free-form explanation of the verdict.
    pub reasoning: String,
    pub suggested_refinement: Option<String>, // If search should be refined
}
92
/// Documentation search system
///
/// Orchestrates query analysis, DuckDuckGo search, result processing, and
/// optional LLM verification/summarization.
pub struct DocumentationSearchSystem {
    /// Search tuning parameters.
    config: WebSearchConfig,
    /// Optional embedding model; `None` degrades scoring to text matching.
    embedding_model: Option<crate::rag::embeddings::EmbeddingModel>,
    /// Optional LLM client for query analysis, verification, and summaries.
    llm_client: Option<crate::rag::llm::LlmClient>,
    /// Classifies domains as official and builds site-restricted queries.
    official_sources: official_sources::OfficialSourceManager,
    /// Analyzes query intent and selects a search strategy.
    query_analyzer: query_analyzer::QueryAnalyzer,
}
101
102impl DocumentationSearchSystem {
103    /// Create new documentation search system
104    pub async fn new(
105        config: WebSearchConfig,
106        llm_config: Option<crate::rag::llm::LlmConfig>,
107        embedding_config: Option<crate::rag::EmbeddingConfig>,
108    ) -> Result<Self> {
109        if !config.enabled {
110            return Err(anyhow!("Documentation search is disabled"));
111        }
112
113        // Initialize semantic embeddings for similarity scoring
114        let embedding_model = match match &embedding_config {
115            Some(cfg) => crate::rag::embeddings::EmbeddingModel::new_with_config(cfg.clone()).await,
116            None => crate::rag::embeddings::EmbeddingModel::new().await,
117        } {
118            Ok(model) => {
119                log::info!("Semantic embeddings initialized for search");
120                Some(model)
121            }
122            Err(e) => {
123                log::warn!(
124                    "Semantic embeddings unavailable, using text matching: {}",
125                    e
126                );
127                None
128            }
129        };
130
131        // Initialize LLM client if configured
132        let llm_client = if let Some(llm_cfg) = llm_config {
133            match crate::rag::llm::LlmClient::new(llm_cfg) {
134                Ok(client) => {
135                    log::info!("LLM client initialized for result verification");
136                    Some(client)
137                }
138                Err(e) => {
139                    log::warn!("LLM client unavailable: {}", e);
140                    None
141                }
142            }
143        } else {
144            None
145        };
146
147        let official_sources = official_sources::OfficialSourceManager::new();
148        let query_analyzer = query_analyzer::QueryAnalyzer::new();
149
150        Ok(Self {
151            config,
152            embedding_model,
153            llm_client,
154            official_sources,
155            query_analyzer,
156        })
157    }
158
    /// Search for documentation with official-first strategy
    ///
    /// Pipeline:
    /// 0. Analyze query intent (LLM-assisted when a client is configured).
    /// 1. Build and run a site-restricted, official/technical-first query.
    /// 2-3. Fall back to an unrestricted search when official coverage is thin.
    /// 4. Score, filter, and de-duplicate results — embeddings when available,
    ///    with progressively softer text-matching fallbacks if filters empty
    ///    the result set.
    /// 5. Optionally verify authenticity via the LLM (best-effort).
    /// 6. Summarize and assemble response statistics.
    pub async fn search(&mut self, query: &str) -> Result<DocumentationSearchResponse> {
        let start_time = std::time::Instant::now();

        log::info!("🔍 Searching official documentation for: {}", query);

        // Step 0: Analyze query intent to enhance search strategy
        let query_analysis = self
            .query_analyzer
            .analyze_query(query, self.llm_client.as_ref())
            .await?;
        log::info!(
            "🧠 Query analysis: {} -> {} (confidence: {:.1}%)",
            query_analysis.original_query,
            query_analysis.enhanced_query,
            query_analysis.confidence * 100.0
        );

        // Use enhanced query for better search results
        let search_query = &query_analysis.enhanced_query;
        // Prefer keeping key phrases intact (e.g., "hello world").
        // Returns an explicitly quoted phrase if the query contains one,
        // otherwise the first two non-stopword words; None when neither exists.
        fn extract_key_phrase(q: &str) -> Option<String> {
            let q = q.to_lowercase();
            if let Some(start) = q.find('"') {
                if let Some(end_rel) = q[start + 1..].find('"') {
                    let end = start + 1 + end_rel;
                    let phrase = &q[start + 1..end];
                    if !phrase.trim().is_empty() {
                        return Some(phrase.trim().to_string());
                    }
                }
            }
            // Fallback: first two content words (basic stopword filter)
            let stop: std::collections::HashSet<&str> = [
                "a", "an", "and", "the", "in", "on", "of", "to", "for", "how", "do", "i", "with",
                "using", "is", "are", "be", "this", "that", "it", "from", "by", "into", "as",
            ]
            .into_iter()
            .collect();
            let content: Vec<&str> = q
                .split_whitespace()
                .filter(|w| !stop.contains(*w))
                .collect();
            if content.len() >= 2 {
                Some(format!("{} {}", content[0], content[1]))
            } else {
                None
            }
        }
        // Anchor the search on the extracted phrase (quoted) plus the enhanced query.
        let phrase_query = if let Some(p) = extract_key_phrase(&query_analysis.original_query) {
            format!("\"{}\" {}", p, search_query)
        } else {
            search_query.to_string()
        };

        // Step 1: Apply smart search strategy (only when LLM is available)
        let official_query = if self.llm_client.is_some() {
            match &query_analysis.search_strategy {
                query_analyzer::SearchStrategy::FrameworkSpecific { framework, sites } => {
                    log::info!("🎯 Using LLM-enhanced framework search for {}", framework);
                    self.build_technical_search_query(&phrase_query, sites)
                }
                query_analyzer::SearchStrategy::OfficialDocsFirst { frameworks } => {
                    log::info!(
                        "📚 Using LLM-enhanced prioritized search for: {}",
                        frameworks.join(", ")
                    );
                    self.build_dev_focused_query(&phrase_query, frameworks)
                }
                _ => {
                    if self.is_technical_query(&query_analysis) {
                        log::info!("🔧 Using LLM-enhanced technical search");
                        self.build_dev_focused_query(&phrase_query, &[])
                    } else {
                        self.official_sources.build_official_query(&phrase_query)
                    }
                }
            }
        } else {
            // Default behavior when no LLM - unchanged original functionality
            log::debug!("Using standard search (no LLM configured)");
            self.official_sources.build_official_query(&phrase_query)
        };
        // Primary (restricted) search.
        let mut all_results = search_engine::search_duckduckgo(
            &official_query,
            self.config.max_results,
            &self.config.user_agent,
            self.config.search_timeout_seconds,
        )
        .await?;

        let mut used_fallback = false;

        // Step 2: Check if we have enough official results
        let official_results_count = all_results
            .iter()
            .filter(|r| self.official_sources.is_official_domain(&r.source_domain))
            .count();

        // Step 3: Fallback to general search if insufficient official results
        if official_results_count < self.config.min_official_results {
            log::info!(
                "⚠️ Only {} official results found, expanding search...",
                official_results_count
            );
            used_fallback = true;

            // Search without site restrictions
            let fallback_results = search_engine::search_duckduckgo(
                &phrase_query,
                self.config.max_results,
                &self.config.user_agent,
                self.config.search_timeout_seconds,
            )
            .await?;

            // Merge results, avoiding duplicates (matched by exact URL)
            for result in fallback_results {
                if !all_results.iter().any(|r| r.url == result.url) {
                    all_results.push(result);
                }
            }
        }

        // Nothing found at all: return an empty but well-formed response.
        if all_results.is_empty() {
            return Ok(DocumentationSearchResponse {
                query: query.to_string(),
                summary: "No relevant documentation found".to_string(),
                results: vec![],
                official_results_count: 0,
                used_fallback: false,
                total_found: 0,
                search_time_ms: start_time.elapsed().as_millis() as u64,
                sources: vec![],
                used_llm_verification: false,
                verification_passed: None,
            });
        }

        // Step 4: Process results with enhanced semantic filtering and query analysis
        let mut processed_results = if let Some(ref embedding_model) = self.embedding_model {
            result_processor::process_with_embeddings_and_analysis(
                &query_analysis,
                &all_results,
                embedding_model,
                &self.official_sources,
                self.config.similarity_threshold,
            )
            .await?
        } else {
            result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            )
        };

        // Fallback: if nothing survived filtering, retry with text matching (softer) before final filters
        if processed_results.is_empty() {
            log::info!("No results after semantic filtering; retrying with text matching fallback");
            processed_results = result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            );
        }

        // Step 4a: Enhance results with additional metadata
        result_processor::enhance_results(&mut processed_results, &self.official_sources);

        // Log tier information for debugging
        for result in &processed_results {
            let tier = self
                .official_sources
                .get_source_tier(&result.source_domain, &result.url);
            log::debug!(
                "Source: {} - Tier: {} - Score: {}",
                result.source_domain,
                self.official_sources.get_tier_description(&tier),
                result.final_score
            );
        }

        // Step 4b: Filter out non-technical domains (LLM-enhanced only)
        processed_results = result_processor::filter_non_technical_domains(
            processed_results,
            &query_analysis,
            self.llm_client.is_some(),
        );

        // Step 4c: Filter out low-quality results (20 is the minimum quality bar here)
        processed_results = result_processor::filter_quality_results(processed_results, 20);

        // Step 4d: Remove duplicates
        let mut processed_results = result_processor::deduplicate_results(processed_results);

        // Second-chance fallback: if filters resulted in zero items, retry with softer text matching
        if processed_results.is_empty() {
            log::info!("No results after filtering; retrying with softer text-based processing");
            let mut soft_results = result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            );
            // Use a smaller snippet length minimum for soft pass
            soft_results = result_processor::filter_quality_results(soft_results, 10);
            processed_results = result_processor::deduplicate_results(soft_results);
        }

        // Step 5: LLM verification if available (best-effort: failure means "unverified")
        let verification_result = if let Some(ref llm_client) = self.llm_client {
            if llm_client.is_available() {
                log::info!("Verifying results with LLM");
                match llm_verifier::verify_search_results(query, &processed_results).await {
                    Ok(verification) => Some(verification),
                    Err(e) => {
                        log::warn!("LLM verification failed: {}", e);
                        None
                    }
                }
            } else {
                None
            }
        } else {
            None
        };

        // Step 6: Generate summary
        let summary = self.generate_summary(query, &processed_results).await?;

        // Calculate final stats
        let final_official_count = processed_results.iter().filter(|r| r.is_official).count();

        // Unique source domains (HashSet round-trip drops duplicates; order unspecified).
        let sources: Vec<String> = processed_results
            .iter()
            .map(|r| r.source_domain.clone())
            .collect::<std::collections::HashSet<_>>()
            .into_iter()
            .collect();

        let search_time = start_time.elapsed().as_millis() as u64;

        Ok(DocumentationSearchResponse {
            query: query.to_string(),
            summary,
            results: processed_results,
            official_results_count: final_official_count,
            used_fallback,
            // NOTE: total_found counts raw merged search hits, not the filtered set.
            total_found: all_results.len(),
            search_time_ms: search_time,
            sources,
            used_llm_verification: verification_result.is_some(),
            verification_passed: verification_result.as_ref().map(|v| v.is_authentic),
        })
    }
414
415    /// Generate concise summary without AI fluff
416    async fn generate_summary(
417        &self,
418        query: &str,
419        results: &[ProcessedSearchResult],
420    ) -> Result<String> {
421        if results.is_empty() {
422            return Ok("No relevant documentation found".to_string());
423        }
424
425        // Use LLM for intelligent summarization if available
426        if let Some(ref llm_client) = self.llm_client {
427            if llm_client.is_available() {
428                let _context = results
429                    .iter()
430                    .take(3) // Top 3 most relevant
431                    .map(|r| {
432                        format!(
433                            "Source: {} ({})\nContent: {}",
434                            r.source_domain,
435                            if r.is_official {
436                                "Official"
437                            } else {
438                                "Community"
439                            },
440                            r.snippet
441                        )
442                    })
443                    .collect::<Vec<_>>()
444                    .join("\n\n");
445
446                // Create mock search results for LLM synthesis
447                let mock_results: Vec<crate::rag::RagSearchResult> = results
448                    .iter()
449                    .take(3)
450                    .map(|r| crate::rag::RagSearchResult {
451                        id: r.url.clone(),
452                        content: r.snippet.clone(),
453                        source_path: std::path::PathBuf::from(&r.url),
454                        source_type: if r.is_official {
455                            crate::rag::SourceType::Curated
456                        } else {
457                            crate::rag::SourceType::Remote
458                        },
459                        title: Some(r.title.clone()),
460                        section: None,
461                        score: r.final_score,
462                        chunk_index: 0,
463                        metadata: crate::rag::DocumentMetadata {
464                            file_type: "web".to_string(),
465                            size: r.snippet.len() as u64,
466                            modified: r.timestamp.unwrap_or_else(chrono::Utc::now),
467                            tags: vec!["documentation".to_string()],
468                            language: Some("en".to_string()),
469                        },
470                    })
471                    .collect();
472
473                match llm_client.synthesize_answer(query, &mock_results).await {
474                    Ok(response) => return Ok(response.answer),
475                    Err(e) => log::warn!("LLM summarization failed, using fallback: {}", e),
476                }
477            }
478        }
479
480        // Fallback: Generate summary from top results
481        let official_count = results.iter().filter(|r| r.is_official).count();
482        let summary_prefix = if official_count > 0 {
483            format!("From {} official sources", official_count)
484        } else {
485            "From community sources".to_string()
486        };
487
488        let top_content = results
489            .iter()
490            .take(2)
491            .map(|r| r.snippet.split('.').next().unwrap_or(&r.snippet))
492            .collect::<Vec<_>>()
493            .join(". ");
494
495        Ok(format!("{}: {}", summary_prefix, top_content))
496    }
497
498    /// Check if system is ready for searches
499    pub fn is_available(&self) -> bool {
500        self.config.enabled
501    }
502
    /// Get configuration
    ///
    /// Borrows the active search configuration; callers clone if they need ownership.
    pub fn config(&self) -> &WebSearchConfig {
        &self.config
    }
507
508    /// Build technical search query prioritizing dev docs, GitHub, StackOverflow
509    fn build_technical_search_query(&self, query: &str, framework_sites: &[String]) -> String {
510        let dev_domains = [
511            "github.com",
512            "stackoverflow.com",
513            "docs.rs",
514            "developer.mozilla.org",
515            "reactjs.org",
516            "nodejs.org",
517            "python.org",
518            "rust-lang.org",
519            "tauri.app",
520            "electronjs.org",
521            "dev.to",
522            "medium.com/@",
523        ];
524
525        // Combine framework-specific sites with general dev domains
526        let mut all_sites = framework_sites.to_vec();
527        all_sites.extend(dev_domains.iter().map(|s| s.to_string()));
528
529        // Remove duplicates
530        all_sites.sort();
531        all_sites.dedup();
532
533        // Build site-restricted query
534        let site_filters: String = all_sites
535            .iter()
536            .map(|site| format!("site:{}", site))
537            .collect::<Vec<_>>()
538            .join(" OR ");
539
540        format!("({}) {}", site_filters, query)
541    }
542
543    /// Build developer-focused query with technical domain prioritization
544    fn build_dev_focused_query(&self, query: &str, frameworks: &[String]) -> String {
545        let mut dev_query = query.to_string();
546
547        // Add framework-specific terms to boost relevance
548        for framework in frameworks {
549            if !dev_query.to_lowercase().contains(&framework.to_lowercase()) {
550                dev_query = format!("{} {}", framework, dev_query);
551            }
552        }
553
554        // Technical domains to prioritize
555        let tech_domains = [
556            "site:github.com",
557            "site:stackoverflow.com",
558            "site:docs.rs",
559            "site:developer.mozilla.org",
560            "site:dev.to",
561        ];
562
563        // Add technical domain boost (not exclusive, just prioritized)
564        format!("({}) OR {}", tech_domains.join(" OR "), dev_query)
565    }
566
567    /// Check if query is technical based on analysis
568    fn is_technical_query(&self, analysis: &query_analyzer::QueryAnalysis) -> bool {
569        // Technical indicators
570        !analysis.detected_frameworks.is_empty()
571            || analysis
572                .domain_context
573                .primary_domain
574                .contains("development")
575            || analysis
576                .domain_context
577                .primary_domain
578                .contains("programming")
579            || analysis.query_type == query_analyzer::QueryType::Reference
580            || analysis.original_query.to_lowercase().contains("api")
581            || analysis.original_query.to_lowercase().contains("code")
582            || analysis.original_query.to_lowercase().contains("library")
583            || analysis.original_query.to_lowercase().contains("function")
584            || analysis.original_query.to_lowercase().contains("method")
585            || analysis.original_query.to_lowercase().contains("component")
586    }
587}