// manx_cli/web_search/mod.rs
1//! Intelligent documentation search with DuckDuckGo + semantic embeddings
2//!
3//! This module provides official-first documentation search that:
4//! - Prioritizes official documentation sites by default
5//! - Falls back to trusted community sources with clear notifications
6//! - Uses semantic embeddings for relevance filtering
7//! - Optionally uses LLM for authenticity verification and summarization
8//! - Maintains privacy with anonymous DuckDuckGo searches
9
10use anyhow::{anyhow, Result};
11use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13use std::sync::Arc;
14
/// LLM-based authenticity verification of search results (`verify_search_results`).
pub mod llm_verifier;
/// Official-domain detection, source tiers, and official-first query building.
pub mod official_sources;
/// Query intent analysis and search-strategy selection.
pub mod query_analyzer;
/// Result scoring, filtering, enhancement, and deduplication.
pub mod result_processor;
/// DuckDuckGo search execution (`search_duckduckgo`).
pub mod search_engine;
20
/// Configuration for documentation search
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebSearchConfig {
    /// Master switch; `DocumentationSearchSystem::new` returns an error when false.
    pub enabled: bool,
    /// Maximum number of results requested per DuckDuckGo query.
    pub max_results: usize,
    /// Minimum semantic similarity for a result to survive embedding-based filtering.
    pub similarity_threshold: f32,
    /// Per-request timeout passed to the search engine, in seconds.
    pub search_timeout_seconds: u64,
    /// User-Agent string sent with search requests.
    pub user_agent: String,
    /// Minimum official results required before falling back to an unrestricted search.
    pub min_official_results: usize, // Minimum official results before fallback
}
31
32impl Default for WebSearchConfig {
33    fn default() -> Self {
34        Self {
35            enabled: true,
36            max_results: 8,
37            similarity_threshold: 0.6,
38            search_timeout_seconds: 10,
39            user_agent: "Manx/0.3.5 Documentation Finder (+https://github.com/neur0map/manx)"
40                .to_string(),
41            min_official_results: 3,
42        }
43    }
44}
45
/// Raw search result from DuckDuckGo
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawSearchResult {
    /// Result page title as returned by the engine.
    pub title: String,
    /// Full result URL; used for deduplication across searches.
    pub url: String,
    /// Short text excerpt for the result.
    pub snippet: String,
    /// Domain of the result; checked against official-source lists.
    pub source_domain: String,
    /// Result timestamp, when the engine provides one.
    pub timestamp: Option<DateTime<Utc>>,
}
55
/// Processed search result with relevance scoring
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessedSearchResult {
    pub title: String,
    pub url: String,
    pub snippet: String,
    pub source_domain: String,
    /// True when the domain is recognized as an official source.
    pub is_official: bool,
    pub source_tier: u8, // 1=Official docs, 2=Official repos, 3=Trusted community, 4=General
    /// Semantic (or text-match) similarity to the query.
    pub similarity_score: f32,
    pub final_score: f32, // Combined similarity + official boost
    pub timestamp: Option<DateTime<Utc>>,
}
69
/// Final documentation search response
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentationSearchResponse {
    /// The caller's original query string.
    pub query: String,
    /// LLM-synthesized or snippet-based summary of the top results.
    pub summary: String,
    /// Filtered, scored, and deduplicated results.
    pub results: Vec<ProcessedSearchResult>,
    /// Number of `results` entries flagged as official.
    pub official_results_count: usize,
    /// True when the unrestricted fallback search was executed.
    pub used_fallback: bool,
    /// Raw result count from the engine before filtering.
    pub total_found: usize,
    /// Wall-clock duration of the whole search pipeline.
    pub search_time_ms: u64,
    /// Unique source domains represented in `results`.
    pub sources: Vec<String>,
    /// True when LLM verification was attempted and succeeded in producing a verdict.
    pub used_llm_verification: bool,
    /// LLM authenticity verdict, when verification ran.
    pub verification_passed: Option<bool>,
}
84
/// LLM verification response for search authenticity
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerificationResult {
    /// Whether the LLM judged the results authentic for the query.
    pub is_authentic: bool,
    /// Confidence of the verdict (0.0-1.0 presumed — TODO confirm against verifier).
    pub confidence: f32,
    /// Free-text explanation of the verdict.
    pub reasoning: String,
    pub suggested_refinement: Option<String>, // If search should be refined
}
93
/// Documentation search system
pub struct DocumentationSearchSystem {
    config: WebSearchConfig,
    /// Optional embedding model, shared via `Arc`; `None` falls back to text matching.
    embedding_model: Option<Arc<crate::rag::embeddings::EmbeddingModel>>,
    /// Optional LLM client, shared via `Arc`; used for query analysis, verification, summaries.
    llm_client: Option<Arc<crate::rag::llm::LlmClient>>,
    official_sources: official_sources::OfficialSourceManager,
    query_analyzer: query_analyzer::QueryAnalyzer,
}
102
103impl DocumentationSearchSystem {
104    /// Create new documentation search system
105    pub async fn new(
106        config: WebSearchConfig,
107        llm_config: Option<crate::rag::llm::LlmConfig>,
108        embedding_config: Option<crate::rag::EmbeddingConfig>,
109    ) -> Result<Self> {
110        if !config.enabled {
111            return Err(anyhow!("Documentation search is disabled"));
112        }
113
114        // Initialize semantic embeddings for similarity scoring (with Arc for sharing)
115        let embedding_model = match match &embedding_config {
116            Some(cfg) => crate::rag::embeddings::EmbeddingModel::new_with_config(cfg.clone()).await,
117            None => crate::rag::embeddings::EmbeddingModel::new().await,
118        } {
119            Ok(model) => {
120                log::info!("Semantic embeddings initialized for search (pooled)");
121                Some(Arc::new(model))
122            }
123            Err(e) => {
124                log::warn!(
125                    "Semantic embeddings unavailable, using text matching: {}",
126                    e
127                );
128                None
129            }
130        };
131
132        // Initialize LLM client if configured (with Arc for sharing)
133        let llm_client = if let Some(llm_cfg) = llm_config {
134            match crate::rag::llm::LlmClient::new(llm_cfg) {
135                Ok(client) => {
136                    log::info!("LLM client initialized for result verification (pooled)");
137                    Some(Arc::new(client))
138                }
139                Err(e) => {
140                    log::warn!("LLM client unavailable: {}", e);
141                    None
142                }
143            }
144        } else {
145            None
146        };
147
148        let official_sources = official_sources::OfficialSourceManager::new();
149        let query_analyzer = query_analyzer::QueryAnalyzer::new();
150
151        Ok(Self {
152            config,
153            embedding_model,
154            llm_client,
155            official_sources,
156            query_analyzer,
157        })
158    }
159
    /// Search for documentation with official-first strategy
    ///
    /// Pipeline: analyze query intent, run an official-sources-biased DuckDuckGo
    /// search, widen to an unrestricted search when official hits are scarce,
    /// then score, filter, dedupe, optionally LLM-verify, and summarize.
    pub async fn search(&mut self, query: &str) -> Result<DocumentationSearchResponse> {
        let start_time = std::time::Instant::now();

        log::info!("🔍 Searching official documentation for: {}", query);

        // Step 0: Analyze query intent to enhance search strategy
        let query_analysis = self
            .query_analyzer
            .analyze_query(query, self.llm_client.as_deref())
            .await?;
        log::info!(
            "🧠 Query analysis: {} -> {} (confidence: {:.1}%)",
            query_analysis.original_query,
            query_analysis.enhanced_query,
            query_analysis.confidence * 100.0
        );

        // Use enhanced query for better search results
        let search_query = &query_analysis.enhanced_query;
        // Prefer keeping key phrases intact (e.g., "hello world").
        // Returns an explicitly quoted phrase from the query when present,
        // otherwise the first two non-stopword words, otherwise None.
        fn extract_key_phrase(q: &str) -> Option<String> {
            let q = q.to_lowercase();
            if let Some(start) = q.find('"') {
                if let Some(end_rel) = q[start + 1..].find('"') {
                    let end = start + 1 + end_rel;
                    let phrase = &q[start + 1..end];
                    if !phrase.trim().is_empty() {
                        return Some(phrase.trim().to_string());
                    }
                }
            }
            // Fallback: first two content words (basic stopword filter)
            let stop: std::collections::HashSet<&str> = [
                "a", "an", "and", "the", "in", "on", "of", "to", "for", "how", "do", "i", "with",
                "using", "is", "are", "be", "this", "that", "it", "from", "by", "into", "as",
            ]
            .into_iter()
            .collect();
            let content: Vec<&str> = q
                .split_whitespace()
                .filter(|w| !stop.contains(*w))
                .collect();
            if content.len() >= 2 {
                Some(format!("{} {}", content[0], content[1]))
            } else {
                None
            }
        }
        // Quote the extracted phrase so the engine treats it as a single unit,
        // and append the LLM-enhanced query terms after it.
        let phrase_query = if let Some(p) = extract_key_phrase(&query_analysis.original_query) {
            format!("\"{}\" {}", p, search_query)
        } else {
            search_query.to_string()
        };

        // Step 1: Apply smart search strategy (only when LLM is available)
        let official_query = if self.llm_client.is_some() {
            match &query_analysis.search_strategy {
                query_analyzer::SearchStrategy::FrameworkSpecific { framework, sites } => {
                    log::info!("🎯 Using LLM-enhanced framework search for {}", framework);
                    self.build_technical_search_query(&phrase_query, sites)
                }
                query_analyzer::SearchStrategy::OfficialDocsFirst { frameworks } => {
                    log::info!(
                        "📚 Using LLM-enhanced prioritized search for: {}",
                        frameworks.join(", ")
                    );
                    self.build_dev_focused_query(&phrase_query, frameworks)
                }
                _ => {
                    if self.is_technical_query(&query_analysis) {
                        log::info!("🔧 Using LLM-enhanced technical search");
                        self.build_dev_focused_query(&phrase_query, &[])
                    } else {
                        self.official_sources.build_official_query(&phrase_query)
                    }
                }
            }
        } else {
            // Default behavior when no LLM - unchanged original functionality
            log::debug!("Using standard search (no LLM configured)");
            self.official_sources.build_official_query(&phrase_query)
        };
        let mut all_results = search_engine::search_duckduckgo(
            &official_query,
            self.config.max_results,
            &self.config.user_agent,
            self.config.search_timeout_seconds,
        )
        .await?;

        let mut used_fallback = false;

        // Step 2: Check if we have enough official results
        let official_results_count = all_results
            .iter()
            .filter(|r| self.official_sources.is_official_domain(&r.source_domain))
            .count();

        // Step 3: Fallback to general search if insufficient official results
        if official_results_count < self.config.min_official_results {
            log::info!(
                "⚠️ Only {} official results found, expanding search...",
                official_results_count
            );
            used_fallback = true;

            // Search without site restrictions
            let fallback_results = search_engine::search_duckduckgo(
                &phrase_query,
                self.config.max_results,
                &self.config.user_agent,
                self.config.search_timeout_seconds,
            )
            .await?;

            // Merge results, avoiding duplicates
            for result in fallback_results {
                if !all_results.iter().any(|r| r.url == result.url) {
                    all_results.push(result);
                }
            }
        }

        if all_results.is_empty() {
            // NOTE(review): `used_fallback` is hardcoded to false here even when
            // the fallback search ran above — confirm whether that is intended.
            return Ok(DocumentationSearchResponse {
                query: query.to_string(),
                summary: "No relevant documentation found".to_string(),
                results: vec![],
                official_results_count: 0,
                used_fallback: false,
                total_found: 0,
                search_time_ms: start_time.elapsed().as_millis() as u64,
                sources: vec![],
                used_llm_verification: false,
                verification_passed: None,
            });
        }

        // Step 4: Process results with enhanced semantic filtering and query analysis
        let mut processed_results = if let Some(ref embedding_model) = self.embedding_model {
            result_processor::process_with_embeddings_and_analysis(
                &query_analysis,
                &all_results,
                embedding_model,
                &self.official_sources,
                self.config.similarity_threshold,
            )
            .await?
        } else {
            result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            )
        };

        // Fallback: if nothing survived filtering, retry with text matching (softer) before final filters
        if processed_results.is_empty() {
            log::info!("No results after semantic filtering; retrying with text matching fallback");
            processed_results = result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            );
        }

        // Step 4a: Enhance results with additional metadata
        result_processor::enhance_results(&mut processed_results, &self.official_sources);

        // Log tier information for debugging
        for result in &processed_results {
            let tier = self
                .official_sources
                .get_source_tier(&result.source_domain, &result.url);
            log::debug!(
                "Source: {} - Tier: {} - Score: {}",
                result.source_domain,
                self.official_sources.get_tier_description(&tier),
                result.final_score
            );
        }

        // Step 4b: Filter out non-technical domains (LLM-enhanced only)
        processed_results = result_processor::filter_non_technical_domains(
            processed_results,
            &query_analysis,
            self.llm_client.is_some(),
        );

        // Step 4c: Filter out low-quality results (minimum snippet length 20)
        processed_results = result_processor::filter_quality_results(processed_results, 20);

        // Step 4d: Remove duplicates (re-binds `processed_results` via shadowing)
        let mut processed_results = result_processor::deduplicate_results(processed_results);

        // Second-chance fallback: if filters resulted in zero items, retry with softer text matching
        if processed_results.is_empty() {
            log::info!("No results after filtering; retrying with softer text-based processing");
            let mut soft_results = result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            );
            // Use a smaller snippet length minimum for soft pass
            soft_results = result_processor::filter_quality_results(soft_results, 10);
            processed_results = result_processor::deduplicate_results(soft_results);
        }

        // Step 5: LLM verification if available
        let verification_result = if let Some(ref llm_client) = self.llm_client {
            if llm_client.is_available() {
                log::info!("Verifying results with LLM");
                match llm_verifier::verify_search_results(query, &processed_results).await {
                    Ok(verification) => Some(verification),
                    Err(e) => {
                        log::warn!("LLM verification failed: {}", e);
                        None
                    }
                }
            } else {
                None
            }
        } else {
            None
        };

        // Step 6: Generate summary
        let summary = self.generate_summary(query, &processed_results).await?;

        // Calculate final stats
        let final_official_count = processed_results.iter().filter(|r| r.is_official).count();

        // Unique source domains (the HashSet round-trip drops duplicates).
        let sources: Vec<String> = processed_results
            .iter()
            .map(|r| r.source_domain.clone())
            .collect::<std::collections::HashSet<_>>()
            .into_iter()
            .collect();

        let search_time = start_time.elapsed().as_millis() as u64;

        Ok(DocumentationSearchResponse {
            query: query.to_string(),
            summary,
            results: processed_results,
            official_results_count: final_official_count,
            used_fallback,
            // Raw engine hit count before filtering, not the filtered count.
            total_found: all_results.len(),
            search_time_ms: search_time,
            sources,
            used_llm_verification: verification_result.is_some(),
            verification_passed: verification_result.as_ref().map(|v| v.is_authentic),
        })
    }
415
416    /// Generate concise summary without AI fluff
417    async fn generate_summary(
418        &self,
419        query: &str,
420        results: &[ProcessedSearchResult],
421    ) -> Result<String> {
422        if results.is_empty() {
423            return Ok("No relevant documentation found".to_string());
424        }
425
426        // Use LLM for intelligent summarization if available
427        if let Some(ref llm_client) = self.llm_client {
428            if llm_client.is_available() {
429                let _context = results
430                    .iter()
431                    .take(3) // Top 3 most relevant
432                    .map(|r| {
433                        format!(
434                            "Source: {} ({})\nContent: {}",
435                            r.source_domain,
436                            if r.is_official {
437                                "Official"
438                            } else {
439                                "Community"
440                            },
441                            r.snippet
442                        )
443                    })
444                    .collect::<Vec<_>>()
445                    .join("\n\n");
446
447                // Create mock search results for LLM synthesis
448                let mock_results: Vec<crate::rag::RagSearchResult> = results
449                    .iter()
450                    .take(3)
451                    .map(|r| crate::rag::RagSearchResult {
452                        id: r.url.clone(),
453                        content: r.snippet.clone(),
454                        source_path: std::path::PathBuf::from(&r.url),
455                        source_type: if r.is_official {
456                            crate::rag::SourceType::Curated
457                        } else {
458                            crate::rag::SourceType::Remote
459                        },
460                        title: Some(r.title.clone()),
461                        section: None,
462                        score: r.final_score,
463                        chunk_index: 0,
464                        metadata: crate::rag::DocumentMetadata {
465                            file_type: "web".to_string(),
466                            size: r.snippet.len() as u64,
467                            modified: r.timestamp.unwrap_or_else(chrono::Utc::now),
468                            tags: vec!["documentation".to_string()],
469                            language: Some("en".to_string()),
470                        },
471                    })
472                    .collect();
473
474                match llm_client.synthesize_answer(query, &mock_results).await {
475                    Ok(response) => return Ok(response.answer),
476                    Err(e) => log::warn!("LLM summarization failed, using fallback: {}", e),
477                }
478            }
479        }
480
481        // Fallback: Generate summary from top results
482        let official_count = results.iter().filter(|r| r.is_official).count();
483        let summary_prefix = if official_count > 0 {
484            format!("From {} official sources", official_count)
485        } else {
486            "From community sources".to_string()
487        };
488
489        let top_content = results
490            .iter()
491            .take(2)
492            .map(|r| r.snippet.split('.').next().unwrap_or(&r.snippet))
493            .collect::<Vec<_>>()
494            .join(". ");
495
496        Ok(format!("{}: {}", summary_prefix, top_content))
497    }
498
    /// Check if system is ready for searches
    ///
    /// Returns the `enabled` config flag only; no network or backend checks.
    pub fn is_available(&self) -> bool {
        self.config.enabled
    }
503
    /// Get configuration
    ///
    /// Borrows the active [`WebSearchConfig`].
    pub fn config(&self) -> &WebSearchConfig {
        &self.config
    }
508
509    /// Build technical search query prioritizing dev docs, GitHub, StackOverflow
510    fn build_technical_search_query(&self, query: &str, framework_sites: &[String]) -> String {
511        let dev_domains = [
512            "github.com",
513            "stackoverflow.com",
514            "docs.rs",
515            "developer.mozilla.org",
516            "reactjs.org",
517            "nodejs.org",
518            "python.org",
519            "rust-lang.org",
520            "tauri.app",
521            "electronjs.org",
522            "dev.to",
523            "medium.com/@",
524        ];
525
526        // Combine framework-specific sites with general dev domains
527        let mut all_sites = framework_sites.to_vec();
528        all_sites.extend(dev_domains.iter().map(|s| s.to_string()));
529
530        // Remove duplicates
531        all_sites.sort();
532        all_sites.dedup();
533
534        // Build site-restricted query
535        let site_filters: String = all_sites
536            .iter()
537            .map(|site| format!("site:{}", site))
538            .collect::<Vec<_>>()
539            .join(" OR ");
540
541        format!("({}) {}", site_filters, query)
542    }
543
544    /// Build developer-focused query with technical domain prioritization
545    fn build_dev_focused_query(&self, query: &str, frameworks: &[String]) -> String {
546        let mut dev_query = query.to_string();
547
548        // Add framework-specific terms to boost relevance
549        for framework in frameworks {
550            if !dev_query.to_lowercase().contains(&framework.to_lowercase()) {
551                dev_query = format!("{} {}", framework, dev_query);
552            }
553        }
554
555        // Technical domains to prioritize
556        let tech_domains = [
557            "site:github.com",
558            "site:stackoverflow.com",
559            "site:docs.rs",
560            "site:developer.mozilla.org",
561            "site:dev.to",
562        ];
563
564        // Add technical domain boost (not exclusive, just prioritized)
565        format!("({}) OR {}", tech_domains.join(" OR "), dev_query)
566    }
567
568    /// Check if query is technical based on analysis
569    fn is_technical_query(&self, analysis: &query_analyzer::QueryAnalysis) -> bool {
570        // Technical indicators
571        !analysis.detected_frameworks.is_empty()
572            || analysis
573                .domain_context
574                .primary_domain
575                .contains("development")
576            || analysis
577                .domain_context
578                .primary_domain
579                .contains("programming")
580            || analysis.query_type == query_analyzer::QueryType::Reference
581            || analysis.original_query.to_lowercase().contains("api")
582            || analysis.original_query.to_lowercase().contains("code")
583            || analysis.original_query.to_lowercase().contains("library")
584            || analysis.original_query.to_lowercase().contains("function")
585            || analysis.original_query.to_lowercase().contains("method")
586            || analysis.original_query.to_lowercase().contains("component")
587    }
588}