1use anyhow::{anyhow, Result};
11use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13use std::sync::Arc;
14
15pub mod llm_verifier;
16pub mod official_sources;
17pub mod query_analyzer;
18pub mod result_processor;
19pub mod search_engine;
20
/// Configuration for the documentation web-search subsystem.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebSearchConfig {
    /// Master switch; when false, `DocumentationSearchSystem::new` refuses to construct.
    pub enabled: bool,
    /// Maximum number of results requested per search-engine query.
    pub max_results: usize,
    /// Minimum embedding similarity for a result to survive semantic filtering.
    pub similarity_threshold: f32,
    /// Per-request timeout handed to the search engine, in seconds.
    pub search_timeout_seconds: u64,
    /// User-Agent header sent with outgoing search requests.
    pub user_agent: String,
    /// If fewer official results than this are found, a broader fallback search runs.
    pub min_official_results: usize,
}
31
32impl Default for WebSearchConfig {
33 fn default() -> Self {
34 Self {
35 enabled: true,
36 max_results: 8,
37 similarity_threshold: 0.6,
38 search_timeout_seconds: 10,
39 user_agent: "Manx/0.3.5 Documentation Finder (+https://github.com/neur0map/manx)"
40 .to_string(),
41 min_official_results: 3,
42 }
43 }
44}
45
/// A single result as returned by the search engine, before scoring and filtering.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawSearchResult {
    pub title: String,
    pub url: String,
    /// Short text excerpt provided by the engine.
    pub snippet: String,
    /// Domain the result came from; used for official-source classification.
    pub source_domain: String,
    /// Timestamp for the result when the engine provides one.
    pub timestamp: Option<DateTime<Utc>>,
}
55
/// A search result after official-source classification and relevance scoring.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessedSearchResult {
    pub title: String,
    pub url: String,
    pub snippet: String,
    pub source_domain: String,
    /// True when the domain is recognized as an official documentation source.
    pub is_official: bool,
    /// Source quality tier assigned by `OfficialSourceManager::get_source_tier`.
    pub source_tier: u8,
    /// Similarity of this result to the query (embedding- or text-based).
    pub similarity_score: f32,
    /// Combined score used for final ranking.
    pub final_score: f32,
    pub timestamp: Option<DateTime<Utc>>,
}
69
/// Full response for one documentation search, including timing and provenance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentationSearchResponse {
    /// The original user query.
    pub query: String,
    /// Natural-language summary of the top results.
    pub summary: String,
    pub results: Vec<ProcessedSearchResult>,
    /// How many entries in `results` are from official sources.
    pub official_results_count: usize,
    /// True when the broader (non-official-restricted) fallback search ran.
    pub used_fallback: bool,
    /// Total raw results fetched before filtering and deduplication.
    pub total_found: usize,
    /// Wall-clock duration of the whole search pipeline.
    pub search_time_ms: u64,
    /// Unique source domains represented in `results`.
    pub sources: Vec<String>,
    /// True when an LLM verification pass was attempted and succeeded.
    pub used_llm_verification: bool,
    /// Verification verdict; `None` when verification did not run.
    pub verification_passed: Option<bool>,
}
84
/// Outcome of LLM-based verification of a result set.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerificationResult {
    /// Whether the LLM judged the results authentic.
    pub is_authentic: bool,
    /// LLM-reported confidence in the verdict (presumably 0.0–1.0 — confirm in llm_verifier).
    pub confidence: f32,
    /// The LLM's explanation for its verdict.
    pub reasoning: String,
    /// Optional query reformulation suggested by the LLM.
    pub suggested_refinement: Option<String>,
}
93
/// Orchestrates documentation search: query analysis, engine search, result
/// processing and ranking, and optional LLM verification and summarization.
pub struct DocumentationSearchSystem {
    config: WebSearchConfig,
    /// Optional embedding model; when absent, text matching is used instead.
    embedding_model: Option<Arc<crate::rag::embeddings::EmbeddingModel>>,
    /// Optional LLM client for query enhancement, verification and summaries.
    llm_client: Option<Arc<crate::rag::llm::LlmClient>>,
    official_sources: official_sources::OfficialSourceManager,
    query_analyzer: query_analyzer::QueryAnalyzer,
}
102
103impl DocumentationSearchSystem {
104 pub async fn new(
106 config: WebSearchConfig,
107 llm_config: Option<crate::rag::llm::LlmConfig>,
108 embedding_config: Option<crate::rag::EmbeddingConfig>,
109 ) -> Result<Self> {
110 if !config.enabled {
111 return Err(anyhow!("Documentation search is disabled"));
112 }
113
114 let embedding_model = match match &embedding_config {
116 Some(cfg) => crate::rag::embeddings::EmbeddingModel::new_with_config(cfg.clone()).await,
117 None => crate::rag::embeddings::EmbeddingModel::new().await,
118 } {
119 Ok(model) => {
120 log::info!("Semantic embeddings initialized for search (pooled)");
121 Some(Arc::new(model))
122 }
123 Err(e) => {
124 log::warn!(
125 "Semantic embeddings unavailable, using text matching: {}",
126 e
127 );
128 None
129 }
130 };
131
132 let llm_client = if let Some(llm_cfg) = llm_config {
134 match crate::rag::llm::LlmClient::new(llm_cfg) {
135 Ok(client) => {
136 log::info!("LLM client initialized for result verification (pooled)");
137 Some(Arc::new(client))
138 }
139 Err(e) => {
140 log::warn!("LLM client unavailable: {}", e);
141 None
142 }
143 }
144 } else {
145 None
146 };
147
148 let official_sources = official_sources::OfficialSourceManager::new();
149 let query_analyzer = query_analyzer::QueryAnalyzer::new();
150
151 Ok(Self {
152 config,
153 embedding_model,
154 llm_client,
155 official_sources,
156 query_analyzer,
157 })
158 }
159
    /// Run the full documentation-search pipeline for `query`.
    ///
    /// Stages: query analysis → key-phrase anchoring → strategy-dependent
    /// official search → broadened fallback when too few official results →
    /// semantic (or text) processing → tier logging → domain/quality
    /// filtering and deduplication (with a softer retry) → optional LLM
    /// verification → summary generation.
    ///
    /// # Errors
    /// Propagates failures from query analysis, the search engine, semantic
    /// processing, and summary generation.
    pub async fn search(&mut self, query: &str) -> Result<DocumentationSearchResponse> {
        let start_time = std::time::Instant::now();

        log::info!("🔍 Searching official documentation for: {}", query);

        // Stage 1: analyze the query (LLM-assisted when a client is set) to
        // obtain an enhanced query plus a search strategy.
        let query_analysis = self
            .query_analyzer
            .analyze_query(query, self.llm_client.as_deref())
            .await?;
        log::info!(
            "🧠 Query analysis: {} -> {} (confidence: {:.1}%)",
            query_analysis.original_query,
            query_analysis.enhanced_query,
            query_analysis.confidence * 100.0
        );

        let search_query = &query_analysis.enhanced_query;
        // Extract a short key phrase from the original query: a user-quoted
        // phrase if present, otherwise the first two non-stopword tokens.
        fn extract_key_phrase(q: &str) -> Option<String> {
            let q = q.to_lowercase();
            // Prefer an explicit quoted phrase.
            if let Some(start) = q.find('"') {
                if let Some(end_rel) = q[start + 1..].find('"') {
                    let end = start + 1 + end_rel;
                    let phrase = &q[start + 1..end];
                    if !phrase.trim().is_empty() {
                        return Some(phrase.trim().to_string());
                    }
                }
            }
            // Otherwise drop stopwords and take the first two content words.
            let stop: std::collections::HashSet<&str> = [
                "a", "an", "and", "the", "in", "on", "of", "to", "for", "how", "do", "i", "with",
                "using", "is", "are", "be", "this", "that", "it", "from", "by", "into", "as",
            ]
            .into_iter()
            .collect();
            let content: Vec<&str> = q
                .split_whitespace()
                .filter(|w| !stop.contains(*w))
                .collect();
            if content.len() >= 2 {
                Some(format!("{} {}", content[0], content[1]))
            } else {
                None
            }
        }
        // Anchor the enhanced query with the quoted key phrase when one exists.
        let phrase_query = if let Some(p) = extract_key_phrase(&query_analysis.original_query) {
            format!("\"{}\" {}", p, search_query)
        } else {
            search_query.to_string()
        };

        // Stage 2: build the engine query. With an LLM configured, pick a
        // site-restricted builder based on the analyzer's strategy; otherwise
        // use the static official-source query builder.
        let official_query = if self.llm_client.is_some() {
            match &query_analysis.search_strategy {
                query_analyzer::SearchStrategy::FrameworkSpecific { framework, sites } => {
                    log::info!("🎯 Using LLM-enhanced framework search for {}", framework);
                    self.build_technical_search_query(&phrase_query, sites)
                }
                query_analyzer::SearchStrategy::OfficialDocsFirst { frameworks } => {
                    log::info!(
                        "📚 Using LLM-enhanced prioritized search for: {}",
                        frameworks.join(", ")
                    );
                    self.build_dev_focused_query(&phrase_query, frameworks)
                }
                _ => {
                    if self.is_technical_query(&query_analysis) {
                        log::info!("🔧 Using LLM-enhanced technical search");
                        self.build_dev_focused_query(&phrase_query, &[])
                    } else {
                        self.official_sources.build_official_query(&phrase_query)
                    }
                }
            }
        } else {
            log::debug!("Using standard search (no LLM configured)");
            self.official_sources.build_official_query(&phrase_query)
        };
        // Stage 3: primary engine search with the official/site-restricted query.
        let mut all_results = search_engine::search_duckduckgo(
            &official_query,
            self.config.max_results,
            &self.config.user_agent,
            self.config.search_timeout_seconds,
        )
        .await?;

        let mut used_fallback = false;

        // Count how many primary results are from official domains.
        let official_results_count = all_results
            .iter()
            .filter(|r| self.official_sources.is_official_domain(&r.source_domain))
            .count();

        // Stage 4: if official coverage is too thin, re-search without the
        // site restriction and merge in any URLs not already present.
        if official_results_count < self.config.min_official_results {
            log::info!(
                "⚠️ Only {} official results found, expanding search...",
                official_results_count
            );
            used_fallback = true;

            let fallback_results = search_engine::search_duckduckgo(
                &phrase_query,
                self.config.max_results,
                &self.config.user_agent,
                self.config.search_timeout_seconds,
            )
            .await?;

            for result in fallback_results {
                if !all_results.iter().any(|r| r.url == result.url) {
                    all_results.push(result);
                }
            }
        }

        // Nothing found at all: return an empty response rather than erroring.
        if all_results.is_empty() {
            return Ok(DocumentationSearchResponse {
                query: query.to_string(),
                summary: "No relevant documentation found".to_string(),
                results: vec![],
                official_results_count: 0,
                used_fallback: false,
                total_found: 0,
                search_time_ms: start_time.elapsed().as_millis() as u64,
                sources: vec![],
                used_llm_verification: false,
                verification_passed: None,
            });
        }

        // Stage 5: score/classify results — semantically when embeddings are
        // available, else with plain text matching.
        let mut processed_results = if let Some(ref embedding_model) = self.embedding_model {
            result_processor::process_with_embeddings_and_analysis(
                &query_analysis,
                &all_results,
                embedding_model,
                &self.official_sources,
                self.config.similarity_threshold,
            )
            .await?
        } else {
            result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            )
        };

        // Semantic filtering can be over-aggressive; fall back to text matching.
        if processed_results.is_empty() {
            log::info!("No results after semantic filtering; retrying with text matching fallback");
            processed_results = result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            );
        }

        result_processor::enhance_results(&mut processed_results, &self.official_sources);

        // Debug-log the tier and score assigned to each surviving result.
        for result in &processed_results {
            let tier = self
                .official_sources
                .get_source_tier(&result.source_domain, &result.url);
            log::debug!(
                "Source: {} - Tier: {} - Score: {}",
                result.source_domain,
                self.official_sources.get_tier_description(&tier),
                result.final_score
            );
        }

        // Stage 6: filter out non-technical domains, low-quality results, and
        // duplicates. Note: re-binds (shadows) `processed_results`.
        processed_results = result_processor::filter_non_technical_domains(
            processed_results,
            &query_analysis,
            self.llm_client.is_some(),
        );

        processed_results = result_processor::filter_quality_results(processed_results, 20);

        let mut processed_results = result_processor::deduplicate_results(processed_results);

        // Filtering emptied the set: retry with text-based processing and a
        // lower quality bar (10 instead of 20).
        if processed_results.is_empty() {
            log::info!("No results after filtering; retrying with softer text-based processing");
            let mut soft_results = result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            );
            soft_results = result_processor::filter_quality_results(soft_results, 10);
            processed_results = result_processor::deduplicate_results(soft_results);
        }

        // Stage 7: optional LLM verification; failure downgrades to None
        // rather than failing the search.
        let verification_result = if let Some(ref llm_client) = self.llm_client {
            if llm_client.is_available() {
                log::info!("Verifying results with LLM");
                match llm_verifier::verify_search_results(query, &processed_results).await {
                    Ok(verification) => Some(verification),
                    Err(e) => {
                        log::warn!("LLM verification failed: {}", e);
                        None
                    }
                }
            } else {
                None
            }
        } else {
            None
        };

        // Stage 8: summarize and assemble the response.
        let summary = self.generate_summary(query, &processed_results).await?;

        let final_official_count = processed_results.iter().filter(|r| r.is_official).count();

        // Unique source domains (HashSet round-trip drops duplicates).
        let sources: Vec<String> = processed_results
            .iter()
            .map(|r| r.source_domain.clone())
            .collect::<std::collections::HashSet<_>>()
            .into_iter()
            .collect();

        let search_time = start_time.elapsed().as_millis() as u64;

        Ok(DocumentationSearchResponse {
            query: query.to_string(),
            summary,
            results: processed_results,
            official_results_count: final_official_count,
            used_fallback,
            total_found: all_results.len(),
            search_time_ms: search_time,
            sources,
            used_llm_verification: verification_result.is_some(),
            verification_passed: verification_result.as_ref().map(|v| v.is_authentic),
        })
    }
415
416 async fn generate_summary(
418 &self,
419 query: &str,
420 results: &[ProcessedSearchResult],
421 ) -> Result<String> {
422 if results.is_empty() {
423 return Ok("No relevant documentation found".to_string());
424 }
425
426 if let Some(ref llm_client) = self.llm_client {
428 if llm_client.is_available() {
429 let _context = results
430 .iter()
431 .take(3) .map(|r| {
433 format!(
434 "Source: {} ({})\nContent: {}",
435 r.source_domain,
436 if r.is_official {
437 "Official"
438 } else {
439 "Community"
440 },
441 r.snippet
442 )
443 })
444 .collect::<Vec<_>>()
445 .join("\n\n");
446
447 let mock_results: Vec<crate::rag::RagSearchResult> = results
449 .iter()
450 .take(3)
451 .map(|r| crate::rag::RagSearchResult {
452 id: r.url.clone(),
453 content: r.snippet.clone(),
454 source_path: std::path::PathBuf::from(&r.url),
455 source_type: if r.is_official {
456 crate::rag::SourceType::Curated
457 } else {
458 crate::rag::SourceType::Remote
459 },
460 title: Some(r.title.clone()),
461 section: None,
462 score: r.final_score,
463 chunk_index: 0,
464 metadata: crate::rag::DocumentMetadata {
465 file_type: "web".to_string(),
466 size: r.snippet.len() as u64,
467 modified: r.timestamp.unwrap_or_else(chrono::Utc::now),
468 tags: vec!["documentation".to_string()],
469 language: Some("en".to_string()),
470 },
471 })
472 .collect();
473
474 match llm_client.synthesize_answer(query, &mock_results).await {
475 Ok(response) => return Ok(response.answer),
476 Err(e) => log::warn!("LLM summarization failed, using fallback: {}", e),
477 }
478 }
479 }
480
481 let official_count = results.iter().filter(|r| r.is_official).count();
483 let summary_prefix = if official_count > 0 {
484 format!("From {} official sources", official_count)
485 } else {
486 "From community sources".to_string()
487 };
488
489 let top_content = results
490 .iter()
491 .take(2)
492 .map(|r| r.snippet.split('.').next().unwrap_or(&r.snippet))
493 .collect::<Vec<_>>()
494 .join(". ");
495
496 Ok(format!("{}: {}", summary_prefix, top_content))
497 }
498
499 pub fn is_available(&self) -> bool {
501 self.config.enabled
502 }
503
    /// Borrow the active search configuration.
    pub fn config(&self) -> &WebSearchConfig {
        &self.config
    }
508
509 fn build_technical_search_query(&self, query: &str, framework_sites: &[String]) -> String {
511 let dev_domains = [
512 "github.com",
513 "stackoverflow.com",
514 "docs.rs",
515 "developer.mozilla.org",
516 "reactjs.org",
517 "nodejs.org",
518 "python.org",
519 "rust-lang.org",
520 "tauri.app",
521 "electronjs.org",
522 "dev.to",
523 "medium.com/@",
524 ];
525
526 let mut all_sites = framework_sites.to_vec();
528 all_sites.extend(dev_domains.iter().map(|s| s.to_string()));
529
530 all_sites.sort();
532 all_sites.dedup();
533
534 let site_filters: String = all_sites
536 .iter()
537 .map(|site| format!("site:{}", site))
538 .collect::<Vec<_>>()
539 .join(" OR ");
540
541 format!("({}) {}", site_filters, query)
542 }
543
544 fn build_dev_focused_query(&self, query: &str, frameworks: &[String]) -> String {
546 let mut dev_query = query.to_string();
547
548 for framework in frameworks {
550 if !dev_query.to_lowercase().contains(&framework.to_lowercase()) {
551 dev_query = format!("{} {}", framework, dev_query);
552 }
553 }
554
555 let tech_domains = [
557 "site:github.com",
558 "site:stackoverflow.com",
559 "site:docs.rs",
560 "site:developer.mozilla.org",
561 "site:dev.to",
562 ];
563
564 format!("({}) OR {}", tech_domains.join(" OR "), dev_query)
566 }
567
568 fn is_technical_query(&self, analysis: &query_analyzer::QueryAnalysis) -> bool {
570 !analysis.detected_frameworks.is_empty()
572 || analysis
573 .domain_context
574 .primary_domain
575 .contains("development")
576 || analysis
577 .domain_context
578 .primary_domain
579 .contains("programming")
580 || analysis.query_type == query_analyzer::QueryType::Reference
581 || analysis.original_query.to_lowercase().contains("api")
582 || analysis.original_query.to_lowercase().contains("code")
583 || analysis.original_query.to_lowercase().contains("library")
584 || analysis.original_query.to_lowercase().contains("function")
585 || analysis.original_query.to_lowercase().contains("method")
586 || analysis.original_query.to_lowercase().contains("component")
587 }
588}