1use anyhow::{anyhow, Result};
11use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13
14pub mod llm_verifier;
15pub mod official_sources;
16pub mod query_analyzer;
17pub mod result_processor;
18pub mod search_engine;
19
/// Configuration for the web documentation search subsystem.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebSearchConfig {
    // Master switch; `DocumentationSearchSystem::new` returns an error when false.
    pub enabled: bool,
    // Maximum number of results requested per search-engine call.
    pub max_results: usize,
    // Minimum embedding similarity for a result to survive semantic filtering.
    pub similarity_threshold: f32,
    // Per-request timeout passed to the search engine, in seconds.
    pub search_timeout_seconds: u64,
    // User-Agent header sent with outgoing search requests.
    pub user_agent: String,
    // If fewer official-domain results than this come back, a broader
    // fallback query is issued.
    pub min_official_results: usize,
}
30
31impl Default for WebSearchConfig {
32 fn default() -> Self {
33 Self {
34 enabled: true,
35 max_results: 8,
36 similarity_threshold: 0.6,
37 search_timeout_seconds: 10,
38 user_agent: "Manx/0.3.5 Documentation Finder (+https://github.com/neur0map/manx)"
39 .to_string(),
40 min_official_results: 3,
41 }
42 }
43}
44
/// A single unprocessed hit as returned by the search engine.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawSearchResult {
    pub title: String,
    // Full URL of the hit; used to de-duplicate results across queries.
    pub url: String,
    pub snippet: String,
    // Domain of the result; checked against the official-source list.
    pub source_domain: String,
    // Publication/crawl time when the engine provides one.
    pub timestamp: Option<DateTime<Utc>>,
}
54
/// A search hit after scoring, classification, and filtering by
/// `result_processor`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessedSearchResult {
    pub title: String,
    pub url: String,
    pub snippet: String,
    pub source_domain: String,
    // True when `source_domain` is recognized as an official documentation host.
    pub is_official: bool,
    // Quality tier assigned via `OfficialSourceManager::get_source_tier`.
    pub source_tier: u8,
    // Similarity of this hit to the query — embedding-based when a model is
    // available, text-match otherwise (computed in `result_processor`).
    pub similarity_score: f32,
    // Combined ranking score used for ordering and quality filtering.
    pub final_score: f32,
    pub timestamp: Option<DateTime<Utc>>,
}
68
/// Final response returned by `DocumentationSearchSystem::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentationSearchResponse {
    // The caller's original query, verbatim.
    pub query: String,
    // Human-readable summary: LLM-synthesized when a client is available,
    // heuristic snippet stitching otherwise.
    pub summary: String,
    pub results: Vec<ProcessedSearchResult>,
    // Number of entries in `results` flagged as official.
    pub official_results_count: usize,
    // True when the broader (non-official) fallback query was issued.
    pub used_fallback: bool,
    // Count of raw results fetched before processing and filtering.
    pub total_found: usize,
    // Wall-clock duration of the whole search pipeline.
    pub search_time_ms: u64,
    // Unique source domains present in `results`.
    pub sources: Vec<String>,
    // Whether an LLM verification pass ran on the results.
    pub used_llm_verification: bool,
    // Outcome of LLM verification; `None` when verification did not run.
    pub verification_passed: Option<bool>,
}
83
/// Outcome of the LLM verification pass (see the `llm_verifier` module).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerificationResult {
    // Whether the LLM judged the results authentic/relevant.
    pub is_authentic: bool,
    // Verifier's confidence — presumably in [0.0, 1.0]; confirm in `llm_verifier`.
    pub confidence: f32,
    // Free-text explanation from the verifier.
    pub reasoning: String,
    // Optional refined query suggested by the verifier.
    pub suggested_refinement: Option<String>,
}
92
/// Orchestrates documentation search: query analysis, search-engine queries,
/// result processing/scoring, optional LLM verification, and summarization.
pub struct DocumentationSearchSystem {
    config: WebSearchConfig,
    // Present when semantic embeddings initialized successfully; when `None`,
    // text-based matching is used instead.
    embedding_model: Option<crate::rag::embeddings::EmbeddingModel>,
    // Present when an LLM client was configured and constructed successfully;
    // enables query enhancement, verification, and summary synthesis.
    llm_client: Option<crate::rag::llm::LlmClient>,
    official_sources: official_sources::OfficialSourceManager,
    query_analyzer: query_analyzer::QueryAnalyzer,
}
101
102impl DocumentationSearchSystem {
103 pub async fn new(
105 config: WebSearchConfig,
106 llm_config: Option<crate::rag::llm::LlmConfig>,
107 embedding_config: Option<crate::rag::EmbeddingConfig>,
108 ) -> Result<Self> {
109 if !config.enabled {
110 return Err(anyhow!("Documentation search is disabled"));
111 }
112
113 let embedding_model = match match &embedding_config {
115 Some(cfg) => crate::rag::embeddings::EmbeddingModel::new_with_config(cfg.clone()).await,
116 None => crate::rag::embeddings::EmbeddingModel::new().await,
117 } {
118 Ok(model) => {
119 log::info!("Semantic embeddings initialized for search");
120 Some(model)
121 }
122 Err(e) => {
123 log::warn!(
124 "Semantic embeddings unavailable, using text matching: {}",
125 e
126 );
127 None
128 }
129 };
130
131 let llm_client = if let Some(llm_cfg) = llm_config {
133 match crate::rag::llm::LlmClient::new(llm_cfg) {
134 Ok(client) => {
135 log::info!("LLM client initialized for result verification");
136 Some(client)
137 }
138 Err(e) => {
139 log::warn!("LLM client unavailable: {}", e);
140 None
141 }
142 }
143 } else {
144 None
145 };
146
147 let official_sources = official_sources::OfficialSourceManager::new();
148 let query_analyzer = query_analyzer::QueryAnalyzer::new();
149
150 Ok(Self {
151 config,
152 embedding_model,
153 llm_client,
154 official_sources,
155 query_analyzer,
156 })
157 }
158
159 pub async fn search(&mut self, query: &str) -> Result<DocumentationSearchResponse> {
161 let start_time = std::time::Instant::now();
162
163 log::info!("🔍 Searching official documentation for: {}", query);
164
165 let query_analysis = self
167 .query_analyzer
168 .analyze_query(query, self.llm_client.as_ref())
169 .await?;
170 log::info!(
171 "🧠 Query analysis: {} -> {} (confidence: {:.1}%)",
172 query_analysis.original_query,
173 query_analysis.enhanced_query,
174 query_analysis.confidence * 100.0
175 );
176
177 let search_query = &query_analysis.enhanced_query;
179 fn extract_key_phrase(q: &str) -> Option<String> {
181 let q = q.to_lowercase();
182 if let Some(start) = q.find('"') {
183 if let Some(end_rel) = q[start + 1..].find('"') {
184 let end = start + 1 + end_rel;
185 let phrase = &q[start + 1..end];
186 if !phrase.trim().is_empty() {
187 return Some(phrase.trim().to_string());
188 }
189 }
190 }
191 let stop: std::collections::HashSet<&str> = [
193 "a", "an", "and", "the", "in", "on", "of", "to", "for", "how", "do", "i", "with",
194 "using", "is", "are", "be", "this", "that", "it", "from", "by", "into", "as",
195 ]
196 .into_iter()
197 .collect();
198 let content: Vec<&str> = q
199 .split_whitespace()
200 .filter(|w| !stop.contains(*w))
201 .collect();
202 if content.len() >= 2 {
203 Some(format!("{} {}", content[0], content[1]))
204 } else {
205 None
206 }
207 }
208 let phrase_query = if let Some(p) = extract_key_phrase(&query_analysis.original_query) {
209 format!("\"{}\" {}", p, search_query)
210 } else {
211 search_query.to_string()
212 };
213
214 let official_query = if self.llm_client.is_some() {
216 match &query_analysis.search_strategy {
217 query_analyzer::SearchStrategy::FrameworkSpecific { framework, sites } => {
218 log::info!("🎯 Using LLM-enhanced framework search for {}", framework);
219 self.build_technical_search_query(&phrase_query, sites)
220 }
221 query_analyzer::SearchStrategy::OfficialDocsFirst { frameworks } => {
222 log::info!(
223 "📚 Using LLM-enhanced prioritized search for: {}",
224 frameworks.join(", ")
225 );
226 self.build_dev_focused_query(&phrase_query, frameworks)
227 }
228 _ => {
229 if self.is_technical_query(&query_analysis) {
230 log::info!("🔧 Using LLM-enhanced technical search");
231 self.build_dev_focused_query(&phrase_query, &[])
232 } else {
233 self.official_sources.build_official_query(&phrase_query)
234 }
235 }
236 }
237 } else {
238 log::debug!("Using standard search (no LLM configured)");
240 self.official_sources.build_official_query(&phrase_query)
241 };
242 let mut all_results = search_engine::search_duckduckgo(
243 &official_query,
244 self.config.max_results,
245 &self.config.user_agent,
246 self.config.search_timeout_seconds,
247 )
248 .await?;
249
250 let mut used_fallback = false;
251
252 let official_results_count = all_results
254 .iter()
255 .filter(|r| self.official_sources.is_official_domain(&r.source_domain))
256 .count();
257
258 if official_results_count < self.config.min_official_results {
260 log::info!(
261 "⚠️ Only {} official results found, expanding search...",
262 official_results_count
263 );
264 used_fallback = true;
265
266 let fallback_results = search_engine::search_duckduckgo(
268 &phrase_query,
269 self.config.max_results,
270 &self.config.user_agent,
271 self.config.search_timeout_seconds,
272 )
273 .await?;
274
275 for result in fallback_results {
277 if !all_results.iter().any(|r| r.url == result.url) {
278 all_results.push(result);
279 }
280 }
281 }
282
283 if all_results.is_empty() {
284 return Ok(DocumentationSearchResponse {
285 query: query.to_string(),
286 summary: "No relevant documentation found".to_string(),
287 results: vec![],
288 official_results_count: 0,
289 used_fallback: false,
290 total_found: 0,
291 search_time_ms: start_time.elapsed().as_millis() as u64,
292 sources: vec![],
293 used_llm_verification: false,
294 verification_passed: None,
295 });
296 }
297
298 let mut processed_results = if let Some(ref embedding_model) = self.embedding_model {
300 result_processor::process_with_embeddings_and_analysis(
301 &query_analysis,
302 &all_results,
303 embedding_model,
304 &self.official_sources,
305 self.config.similarity_threshold,
306 )
307 .await?
308 } else {
309 result_processor::process_without_embeddings(
310 query,
311 &all_results,
312 &self.official_sources,
313 )
314 };
315
316 if processed_results.is_empty() {
318 log::info!("No results after semantic filtering; retrying with text matching fallback");
319 processed_results = result_processor::process_without_embeddings(
320 query,
321 &all_results,
322 &self.official_sources,
323 );
324 }
325
326 result_processor::enhance_results(&mut processed_results, &self.official_sources);
328
329 for result in &processed_results {
331 let tier = self
332 .official_sources
333 .get_source_tier(&result.source_domain, &result.url);
334 log::debug!(
335 "Source: {} - Tier: {} - Score: {}",
336 result.source_domain,
337 self.official_sources.get_tier_description(&tier),
338 result.final_score
339 );
340 }
341
342 processed_results = result_processor::filter_non_technical_domains(
344 processed_results,
345 &query_analysis,
346 self.llm_client.is_some(),
347 );
348
349 processed_results = result_processor::filter_quality_results(processed_results, 20);
351
352 let mut processed_results = result_processor::deduplicate_results(processed_results);
354
355 if processed_results.is_empty() {
357 log::info!("No results after filtering; retrying with softer text-based processing");
358 let mut soft_results = result_processor::process_without_embeddings(
359 query,
360 &all_results,
361 &self.official_sources,
362 );
363 soft_results = result_processor::filter_quality_results(soft_results, 10);
365 processed_results = result_processor::deduplicate_results(soft_results);
366 }
367
368 let verification_result = if let Some(ref llm_client) = self.llm_client {
370 if llm_client.is_available() {
371 log::info!("Verifying results with LLM");
372 match llm_verifier::verify_search_results(query, &processed_results).await {
373 Ok(verification) => Some(verification),
374 Err(e) => {
375 log::warn!("LLM verification failed: {}", e);
376 None
377 }
378 }
379 } else {
380 None
381 }
382 } else {
383 None
384 };
385
386 let summary = self.generate_summary(query, &processed_results).await?;
388
389 let final_official_count = processed_results.iter().filter(|r| r.is_official).count();
391
392 let sources: Vec<String> = processed_results
393 .iter()
394 .map(|r| r.source_domain.clone())
395 .collect::<std::collections::HashSet<_>>()
396 .into_iter()
397 .collect();
398
399 let search_time = start_time.elapsed().as_millis() as u64;
400
401 Ok(DocumentationSearchResponse {
402 query: query.to_string(),
403 summary,
404 results: processed_results,
405 official_results_count: final_official_count,
406 used_fallback,
407 total_found: all_results.len(),
408 search_time_ms: search_time,
409 sources,
410 used_llm_verification: verification_result.is_some(),
411 verification_passed: verification_result.as_ref().map(|v| v.is_authentic),
412 })
413 }
414
415 async fn generate_summary(
417 &self,
418 query: &str,
419 results: &[ProcessedSearchResult],
420 ) -> Result<String> {
421 if results.is_empty() {
422 return Ok("No relevant documentation found".to_string());
423 }
424
425 if let Some(ref llm_client) = self.llm_client {
427 if llm_client.is_available() {
428 let _context = results
429 .iter()
430 .take(3) .map(|r| {
432 format!(
433 "Source: {} ({})\nContent: {}",
434 r.source_domain,
435 if r.is_official {
436 "Official"
437 } else {
438 "Community"
439 },
440 r.snippet
441 )
442 })
443 .collect::<Vec<_>>()
444 .join("\n\n");
445
446 let mock_results: Vec<crate::rag::RagSearchResult> = results
448 .iter()
449 .take(3)
450 .map(|r| crate::rag::RagSearchResult {
451 id: r.url.clone(),
452 content: r.snippet.clone(),
453 source_path: std::path::PathBuf::from(&r.url),
454 source_type: if r.is_official {
455 crate::rag::SourceType::Curated
456 } else {
457 crate::rag::SourceType::Remote
458 },
459 title: Some(r.title.clone()),
460 section: None,
461 score: r.final_score,
462 chunk_index: 0,
463 metadata: crate::rag::DocumentMetadata {
464 file_type: "web".to_string(),
465 size: r.snippet.len() as u64,
466 modified: r.timestamp.unwrap_or_else(chrono::Utc::now),
467 tags: vec!["documentation".to_string()],
468 language: Some("en".to_string()),
469 },
470 })
471 .collect();
472
473 match llm_client.synthesize_answer(query, &mock_results).await {
474 Ok(response) => return Ok(response.answer),
475 Err(e) => log::warn!("LLM summarization failed, using fallback: {}", e),
476 }
477 }
478 }
479
480 let official_count = results.iter().filter(|r| r.is_official).count();
482 let summary_prefix = if official_count > 0 {
483 format!("From {} official sources", official_count)
484 } else {
485 "From community sources".to_string()
486 };
487
488 let top_content = results
489 .iter()
490 .take(2)
491 .map(|r| r.snippet.split('.').next().unwrap_or(&r.snippet))
492 .collect::<Vec<_>>()
493 .join(". ");
494
495 Ok(format!("{}: {}", summary_prefix, top_content))
496 }
497
498 pub fn is_available(&self) -> bool {
500 self.config.enabled
501 }
502
503 pub fn config(&self) -> &WebSearchConfig {
505 &self.config
506 }
507
508 fn build_technical_search_query(&self, query: &str, framework_sites: &[String]) -> String {
510 let dev_domains = [
511 "github.com",
512 "stackoverflow.com",
513 "docs.rs",
514 "developer.mozilla.org",
515 "reactjs.org",
516 "nodejs.org",
517 "python.org",
518 "rust-lang.org",
519 "tauri.app",
520 "electronjs.org",
521 "dev.to",
522 "medium.com/@",
523 ];
524
525 let mut all_sites = framework_sites.to_vec();
527 all_sites.extend(dev_domains.iter().map(|s| s.to_string()));
528
529 all_sites.sort();
531 all_sites.dedup();
532
533 let site_filters: String = all_sites
535 .iter()
536 .map(|site| format!("site:{}", site))
537 .collect::<Vec<_>>()
538 .join(" OR ");
539
540 format!("({}) {}", site_filters, query)
541 }
542
543 fn build_dev_focused_query(&self, query: &str, frameworks: &[String]) -> String {
545 let mut dev_query = query.to_string();
546
547 for framework in frameworks {
549 if !dev_query.to_lowercase().contains(&framework.to_lowercase()) {
550 dev_query = format!("{} {}", framework, dev_query);
551 }
552 }
553
554 let tech_domains = [
556 "site:github.com",
557 "site:stackoverflow.com",
558 "site:docs.rs",
559 "site:developer.mozilla.org",
560 "site:dev.to",
561 ];
562
563 format!("({}) OR {}", tech_domains.join(" OR "), dev_query)
565 }
566
567 fn is_technical_query(&self, analysis: &query_analyzer::QueryAnalysis) -> bool {
569 !analysis.detected_frameworks.is_empty()
571 || analysis
572 .domain_context
573 .primary_domain
574 .contains("development")
575 || analysis
576 .domain_context
577 .primary_domain
578 .contains("programming")
579 || analysis.query_type == query_analyzer::QueryType::Reference
580 || analysis.original_query.to_lowercase().contains("api")
581 || analysis.original_query.to_lowercase().contains("code")
582 || analysis.original_query.to_lowercase().contains("library")
583 || analysis.original_query.to_lowercase().contains("function")
584 || analysis.original_query.to_lowercase().contains("method")
585 || analysis.original_query.to_lowercase().contains("component")
586 }
587}