1use anyhow::{anyhow, Result};
11use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13
14pub mod llm_verifier;
15pub mod official_sources;
16pub mod query_analyzer;
17pub mod result_processor;
18pub mod search_engine;
19
20#[derive(Debug, Clone, Serialize, Deserialize)]
22pub struct WebSearchConfig {
23 pub enabled: bool,
24 pub max_results: usize,
25 pub similarity_threshold: f32,
26 pub search_timeout_seconds: u64,
27 pub user_agent: String,
28 pub min_official_results: usize, }
30
31impl Default for WebSearchConfig {
32 fn default() -> Self {
33 Self {
34 enabled: true,
35 max_results: 8,
36 similarity_threshold: 0.6,
37 search_timeout_seconds: 10,
38 user_agent: "Manx/0.3.5 Documentation Finder (+https://github.com/neur0map/manx)"
39 .to_string(),
40 min_official_results: 3,
41 }
42 }
43}
44
/// A single, unprocessed hit as returned by the search engine.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawSearchResult {
    /// Page title reported by the engine.
    pub title: String,
    /// Full URL of the hit.
    pub url: String,
    /// Text excerpt accompanying the hit.
    pub snippet: String,
    /// Domain the result came from.
    pub source_domain: String,
    /// Timestamp for the result, when the engine provides one.
    pub timestamp: Option<DateTime<Utc>>,
}
54
55#[derive(Debug, Clone, Serialize, Deserialize)]
57pub struct ProcessedSearchResult {
58 pub title: String,
59 pub url: String,
60 pub snippet: String,
61 pub source_domain: String,
62 pub is_official: bool,
63 pub source_tier: u8, pub similarity_score: f32,
65 pub final_score: f32, pub timestamp: Option<DateTime<Utc>>,
67}
68
/// Final, caller-facing payload for one documentation search.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentationSearchResponse {
    /// The original query string as supplied by the caller.
    pub query: String,
    /// Human-readable summary (LLM-synthesized, or extractive fallback).
    pub summary: String,
    /// Ranked, filtered, deduplicated results.
    pub results: Vec<ProcessedSearchResult>,
    /// Number of results flagged as official documentation sources.
    pub official_results_count: usize,
    /// `true` when the broader fallback search had to be run.
    pub used_fallback: bool,
    /// Total raw results fetched before filtering/deduplication.
    pub total_found: usize,
    /// Wall-clock duration of the whole search, in milliseconds.
    pub search_time_ms: u64,
    /// Unique source domains represented in `results`.
    pub sources: Vec<String>,
    /// Whether LLM verification ran for this response.
    pub used_llm_verification: bool,
    /// The LLM authenticity verdict, when verification ran.
    pub verification_passed: Option<bool>,
}
83
84#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct VerificationResult {
87 pub is_authentic: bool,
88 pub confidence: f32,
89 pub reasoning: String,
90 pub suggested_refinement: Option<String>, }
92
/// Orchestrates the documentation-search pipeline: query analysis, web
/// search, result processing, and optional LLM verification/summaries.
pub struct DocumentationSearchSystem {
    /// Search behavior settings.
    config: WebSearchConfig,
    /// Optional semantic-embedding model; `None` falls back to text matching.
    embedding_model: Option<crate::rag::embeddings::EmbeddingModel>,
    /// Optional LLM client used for analysis, verification, and summaries.
    llm_client: Option<crate::rag::llm::LlmClient>,
    /// Knows official documentation domains and their quality tiers.
    official_sources: official_sources::OfficialSourceManager,
    /// Analyzes and enhances raw user queries.
    query_analyzer: query_analyzer::QueryAnalyzer,
}
101
102impl DocumentationSearchSystem {
103 pub async fn new(
105 config: WebSearchConfig,
106 llm_config: Option<crate::rag::llm::LlmConfig>,
107 ) -> Result<Self> {
108 if !config.enabled {
109 return Err(anyhow!("Documentation search is disabled"));
110 }
111
112 let embedding_model = match crate::rag::embeddings::EmbeddingModel::new().await {
114 Ok(model) => {
115 log::info!("Semantic embeddings initialized for search");
116 Some(model)
117 }
118 Err(e) => {
119 log::warn!(
120 "Semantic embeddings unavailable, using text matching: {}",
121 e
122 );
123 None
124 }
125 };
126
127 let llm_client = if let Some(llm_cfg) = llm_config {
129 match crate::rag::llm::LlmClient::new(llm_cfg) {
130 Ok(client) => {
131 log::info!("LLM client initialized for result verification");
132 Some(client)
133 }
134 Err(e) => {
135 log::warn!("LLM client unavailable: {}", e);
136 None
137 }
138 }
139 } else {
140 None
141 };
142
143 let official_sources = official_sources::OfficialSourceManager::new();
144 let query_analyzer = query_analyzer::QueryAnalyzer::new();
145
146 Ok(Self {
147 config,
148 embedding_model,
149 llm_client,
150 official_sources,
151 query_analyzer,
152 })
153 }
154
155 pub async fn search(&mut self, query: &str) -> Result<DocumentationSearchResponse> {
157 let start_time = std::time::Instant::now();
158
159 log::info!("🔍 Searching official documentation for: {}", query);
160
161 let query_analysis = self
163 .query_analyzer
164 .analyze_query(query, self.llm_client.as_ref())
165 .await?;
166 log::info!(
167 "🧠 Query analysis: {} -> {} (confidence: {:.1}%)",
168 query_analysis.original_query,
169 query_analysis.enhanced_query,
170 query_analysis.confidence * 100.0
171 );
172
173 let search_query = &query_analysis.enhanced_query;
175
176 let official_query = if self.llm_client.is_some() {
178 match &query_analysis.search_strategy {
179 query_analyzer::SearchStrategy::FrameworkSpecific { framework, sites } => {
180 log::info!("🎯 Using LLM-enhanced framework search for {}", framework);
181 self.build_technical_search_query(search_query, sites)
182 }
183 query_analyzer::SearchStrategy::OfficialDocsFirst { frameworks } => {
184 log::info!(
185 "📚 Using LLM-enhanced prioritized search for: {}",
186 frameworks.join(", ")
187 );
188 self.build_dev_focused_query(search_query, frameworks)
189 }
190 _ => {
191 if self.is_technical_query(&query_analysis) {
192 log::info!("🔧 Using LLM-enhanced technical search");
193 self.build_dev_focused_query(search_query, &[])
194 } else {
195 self.official_sources.build_official_query(search_query)
196 }
197 }
198 }
199 } else {
200 log::debug!("Using standard search (no LLM configured)");
202 self.official_sources.build_official_query(search_query)
203 };
204 let mut all_results = search_engine::search_duckduckgo(
205 &official_query,
206 self.config.max_results,
207 &self.config.user_agent,
208 self.config.search_timeout_seconds,
209 )
210 .await?;
211
212 let mut used_fallback = false;
213
214 let official_results_count = all_results
216 .iter()
217 .filter(|r| self.official_sources.is_official_domain(&r.source_domain))
218 .count();
219
220 if official_results_count < self.config.min_official_results {
222 log::info!(
223 "⚠️ Only {} official results found, expanding search...",
224 official_results_count
225 );
226 used_fallback = true;
227
228 let fallback_results = search_engine::search_duckduckgo(
230 query,
231 self.config.max_results,
232 &self.config.user_agent,
233 self.config.search_timeout_seconds,
234 )
235 .await?;
236
237 for result in fallback_results {
239 if !all_results.iter().any(|r| r.url == result.url) {
240 all_results.push(result);
241 }
242 }
243 }
244
245 if all_results.is_empty() {
246 return Ok(DocumentationSearchResponse {
247 query: query.to_string(),
248 summary: "No relevant documentation found".to_string(),
249 results: vec![],
250 official_results_count: 0,
251 used_fallback: false,
252 total_found: 0,
253 search_time_ms: start_time.elapsed().as_millis() as u64,
254 sources: vec![],
255 used_llm_verification: false,
256 verification_passed: None,
257 });
258 }
259
260 let mut processed_results = if let Some(ref embedding_model) = self.embedding_model {
262 result_processor::process_with_embeddings_and_analysis(
263 &query_analysis,
264 &all_results,
265 embedding_model,
266 &self.official_sources,
267 self.config.similarity_threshold,
268 )
269 .await?
270 } else {
271 result_processor::process_without_embeddings(
272 query,
273 &all_results,
274 &self.official_sources,
275 )
276 };
277
278 result_processor::enhance_results(&mut processed_results, &self.official_sources);
280
281 for result in &processed_results {
283 let tier = self
284 .official_sources
285 .get_source_tier(&result.source_domain, &result.url);
286 log::debug!(
287 "Source: {} - Tier: {} - Score: {}",
288 result.source_domain,
289 self.official_sources.get_tier_description(&tier),
290 result.final_score
291 );
292 }
293
294 processed_results = result_processor::filter_non_technical_domains(
296 processed_results,
297 &query_analysis,
298 self.llm_client.is_some(),
299 );
300
301 processed_results = result_processor::filter_quality_results(processed_results, 30);
303
304 let processed_results = result_processor::deduplicate_results(processed_results);
306
307 let verification_result = if let Some(ref llm_client) = self.llm_client {
309 if llm_client.is_available() {
310 log::info!("Verifying results with LLM");
311 match llm_verifier::verify_search_results(query, &processed_results, llm_client)
312 .await
313 {
314 Ok(verification) => Some(verification),
315 Err(e) => {
316 log::warn!("LLM verification failed: {}", e);
317 None
318 }
319 }
320 } else {
321 None
322 }
323 } else {
324 None
325 };
326
327 let summary = self.generate_summary(query, &processed_results).await?;
329
330 let final_official_count = processed_results.iter().filter(|r| r.is_official).count();
332
333 let sources: Vec<String> = processed_results
334 .iter()
335 .map(|r| r.source_domain.clone())
336 .collect::<std::collections::HashSet<_>>()
337 .into_iter()
338 .collect();
339
340 let search_time = start_time.elapsed().as_millis() as u64;
341
342 Ok(DocumentationSearchResponse {
343 query: query.to_string(),
344 summary,
345 results: processed_results,
346 official_results_count: final_official_count,
347 used_fallback,
348 total_found: all_results.len(),
349 search_time_ms: search_time,
350 sources,
351 used_llm_verification: verification_result.is_some(),
352 verification_passed: verification_result.as_ref().map(|v| v.is_authentic),
353 })
354 }
355
356 async fn generate_summary(
358 &self,
359 query: &str,
360 results: &[ProcessedSearchResult],
361 ) -> Result<String> {
362 if results.is_empty() {
363 return Ok("No relevant documentation found".to_string());
364 }
365
366 if let Some(ref llm_client) = self.llm_client {
368 if llm_client.is_available() {
369 let _context = results
370 .iter()
371 .take(3) .map(|r| {
373 format!(
374 "Source: {} ({})\nContent: {}",
375 r.source_domain,
376 if r.is_official {
377 "Official"
378 } else {
379 "Community"
380 },
381 r.snippet
382 )
383 })
384 .collect::<Vec<_>>()
385 .join("\n\n");
386
387 let mock_results: Vec<crate::rag::RagSearchResult> = results
389 .iter()
390 .take(3)
391 .map(|r| crate::rag::RagSearchResult {
392 id: r.url.clone(),
393 content: r.snippet.clone(),
394 source_path: std::path::PathBuf::from(&r.url),
395 source_type: if r.is_official {
396 crate::rag::SourceType::Curated
397 } else {
398 crate::rag::SourceType::Remote
399 },
400 title: Some(r.title.clone()),
401 section: None,
402 score: r.final_score,
403 chunk_index: 0,
404 metadata: crate::rag::DocumentMetadata {
405 file_type: "web".to_string(),
406 size: r.snippet.len() as u64,
407 modified: r.timestamp.unwrap_or_else(chrono::Utc::now),
408 tags: vec!["documentation".to_string()],
409 language: Some("en".to_string()),
410 },
411 })
412 .collect();
413
414 match llm_client.synthesize_answer(query, &mock_results).await {
415 Ok(response) => return Ok(response.answer),
416 Err(e) => log::warn!("LLM summarization failed, using fallback: {}", e),
417 }
418 }
419 }
420
421 let official_count = results.iter().filter(|r| r.is_official).count();
423 let summary_prefix = if official_count > 0 {
424 format!("From {} official sources", official_count)
425 } else {
426 "From community sources".to_string()
427 };
428
429 let top_content = results
430 .iter()
431 .take(2)
432 .map(|r| r.snippet.split('.').next().unwrap_or(&r.snippet))
433 .collect::<Vec<_>>()
434 .join(". ");
435
436 Ok(format!("{}: {}", summary_prefix, top_content))
437 }
438
    /// Reports whether documentation search is enabled in the configuration.
    pub fn is_available(&self) -> bool {
        self.config.enabled
    }
443
    /// Read-only access to the active search configuration.
    pub fn config(&self) -> &WebSearchConfig {
        &self.config
    }
448
449 fn build_technical_search_query(&self, query: &str, framework_sites: &[String]) -> String {
451 let dev_domains = [
452 "github.com",
453 "stackoverflow.com",
454 "docs.rs",
455 "developer.mozilla.org",
456 "reactjs.org",
457 "nodejs.org",
458 "python.org",
459 "rust-lang.org",
460 "tauri.app",
461 "electronjs.org",
462 "dev.to",
463 "medium.com/@",
464 ];
465
466 let mut all_sites = framework_sites.to_vec();
468 all_sites.extend(dev_domains.iter().map(|s| s.to_string()));
469
470 all_sites.sort();
472 all_sites.dedup();
473
474 let site_filters: String = all_sites
476 .iter()
477 .map(|site| format!("site:{}", site))
478 .collect::<Vec<_>>()
479 .join(" OR ");
480
481 format!("({}) {}", site_filters, query)
482 }
483
484 fn build_dev_focused_query(&self, query: &str, frameworks: &[String]) -> String {
486 let mut dev_query = query.to_string();
487
488 for framework in frameworks {
490 if !dev_query.to_lowercase().contains(&framework.to_lowercase()) {
491 dev_query = format!("{} {}", framework, dev_query);
492 }
493 }
494
495 let tech_domains = [
497 "site:github.com",
498 "site:stackoverflow.com",
499 "site:docs.rs",
500 "site:developer.mozilla.org",
501 "site:dev.to",
502 ];
503
504 format!("({}) OR {}", tech_domains.join(" OR "), dev_query)
506 }
507
508 fn is_technical_query(&self, analysis: &query_analyzer::QueryAnalysis) -> bool {
510 !analysis.detected_frameworks.is_empty()
512 || analysis
513 .domain_context
514 .primary_domain
515 .contains("development")
516 || analysis
517 .domain_context
518 .primary_domain
519 .contains("programming")
520 || analysis.query_type == query_analyzer::QueryType::Reference
521 || analysis.original_query.to_lowercase().contains("api")
522 || analysis.original_query.to_lowercase().contains("code")
523 || analysis.original_query.to_lowercase().contains("library")
524 || analysis.original_query.to_lowercase().contains("function")
525 || analysis.original_query.to_lowercase().contains("method")
526 || analysis.original_query.to_lowercase().contains("component")
527 }
528}