1use anyhow::{anyhow, Result};
11use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13
14pub mod llm_verifier;
15pub mod official_sources;
16pub mod result_processor;
17pub mod search_engine;
18
19#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct WebSearchConfig {
22 pub enabled: bool,
23 pub max_results: usize,
24 pub similarity_threshold: f32,
25 pub search_timeout_seconds: u64,
26 pub user_agent: String,
27 pub min_official_results: usize, }
29
30impl Default for WebSearchConfig {
31 fn default() -> Self {
32 Self {
33 enabled: true,
34 max_results: 8,
35 similarity_threshold: 0.6,
36 search_timeout_seconds: 10,
37 user_agent: "Manx/0.3.5 Documentation Finder (+https://github.com/neur0map/manx)"
38 .to_string(),
39 min_official_results: 3,
40 }
41 }
42}
43
/// A single result as returned by the search engine, before any scoring,
/// ranking, or official-source classification has been applied.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawSearchResult {
    pub title: String,
    pub url: String,
    pub snippet: String,
    // Domain portion of `url`; used to decide whether the source is official.
    pub source_domain: String,
    // Timestamp attached by the engine when available — presumably the
    // publication/crawl time; verify against search_engine.
    pub timestamp: Option<DateTime<Utc>>,
}
53
54#[derive(Debug, Clone, Serialize, Deserialize)]
56pub struct ProcessedSearchResult {
57 pub title: String,
58 pub url: String,
59 pub snippet: String,
60 pub source_domain: String,
61 pub is_official: bool,
62 pub source_tier: u8, pub similarity_score: f32,
64 pub final_score: f32, pub timestamp: Option<DateTime<Utc>>,
66}
67
/// Complete outcome of one documentation search, including the ranked
/// results, a generated summary, and bookkeeping about how the search ran.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentationSearchResponse {
    // The original query string as given by the caller.
    pub query: String,
    // LLM-synthesized or extractive summary of the top results.
    pub summary: String,
    pub results: Vec<ProcessedSearchResult>,
    // Count of results flagged `is_official` after processing.
    pub official_results_count: usize,
    // True when a second, broader engine query was issued.
    pub used_fallback: bool,
    // Number of raw results gathered before filtering/deduplication.
    pub total_found: usize,
    pub search_time_ms: u64,
    // Unique source domains present in `results` (unordered).
    pub sources: Vec<String>,
    // True when an LLM verification pass was attempted and succeeded.
    pub used_llm_verification: bool,
    // LLM authenticity verdict; None when verification did not run.
    pub verification_passed: Option<bool>,
}
82
83#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct VerificationResult {
86 pub is_authentic: bool,
87 pub confidence: f32,
88 pub reasoning: String,
89 pub suggested_refinement: Option<String>, }
91
/// Orchestrates documentation search: query building, engine calls,
/// result processing, optional LLM verification, and summary generation.
pub struct DocumentationSearchSystem {
    config: WebSearchConfig,
    // Optional semantic embeddings; None means plain text matching is used.
    embedding_model: Option<crate::rag::embeddings::EmbeddingModel>,
    // Optional LLM used for result verification and answer synthesis.
    llm_client: Option<crate::rag::llm::LlmClient>,
    official_sources: official_sources::OfficialSourceManager,
}
99
100impl DocumentationSearchSystem {
101 pub async fn new(
103 config: WebSearchConfig,
104 llm_config: Option<crate::rag::llm::LlmConfig>,
105 ) -> Result<Self> {
106 if !config.enabled {
107 return Err(anyhow!("Documentation search is disabled"));
108 }
109
110 let embedding_model = match crate::rag::embeddings::EmbeddingModel::new().await {
112 Ok(model) => {
113 log::info!("Semantic embeddings initialized for search");
114 Some(model)
115 }
116 Err(e) => {
117 log::warn!(
118 "Semantic embeddings unavailable, using text matching: {}",
119 e
120 );
121 None
122 }
123 };
124
125 let llm_client = if let Some(llm_cfg) = llm_config {
127 match crate::rag::llm::LlmClient::new(llm_cfg) {
128 Ok(client) => {
129 log::info!("LLM client initialized for result verification");
130 Some(client)
131 }
132 Err(e) => {
133 log::warn!("LLM client unavailable: {}", e);
134 None
135 }
136 }
137 } else {
138 None
139 };
140
141 let official_sources = official_sources::OfficialSourceManager::new();
142
143 Ok(Self {
144 config,
145 embedding_model,
146 llm_client,
147 official_sources,
148 })
149 }
150
    /// Runs the end-to-end documentation search pipeline for `query`.
    ///
    /// Pipeline: official-source-biased engine query → optional broader
    /// fallback query → semantic or text-based scoring → quality filtering
    /// and URL deduplication → optional LLM verification → summary.
    ///
    /// # Errors
    /// Propagates failures from the search engine, embedding-based
    /// processing, or summary generation.
    pub async fn search(&mut self, query: &str) -> Result<DocumentationSearchResponse> {
        let start_time = std::time::Instant::now();

        log::info!("🔍 Searching official documentation for: {}", query);

        // First pass: bias the engine query toward official documentation
        // domains via the official-source manager.
        let official_query = self.official_sources.build_official_query(query);
        let mut all_results = search_engine::search_duckduckgo(
            &official_query,
            self.config.max_results,
            &self.config.user_agent,
            self.config.search_timeout_seconds,
        )
        .await?;

        let mut used_fallback = false;

        let official_results_count = all_results
            .iter()
            .filter(|r| self.official_sources.is_official_domain(&r.source_domain))
            .count();

        // Second pass with the raw query when the official-biased search
        // came up short; new hits are merged in, deduplicated by URL.
        if official_results_count < self.config.min_official_results {
            log::info!(
                "⚠️ Only {} official results found, expanding search...",
                official_results_count
            );
            used_fallback = true;

            let fallback_results = search_engine::search_duckduckgo(
                query,
                self.config.max_results,
                &self.config.user_agent,
                self.config.search_timeout_seconds,
            )
            .await?;

            for result in fallback_results {
                if !all_results.iter().any(|r| r.url == result.url) {
                    all_results.push(result);
                }
            }
        }

        // Nothing found at all: return an empty but well-formed response.
        if all_results.is_empty() {
            return Ok(DocumentationSearchResponse {
                query: query.to_string(),
                summary: "No relevant documentation found".to_string(),
                results: vec![],
                official_results_count: 0,
                used_fallback: false,
                total_found: 0,
                search_time_ms: start_time.elapsed().as_millis() as u64,
                sources: vec![],
                used_llm_verification: false,
                verification_passed: None,
            });
        }

        // Score and classify results, semantically when embeddings are
        // available, otherwise via plain text matching.
        let mut processed_results = if let Some(ref embedding_model) = self.embedding_model {
            result_processor::process_with_embeddings(
                query,
                &all_results,
                embedding_model,
                &self.official_sources,
                self.config.similarity_threshold,
            )
            .await?
        } else {
            result_processor::process_without_embeddings(
                query,
                &all_results,
                &self.official_sources,
            )
        };

        result_processor::enhance_results(&mut processed_results, &self.official_sources);

        // Debug-log the tier/score assigned to each source.
        for result in &processed_results {
            let tier = self
                .official_sources
                .get_source_tier(&result.source_domain, &result.url);
            log::debug!(
                "Source: {} - Tier: {} - Score: {}",
                result.source_domain,
                self.official_sources.get_tier_description(&tier),
                result.final_score
            );
        }

        // 30 appears to be a minimum quality score threshold — confirm
        // against result_processor::filter_quality_results.
        processed_results = result_processor::filter_quality_results(processed_results, 30);

        let processed_results = result_processor::deduplicate_results(processed_results);

        // Best-effort LLM verification: failures are logged, never fatal.
        let verification_result = if let Some(ref llm_client) = self.llm_client {
            if llm_client.is_available() {
                log::info!("Verifying results with LLM");
                match llm_verifier::verify_search_results(query, &processed_results, llm_client)
                    .await
                {
                    Ok(verification) => Some(verification),
                    Err(e) => {
                        log::warn!("LLM verification failed: {}", e);
                        None
                    }
                }
            } else {
                None
            }
        } else {
            None
        };

        let summary = self.generate_summary(query, &processed_results).await?;

        let final_official_count = processed_results.iter().filter(|r| r.is_official).count();

        // Unique source domains (HashSet round-trip; order is unspecified).
        let sources: Vec<String> = processed_results
            .iter()
            .map(|r| r.source_domain.clone())
            .collect::<std::collections::HashSet<_>>()
            .into_iter()
            .collect();

        let search_time = start_time.elapsed().as_millis() as u64;

        Ok(DocumentationSearchResponse {
            query: query.to_string(),
            summary,
            results: processed_results,
            official_results_count: final_official_count,
            used_fallback,
            // Raw count before filtering/deduplication, by design.
            total_found: all_results.len(),
            search_time_ms: search_time,
            sources,
            used_llm_verification: verification_result.is_some(),
            verification_passed: verification_result.as_ref().map(|v| v.is_authentic),
        })
    }
303
304 async fn generate_summary(
306 &self,
307 query: &str,
308 results: &[ProcessedSearchResult],
309 ) -> Result<String> {
310 if results.is_empty() {
311 return Ok("No relevant documentation found".to_string());
312 }
313
314 if let Some(ref llm_client) = self.llm_client {
316 if llm_client.is_available() {
317 let _context = results
318 .iter()
319 .take(3) .map(|r| {
321 format!(
322 "Source: {} ({})\nContent: {}",
323 r.source_domain,
324 if r.is_official {
325 "Official"
326 } else {
327 "Community"
328 },
329 r.snippet
330 )
331 })
332 .collect::<Vec<_>>()
333 .join("\n\n");
334
335 let mock_results: Vec<crate::rag::RagSearchResult> = results
337 .iter()
338 .take(3)
339 .map(|r| crate::rag::RagSearchResult {
340 id: r.url.clone(),
341 content: r.snippet.clone(),
342 source_path: std::path::PathBuf::from(&r.url),
343 source_type: if r.is_official {
344 crate::rag::SourceType::Curated
345 } else {
346 crate::rag::SourceType::Remote
347 },
348 title: Some(r.title.clone()),
349 section: None,
350 score: r.final_score,
351 metadata: crate::rag::DocumentMetadata {
352 file_type: "web".to_string(),
353 size: r.snippet.len() as u64,
354 modified: r.timestamp.unwrap_or_else(chrono::Utc::now),
355 tags: vec!["documentation".to_string()],
356 language: Some("en".to_string()),
357 },
358 })
359 .collect();
360
361 match llm_client.synthesize_answer(query, &mock_results).await {
362 Ok(response) => return Ok(response.answer),
363 Err(e) => log::warn!("LLM summarization failed, using fallback: {}", e),
364 }
365 }
366 }
367
368 let official_count = results.iter().filter(|r| r.is_official).count();
370 let summary_prefix = if official_count > 0 {
371 format!("From {} official sources", official_count)
372 } else {
373 "From community sources".to_string()
374 };
375
376 let top_content = results
377 .iter()
378 .take(2)
379 .map(|r| r.snippet.split('.').next().unwrap_or(&r.snippet))
380 .collect::<Vec<_>>()
381 .join(". ");
382
383 Ok(format!("{}: {}", summary_prefix, top_content))
384 }
385
    /// Reports whether documentation search is enabled in the configuration.
    pub fn is_available(&self) -> bool {
        self.config.enabled
    }
390
    /// Read-only access to the active search configuration.
    pub fn config(&self) -> &WebSearchConfig {
        &self.config
    }
395}