1use crate::rag::embeddings::EmbeddingModel;
9use crate::web_search::official_sources::{OfficialSourceManager, SourceTier};
10use crate::web_search::{query_analyzer, ProcessedSearchResult, RawSearchResult};
11use anyhow::Result;
12use std::collections::HashSet;
13
/// Extracts a short key phrase from a search query, used to boost/penalize
/// results that do or don't contain it.
///
/// Preference order:
/// 1. An explicitly double-quoted phrase (`"like this"`), lowercased and trimmed.
/// 2. The first two non-stopword words of the lowercased query, space-joined.
///
/// Returns `None` when neither yields a usable phrase (e.g. the query is all
/// stopwords or has fewer than two content words).
fn extract_key_phrase(query: &str) -> Option<String> {
    let q = query.to_lowercase();

    // Prefer an explicitly quoted phrase when present and non-empty.
    if let Some(start) = q.find('"') {
        if let Some(end_rel) = q[start + 1..].find('"') {
            let end = start + 1 + end_rel;
            let phrase = q[start + 1..end].trim();
            if !phrase.is_empty() {
                return Some(phrase.to_string());
            }
        }
    }

    // Fallback: first two content (non-stopword) words. Uses the `HashSet`
    // imported at the top of the file, consistent with
    // `process_without_embeddings`, and stops after two words instead of
    // collecting the whole query.
    let stopwords: HashSet<&str> = [
        "a", "an", "and", "the", "in", "on", "of", "to", "for", "how", "do", "i", "with", "using",
        "is", "are", "be", "this", "that", "it", "from", "by", "into", "as", "about", "write",
    ]
    .into_iter()
    .collect();
    let mut content = q.split_whitespace().filter(|w| !stopwords.contains(*w));
    match (content.next(), content.next()) {
        (Some(first), Some(second)) => Some(format!("{} {}", first, second)),
        _ => None,
    }
}
43
44pub async fn process_with_embeddings_and_analysis(
46 query_analysis: &query_analyzer::QueryAnalysis,
47 raw_results: &[RawSearchResult],
48 embedding_model: &EmbeddingModel,
49 official_sources: &OfficialSourceManager,
50 similarity_threshold: f32,
51) -> Result<Vec<ProcessedSearchResult>> {
52 log::info!(
53 "Processing {} results with semantic embeddings + query analysis (framework: {:?})",
54 raw_results.len(),
55 query_analysis.detected_frameworks.first().map(|f| &f.name)
56 );
57
58 let embedding_query = &query_analysis.enhanced_query;
60 let query_embedding = embedding_model.embed_text(embedding_query).await?;
61 let key_phrase = extract_key_phrase(&query_analysis.original_query);
62
63 let mut processed_results = Vec::new();
64
65 for (index, result) in raw_results.iter().enumerate() {
66 let mut combined_text = format!("{} {}", result.title, result.snippet);
68
69 for framework in &query_analysis.detected_frameworks {
71 combined_text.push_str(&format!(" {}", framework.name));
72 }
73
74 for keyword in &query_analysis.domain_context.context_keywords {
76 combined_text.push_str(&format!(" {}", keyword));
77 }
78
79 let result_embedding = match embedding_model.embed_text(&combined_text).await {
81 Ok(embedding) => embedding,
82 Err(e) => {
83 log::warn!("Failed to embed result {}: {}", index, e);
84 continue;
85 }
86 };
87
88 let mut similarity_score =
90 EmbeddingModel::cosine_similarity(&query_embedding, &result_embedding);
91
92 if let Some(ref phrase) = key_phrase {
94 let haystack = combined_text.to_lowercase();
95 if haystack.contains(phrase) {
96 similarity_score *= 1.2; } else if matches!(
98 query_analysis.query_type,
99 query_analyzer::QueryType::Example | query_analyzer::QueryType::HowTo
100 ) {
101 similarity_score *= 0.85; }
103 similarity_score = similarity_score.min(1.0);
104 }
105
106 let adjusted_threshold = if !query_analysis.detected_frameworks.is_empty() {
108 similarity_threshold * 0.8
110 } else {
111 similarity_threshold
112 };
113
114 if similarity_score < adjusted_threshold {
116 log::debug!(
117 "Filtering out result with low similarity: {} (score: {:.3}, threshold: {:.3})",
118 result.title,
119 similarity_score,
120 adjusted_threshold
121 );
122 continue;
123 }
124
125 let source_tier = official_sources.get_source_tier(&result.source_domain, &result.url);
127 let is_official = matches!(
128 source_tier,
129 SourceTier::OfficialDocs | SourceTier::OfficialRepos
130 );
131
132 let mut source_boost = official_sources.get_score_boost(&source_tier);
134
135 for framework in &query_analysis.detected_frameworks {
137 if framework
138 .official_sites
139 .iter()
140 .any(|site| result.source_domain.contains(site))
141 {
142 source_boost *= 1.5; log::debug!("Applied framework domain boost for {}", framework.name);
144 }
145 }
146
147 let type_boost = match query_analysis.query_type {
149 query_analyzer::QueryType::Reference => 1.2, query_analyzer::QueryType::Example => 1.1, query_analyzer::QueryType::Troubleshoot => 0.9, _ => 1.0,
153 };
154
155 let final_score = similarity_score * source_boost * type_boost;
157
158 processed_results.push(ProcessedSearchResult {
159 title: result.title.clone(),
160 url: result.url.clone(),
161 snippet: result.snippet.clone(),
162 source_domain: result.source_domain.clone(),
163 is_official,
164 source_tier: source_tier as u8,
165 similarity_score,
166 final_score,
167 timestamp: result.timestamp,
168 });
169
170 log::debug!(
171 "Enhanced result: {} | Similarity: {:.3} | Source boost: {:.1}x | Type boost: {:.1}x | Final: {:.3}",
172 result.source_domain,
173 similarity_score,
174 source_boost,
175 type_boost,
176 final_score
177 );
178 }
179
180 processed_results.sort_by(|a, b| b.final_score.partial_cmp(&a.final_score).unwrap());
182
183 log::info!(
184 "Enhanced processing: {} relevant results (filtered {} below threshold)",
185 processed_results.len(),
186 raw_results.len() - processed_results.len()
187 );
188
189 Ok(processed_results)
190}
191
192pub fn filter_non_technical_domains(
194 results: Vec<ProcessedSearchResult>,
195 query_analysis: &query_analyzer::QueryAnalysis,
196 has_llm: bool,
197) -> Vec<ProcessedSearchResult> {
198 if !has_llm || query_analysis.detected_frameworks.is_empty() {
199 return results;
201 }
202
203 let non_technical_domains = vec![
205 "amazon.com",
206 "ebay.com",
207 "etsy.com",
208 "walmart.com",
209 "target.com",
210 "houzz.com",
211 "wayfair.com",
212 "overstock.com",
213 "perigold.com",
214 "safavieh.com",
215 "furniture.com",
216 "shopping.com",
217 "bestbuy.com",
218 "lowes.com",
219 "homedepot.com",
220 ];
221
222 let original_count = results.len();
223 let filtered_results: Vec<ProcessedSearchResult> = results
224 .into_iter()
225 .filter(|result| {
226 let domain_lower = result.source_domain.to_lowercase();
227
228 let is_non_technical = non_technical_domains
230 .iter()
231 .any(|nt_domain| domain_lower.contains(nt_domain));
232
233 if is_non_technical {
234 log::debug!(
235 "LLM filter: Removing non-technical result: {} from {}",
236 result.title,
237 result.source_domain
238 );
239 false
240 } else {
241 true
242 }
243 })
244 .collect();
245
246 let filtered_count = original_count - filtered_results.len();
247 if filtered_count > 0 {
248 log::info!(
249 "🧠LLM-enhanced filtering: Removed {} non-technical results (e.g., shopping, furniture)",
250 filtered_count
251 );
252 }
253
254 filtered_results
255}
256
257pub fn process_without_embeddings(
259 query: &str,
260 raw_results: &[RawSearchResult],
261 official_sources: &OfficialSourceManager,
262) -> Vec<ProcessedSearchResult> {
263 log::info!(
264 "Processing {} results with text matching (no embeddings)",
265 raw_results.len()
266 );
267
268 let query_lower = query.to_lowercase();
269 let stopwords: HashSet<&str> = [
271 "a", "an", "and", "the", "in", "on", "of", "to", "for", "how", "do", "i", "with", "using",
272 "is", "are", "be", "this", "that", "it", "from", "by", "into", "as",
273 ]
274 .into_iter()
275 .collect();
276
277 let mut query_words: Vec<&str> = query_lower
278 .split_whitespace()
279 .filter(|w| !stopwords.contains(*w))
280 .collect();
281 query_words.dedup();
282 let key_phrase = extract_key_phrase(query);
283 let mut processed_results = Vec::new();
284
285 for result in raw_results {
286 let combined_text = format!("{} {}", result.title, result.snippet).to_lowercase();
288
289 let word_matches = query_words
291 .iter()
292 .filter(|word| combined_text.contains(*word))
293 .count();
294
295 let mut similarity_score = if query_words.is_empty() {
297 0.3 } else {
299 word_matches as f32 / query_words.len() as f32
300 };
301
302 if let Some(ref phrase) = key_phrase {
304 if combined_text.contains(phrase) {
305 similarity_score = (similarity_score + 0.2).min(1.0);
306 } else {
307 similarity_score = (similarity_score - 0.1).max(0.0);
308 }
309 }
310
311 if similarity_score < 0.3 {
313 continue;
314 }
315
316 let source_tier = official_sources.get_source_tier(&result.source_domain, &result.url);
318 let is_official = matches!(
319 source_tier,
320 SourceTier::OfficialDocs | SourceTier::OfficialRepos
321 );
322
323 let source_boost = official_sources.get_score_boost(&source_tier);
325
326 let final_score = similarity_score * source_boost;
328
329 processed_results.push(ProcessedSearchResult {
330 title: result.title.clone(),
331 url: result.url.clone(),
332 snippet: result.snippet.clone(),
333 source_domain: result.source_domain.clone(),
334 is_official,
335 source_tier: source_tier as u8,
336 similarity_score,
337 final_score,
338 timestamp: result.timestamp,
339 });
340 }
341
342 processed_results.sort_by(|a, b| b.final_score.partial_cmp(&a.final_score).unwrap());
344
345 log::info!(
346 "Processed {} results with text matching",
347 processed_results.len()
348 );
349 processed_results
350}
351
/// Per-result URL-classification hook.
///
/// NOTE(review): every branch below is empty, so this function is currently
/// a no-op — it distinguishes docs / API / tutorial-guide URLs but applies
/// no adjustment. Presumably boost logic was removed or is yet to be added;
/// confirm intent before deleting.
pub fn enhance_results(
    processed_results: &mut [ProcessedSearchResult],
    _official_sources: &OfficialSourceManager,
) {
    for result in processed_results.iter_mut() {
        if result.url.contains("/docs/") || result.url.contains("/documentation/") {
            // documentation-style URL — no action currently taken
        } else if result.url.contains("/api/") {
            // API-reference URL — no action currently taken
        } else if result.url.contains("/tutorial") || result.url.contains("/guide") {
            // tutorial/guide URL — no action currently taken
        }

    }
}
371
372pub fn filter_quality_results(
374 processed_results: Vec<ProcessedSearchResult>,
375 min_snippet_length: usize,
376) -> Vec<ProcessedSearchResult> {
377 processed_results
378 .into_iter()
379 .filter(|result| {
380 if result.snippet.len() < min_snippet_length {
382 log::debug!("Filtering short snippet: {}", result.title);
383 return false;
384 }
385
386 let snippet_lower = result.snippet.to_lowercase();
388 if snippet_lower.contains("lorem ipsum")
389 || snippet_lower.contains("click here for more")
390 || snippet_lower.contains("subscribe now")
391 {
392 log::debug!("Filtering low-quality content: {}", result.title);
393 return false;
394 }
395
396 if result.snippet.matches("http").count() > 3 {
398 log::debug!("Filtering link-heavy content: {}", result.title);
399 return false;
400 }
401
402 true
403 })
404 .collect()
405}
406
407pub fn deduplicate_results(
409 mut processed_results: Vec<ProcessedSearchResult>,
410) -> Vec<ProcessedSearchResult> {
411 processed_results.sort_by(|a, b| {
413 let domain_cmp = a.source_domain.cmp(&b.source_domain);
414 if domain_cmp == std::cmp::Ordering::Equal {
415 a.title.cmp(&b.title)
416 } else {
417 domain_cmp
418 }
419 });
420
421 let mut unique_results = Vec::new();
422 let mut last_domain = String::new();
423 let mut last_title_words = Vec::new();
424
425 let result_count = processed_results.len();
426 for result in &processed_results {
427 let current_title_words: Vec<&str> = result.title.split_whitespace().take(5).collect();
428
429 let is_duplicate = result.source_domain == last_domain
431 && title_similarity(¤t_title_words, &last_title_words) > 0.8;
432
433 if !is_duplicate {
434 unique_results.push(result.clone());
435 } else {
436 log::debug!(
437 "Removing duplicate: {} from {}",
438 result.title,
439 result.source_domain
440 );
441 }
442
443 last_domain = result.source_domain.clone();
444 last_title_words = current_title_words
445 .into_iter()
446 .map(|s| s.to_string())
447 .collect();
448 }
449
450 unique_results.sort_by(|a, b| b.final_score.partial_cmp(&a.final_score).unwrap());
452
453 log::info!(
454 "Deduplicated results: {} -> {}",
455 result_count,
456 unique_results.len()
457 );
458
459 unique_results
460}
461
/// Case-insensitive word-overlap ratio between two titles' word lists,
/// normalized by the length of the longer list.
///
/// Returns 0.0 when either list is empty.
fn title_similarity(words1: &[&str], words2: &[String]) -> f32 {
    if words1.is_empty() || words2.is_empty() {
        return 0.0;
    }

    // Count words from the first list that appear (case-insensitively) in
    // the second; lowercase each left-hand word once up front.
    let mut matches = 0usize;
    for w1 in words1 {
        let w1_lower = w1.to_lowercase();
        if words2.iter().any(|w2| w2.to_lowercase() == w1_lower) {
            matches += 1;
        }
    }

    let denominator = words1.len().max(words2.len());
    matches as f32 / denominator as f32
}
479
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    // Verifies the non-embedding path keeps both results, marks the
    // docs.python.org result official, and ranks it first via source boost.
    #[test]
    fn test_process_without_embeddings() {
        let official_sources = OfficialSourceManager::new();

        let raw_results = vec![
            RawSearchResult {
                title: "Python Documentation".to_string(),
                url: "https://docs.python.org/3/".to_string(),
                snippet: "Python programming language documentation".to_string(),
                source_domain: "docs.python.org".to_string(),
                timestamp: Some(Utc::now()),
            },
            RawSearchResult {
                title: "Random Blog".to_string(),
                url: "https://random-blog.com/python".to_string(),
                snippet: "Some random python content".to_string(),
                source_domain: "random-blog.com".to_string(),
                timestamp: Some(Utc::now()),
            },
        ];

        let results = process_without_embeddings("python", &raw_results, &official_sources);

        assert_eq!(results.len(), 2);
        assert!(results[0].is_official); assert!(results[0].final_score > results[1].final_score);
    }

    // Verifies that a snippet below the minimum length is filtered out
    // while a sufficiently long one survives.
    #[test]
    fn test_filter_quality_results() {
        let results = vec![
            ProcessedSearchResult {
                title: "Good Result".to_string(),
                url: "https://example.com".to_string(),
                snippet: "This is a good quality result with sufficient content".to_string(),
                source_domain: "example.com".to_string(),
                is_official: false,
                source_tier: 4,
                similarity_score: 0.8,
                final_score: 0.8,
                timestamp: Some(Utc::now()),
            },
            ProcessedSearchResult {
                title: "Short Result".to_string(),
                url: "https://short.com".to_string(),
                snippet: "Too short".to_string(),
                source_domain: "short.com".to_string(),
                is_official: false,
                source_tier: 4,
                similarity_score: 0.5,
                final_score: 0.5,
                timestamp: Some(Utc::now()),
            },
        ];

        let filtered = filter_quality_results(results, 20);
        assert_eq!(filtered.len(), 1);
        assert_eq!(filtered[0].title, "Good Result");
    }

    // Sanity check: one of three words overlaps ("Python"), so the
    // similarity is strictly between 0 and 1.
    #[test]
    fn test_title_similarity() {
        let words1 = vec!["Python", "Documentation", "Guide"];
        let words2 = vec![
            "Python".to_string(),
            "Docs".to_string(),
            "Tutorial".to_string(),
        ];

        let similarity = title_similarity(&words1, &words2);
        assert!(similarity > 0.0 && similarity <= 1.0);
    }
}