Skip to main content

manx_cli/web_search/
result_processor.rs

//! Result processing with semantic embeddings and official source ranking
//!
//! This module processes raw search results by:
//! - Using semantic embeddings for similarity scoring
//! - Applying official source priority boosts
//! - Ranking results by combined relevance + authority scores
7
8use crate::rag::embeddings::EmbeddingModel;
9use crate::web_search::official_sources::{OfficialSourceManager, SourceTier};
10use crate::web_search::{query_analyzer, ProcessedSearchResult, RawSearchResult};
11use anyhow::Result;
12use std::collections::HashSet;
13
/// Derive a short "key phrase" from a search query.
///
/// The query is lowercased first. If it contains a pair of double quotes
/// around non-blank text, that text (trimmed) is returned. Otherwise the first
/// two words that are not stopwords are joined with a single space. Returns
/// `None` when fewer than two such words remain.
fn extract_key_phrase(query: &str) -> Option<String> {
    const STOPWORDS: &[&str] = &[
        "a", "an", "and", "the", "in", "on", "of", "to", "for", "how", "do", "i", "with", "using",
        "is", "are", "be", "this", "that", "it", "from", "by", "into", "as", "about", "write",
    ];

    let lowered = query.to_lowercase();

    // Quoted phrase support: only the first pair of quotes is considered.
    if let Some((_, after_open)) = lowered.split_once('"') {
        if let Some((quoted, _)) = after_open.split_once('"') {
            let quoted = quoted.trim();
            if !quoted.is_empty() {
                return Some(quoted.to_string());
            }
        }
    }

    // Fallback: the first two content (non-stopword) words, in query order.
    let mut content_words = lowered
        .split_whitespace()
        .filter(|word| !STOPWORDS.contains(word));
    match (content_words.next(), content_words.next()) {
        (Some(first), Some(second)) => Some(format!("{} {}", first, second)),
        _ => None,
    }
}
43
44/// Process search results with semantic similarity and query analysis context
45pub async fn process_with_embeddings_and_analysis(
46    query_analysis: &query_analyzer::QueryAnalysis,
47    raw_results: &[RawSearchResult],
48    embedding_model: &EmbeddingModel,
49    official_sources: &OfficialSourceManager,
50    similarity_threshold: f32,
51) -> Result<Vec<ProcessedSearchResult>> {
52    log::info!(
53        "Processing {} results with semantic embeddings + query analysis (framework: {:?})",
54        raw_results.len(),
55        query_analysis.detected_frameworks.first().map(|f| &f.name)
56    );
57
58    // Use enhanced query for embedding generation (better context)
59    let embedding_query = &query_analysis.enhanced_query;
60    let query_embedding = embedding_model.embed_text(embedding_query).await?;
61    let key_phrase = extract_key_phrase(&query_analysis.original_query);
62
63    let mut processed_results = Vec::new();
64
65    for (index, result) in raw_results.iter().enumerate() {
66        // Create enhanced text for embedding with domain context
67        let mut combined_text = format!("{} {}", result.title, result.snippet);
68
69        // Add framework context to help embeddings understand domain
70        for framework in &query_analysis.detected_frameworks {
71            combined_text.push_str(&format!(" {}", framework.name));
72        }
73
74        // Add domain context keywords
75        for keyword in &query_analysis.domain_context.context_keywords {
76            combined_text.push_str(&format!(" {}", keyword));
77        }
78
79        // Generate result embedding with enhanced context
80        let result_embedding = match embedding_model.embed_text(&combined_text).await {
81            Ok(embedding) => embedding,
82            Err(e) => {
83                log::warn!("Failed to embed result {}: {}", index, e);
84                continue;
85            }
86        };
87
88        // Calculate semantic similarity
89        let mut similarity_score =
90            EmbeddingModel::cosine_similarity(&query_embedding, &result_embedding);
91
92        // Phrase priority: boost if key phrase is present, penalize if absent for example-type queries
93        if let Some(ref phrase) = key_phrase {
94            let haystack = combined_text.to_lowercase();
95            if haystack.contains(phrase) {
96                similarity_score *= 1.2; // modest boost for phrase match
97            } else if matches!(
98                query_analysis.query_type,
99                query_analyzer::QueryType::Example | query_analyzer::QueryType::HowTo
100            ) {
101                similarity_score *= 0.85; // mild penalty if example-ish query lacks phrase
102            }
103            similarity_score = similarity_score.min(1.0);
104        }
105
106        // Apply framework-specific similarity threshold adjustment
107        let adjusted_threshold = if !query_analysis.detected_frameworks.is_empty() {
108            // Lower threshold for framework-specific queries (more lenient)
109            similarity_threshold * 0.8
110        } else {
111            similarity_threshold
112        };
113
114        // Skip results below adjusted threshold
115        if similarity_score < adjusted_threshold {
116            log::debug!(
117                "Filtering out result with low similarity: {} (score: {:.3}, threshold: {:.3})",
118                result.title,
119                similarity_score,
120                adjusted_threshold
121            );
122            continue;
123        }
124
125        // Determine source tier and official status
126        let source_tier = official_sources.get_source_tier(&result.source_domain, &result.url);
127        let is_official = matches!(
128            source_tier,
129            SourceTier::OfficialDocs | SourceTier::OfficialRepos
130        );
131
132        // Apply framework-specific boost
133        let mut source_boost = official_sources.get_score_boost(&source_tier);
134
135        // Extra boost if result matches detected framework domains
136        for framework in &query_analysis.detected_frameworks {
137            if framework
138                .official_sites
139                .iter()
140                .any(|site| result.source_domain.contains(site))
141            {
142                source_boost *= 1.5; // Extra framework match boost
143                log::debug!("Applied framework domain boost for {}", framework.name);
144            }
145        }
146
147        // Apply query type specific adjustments
148        let type_boost = match query_analysis.query_type {
149            query_analyzer::QueryType::Reference => 1.2, // Boost official documentation
150            query_analyzer::QueryType::Example => 1.1,   // Slightly boost examples
151            query_analyzer::QueryType::Troubleshoot => 0.9, // Allow more diverse sources
152            _ => 1.0,
153        };
154
155        // Calculate final score with all boosts
156        let final_score = similarity_score * source_boost * type_boost;
157
158        processed_results.push(ProcessedSearchResult {
159            title: result.title.clone(),
160            url: result.url.clone(),
161            snippet: result.snippet.clone(),
162            source_domain: result.source_domain.clone(),
163            is_official,
164            source_tier: source_tier as u8,
165            similarity_score,
166            final_score,
167            timestamp: result.timestamp,
168        });
169
170        log::debug!(
171            "Enhanced result: {} | Similarity: {:.3} | Source boost: {:.1}x | Type boost: {:.1}x | Final: {:.3}",
172            result.source_domain,
173            similarity_score,
174            source_boost,
175            type_boost,
176            final_score
177        );
178    }
179
180    // Sort by final score (descending)
181    processed_results.sort_by(|a, b| b.final_score.partial_cmp(&a.final_score).unwrap());
182
183    log::info!(
184        "Enhanced processing: {} relevant results (filtered {} below threshold)",
185        processed_results.len(),
186        raw_results.len() - processed_results.len()
187    );
188
189    Ok(processed_results)
190}
191
192/// Filter out non-technical domains when processing technical queries (LLM-enhanced only)
193pub fn filter_non_technical_domains(
194    results: Vec<ProcessedSearchResult>,
195    query_analysis: &query_analyzer::QueryAnalysis,
196    has_llm: bool,
197) -> Vec<ProcessedSearchResult> {
198    if !has_llm || query_analysis.detected_frameworks.is_empty() {
199        // No filtering without LLM or if not a framework-specific query
200        return results;
201    }
202
203    // Non-technical domains to filter out for technical queries
204    let non_technical_domains = vec![
205        "amazon.com",
206        "ebay.com",
207        "etsy.com",
208        "walmart.com",
209        "target.com",
210        "houzz.com",
211        "wayfair.com",
212        "overstock.com",
213        "perigold.com",
214        "safavieh.com",
215        "furniture.com",
216        "shopping.com",
217        "bestbuy.com",
218        "lowes.com",
219        "homedepot.com",
220    ];
221
222    let original_count = results.len();
223    let filtered_results: Vec<ProcessedSearchResult> = results
224        .into_iter()
225        .filter(|result| {
226            let domain_lower = result.source_domain.to_lowercase();
227
228            // Check if it's a non-technical domain
229            let is_non_technical = non_technical_domains
230                .iter()
231                .any(|nt_domain| domain_lower.contains(nt_domain));
232
233            if is_non_technical {
234                log::debug!(
235                    "LLM filter: Removing non-technical result: {} from {}",
236                    result.title,
237                    result.source_domain
238                );
239                false
240            } else {
241                true
242            }
243        })
244        .collect();
245
246    let filtered_count = original_count - filtered_results.len();
247    if filtered_count > 0 {
248        log::info!(
249            "🧠 LLM-enhanced filtering: Removed {} non-technical results (e.g., shopping, furniture)",
250            filtered_count
251        );
252    }
253
254    filtered_results
255}
256
257/// Process search results without embeddings (fallback method)
258pub fn process_without_embeddings(
259    query: &str,
260    raw_results: &[RawSearchResult],
261    official_sources: &OfficialSourceManager,
262) -> Vec<ProcessedSearchResult> {
263    log::info!(
264        "Processing {} results with text matching (no embeddings)",
265        raw_results.len()
266    );
267
268    let query_lower = query.to_lowercase();
269    // Basic stopword filtering to avoid inflating scores with common words
270    let stopwords: HashSet<&str> = [
271        "a", "an", "and", "the", "in", "on", "of", "to", "for", "how", "do", "i", "with", "using",
272        "is", "are", "be", "this", "that", "it", "from", "by", "into", "as",
273    ]
274    .into_iter()
275    .collect();
276
277    let mut query_words: Vec<&str> = query_lower
278        .split_whitespace()
279        .filter(|w| !stopwords.contains(*w))
280        .collect();
281    query_words.dedup();
282    let key_phrase = extract_key_phrase(query);
283    let mut processed_results = Vec::new();
284
285    for result in raw_results {
286        // Simple text-based similarity scoring
287        let combined_text = format!("{} {}", result.title, result.snippet).to_lowercase();
288
289        // Count query word matches
290        let word_matches = query_words
291            .iter()
292            .filter(|word| combined_text.contains(*word))
293            .count();
294
295        // Calculate basic similarity score (percentage of query words found)
296        let mut similarity_score = if query_words.is_empty() {
297            0.3 // Lower default to reduce false positives
298        } else {
299            word_matches as f32 / query_words.len() as f32
300        };
301
302        // Phrase priority/penalty
303        if let Some(ref phrase) = key_phrase {
304            if combined_text.contains(phrase) {
305                similarity_score = (similarity_score + 0.2).min(1.0);
306            } else {
307                similarity_score = (similarity_score - 0.1).max(0.0);
308            }
309        }
310
311        // Apply minimum threshold
312        if similarity_score < 0.3 {
313            continue;
314        }
315
316        // Determine source tier and official status
317        let source_tier = official_sources.get_source_tier(&result.source_domain, &result.url);
318        let is_official = matches!(
319            source_tier,
320            SourceTier::OfficialDocs | SourceTier::OfficialRepos
321        );
322
323        // Calculate official source boost
324        let source_boost = official_sources.get_score_boost(&source_tier);
325
326        // Calculate final score
327        let final_score = similarity_score * source_boost;
328
329        processed_results.push(ProcessedSearchResult {
330            title: result.title.clone(),
331            url: result.url.clone(),
332            snippet: result.snippet.clone(),
333            source_domain: result.source_domain.clone(),
334            is_official,
335            source_tier: source_tier as u8,
336            similarity_score,
337            final_score,
338            timestamp: result.timestamp,
339        });
340    }
341
342    // Sort by final score (descending)
343    processed_results.sort_by(|a, b| b.final_score.partial_cmp(&a.final_score).unwrap());
344
345    log::info!(
346        "Processed {} results with text matching",
347        processed_results.len()
348    );
349    processed_results
350}
351
352/// Enhance results with additional metadata
353pub fn enhance_results(
354    processed_results: &mut [ProcessedSearchResult],
355    _official_sources: &OfficialSourceManager,
356) {
357    for result in processed_results.iter_mut() {
358        // Add content type detection
359        if result.url.contains("/docs/") || result.url.contains("/documentation/") {
360            // This is likely documentation
361        } else if result.url.contains("/api/") {
362            // This is likely API documentation
363        } else if result.url.contains("/tutorial") || result.url.contains("/guide") {
364            // This is likely a tutorial or guide
365        }
366
367        // Boost results that mention exact query terms in title
368        // This could be implemented for better ranking
369    }
370}
371
372/// Filter results to remove low-quality or duplicate content
373pub fn filter_quality_results(
374    processed_results: Vec<ProcessedSearchResult>,
375    min_snippet_length: usize,
376) -> Vec<ProcessedSearchResult> {
377    processed_results
378        .into_iter()
379        .filter(|result| {
380            // Filter out results with very short snippets
381            if result.snippet.len() < min_snippet_length {
382                log::debug!("Filtering short snippet: {}", result.title);
383                return false;
384            }
385
386            // Filter out obvious spam or low-quality indicators
387            let snippet_lower = result.snippet.to_lowercase();
388            if snippet_lower.contains("lorem ipsum")
389                || snippet_lower.contains("click here for more")
390                || snippet_lower.contains("subscribe now")
391            {
392                log::debug!("Filtering low-quality content: {}", result.title);
393                return false;
394            }
395
396            // Filter out results that are just lists of links
397            if result.snippet.matches("http").count() > 3 {
398                log::debug!("Filtering link-heavy content: {}", result.title);
399                return false;
400            }
401
402            true
403        })
404        .collect()
405}
406
407/// Deduplicate results based on content similarity
408pub fn deduplicate_results(
409    mut processed_results: Vec<ProcessedSearchResult>,
410) -> Vec<ProcessedSearchResult> {
411    // Simple deduplication based on URL domain + title similarity
412    processed_results.sort_by(|a, b| {
413        let domain_cmp = a.source_domain.cmp(&b.source_domain);
414        if domain_cmp == std::cmp::Ordering::Equal {
415            a.title.cmp(&b.title)
416        } else {
417            domain_cmp
418        }
419    });
420
421    let mut unique_results = Vec::new();
422    let mut last_domain = String::new();
423    let mut last_title_words = Vec::new();
424
425    let result_count = processed_results.len();
426    for result in &processed_results {
427        let current_title_words: Vec<&str> = result.title.split_whitespace().take(5).collect();
428
429        // Check if this is a duplicate based on domain + title similarity
430        let is_duplicate = result.source_domain == last_domain
431            && title_similarity(&current_title_words, &last_title_words) > 0.8;
432
433        if !is_duplicate {
434            unique_results.push(result.clone());
435        } else {
436            log::debug!(
437                "Removing duplicate: {} from {}",
438                result.title,
439                result.source_domain
440            );
441        }
442
443        last_domain = result.source_domain.clone();
444        last_title_words = current_title_words
445            .into_iter()
446            .map(|s| s.to_string())
447            .collect();
448    }
449
450    // Re-sort by final score
451    unique_results.sort_by(|a, b| b.final_score.partial_cmp(&a.final_score).unwrap());
452
453    log::info!(
454        "Deduplicated results: {} -> {}",
455        result_count,
456        unique_results.len()
457    );
458
459    unique_results
460}
461
/// Calculate title similarity for deduplication.
///
/// Returns the number of entries in `words1` that have a case-insensitive
/// match in `words2`, divided by the length of the longer list. Returns `0.0`
/// when either list is empty.
fn title_similarity(words1: &[&str], words2: &[String]) -> f32 {
    if words1.is_empty() || words2.is_empty() {
        return 0.0;
    }

    // Lowercase the reference words once instead of on every comparison.
    let reference: Vec<String> = words2.iter().map(|w| w.to_lowercase()).collect();

    let mut shared = 0usize;
    for word in words1 {
        let lowered = word.to_lowercase();
        if reference.iter().any(|candidate| *candidate == lowered) {
            shared += 1;
        }
    }

    let longer = words1.len().max(words2.len());
    shared as f32 / longer as f32
}
479
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Build a raw search result stamped with the current time.
    fn raw(title: &str, url: &str, snippet: &str, domain: &str) -> RawSearchResult {
        RawSearchResult {
            title: title.to_string(),
            url: url.to_string(),
            snippet: snippet.to_string(),
            source_domain: domain.to_string(),
            timestamp: Some(Utc::now()),
        }
    }

    /// An official documentation hit must outrank an arbitrary blog for the same query.
    #[test]
    fn test_process_without_embeddings() {
        let official_sources = OfficialSourceManager::new();

        let raw_results = vec![
            raw(
                "Python Documentation",
                "https://docs.python.org/3/",
                "Python programming language documentation",
                "docs.python.org",
            ),
            raw(
                "Random Blog",
                "https://random-blog.com/python",
                "Some random python content",
                "random-blog.com",
            ),
        ];

        let results = process_without_embeddings("python", &raw_results, &official_sources);

        assert_eq!(results.len(), 2);
        assert!(results[0].is_official); // Official source should rank higher
        assert!(results[0].final_score > results[1].final_score);
    }

    /// Snippets shorter than the minimum length are removed.
    #[test]
    fn test_filter_quality_results() {
        let good = ProcessedSearchResult {
            title: String::from("Good Result"),
            url: String::from("https://example.com"),
            snippet: String::from("This is a good quality result with sufficient content"),
            source_domain: String::from("example.com"),
            is_official: false,
            source_tier: 4,
            similarity_score: 0.8,
            final_score: 0.8,
            timestamp: Some(Utc::now()),
        };
        let too_short = ProcessedSearchResult {
            title: String::from("Short Result"),
            url: String::from("https://short.com"),
            snippet: String::from("Too short"),
            source_domain: String::from("short.com"),
            is_official: false,
            source_tier: 4,
            similarity_score: 0.5,
            final_score: 0.5,
            timestamp: Some(Utc::now()),
        };

        let filtered = filter_quality_results(vec![good, too_short], 20);
        assert_eq!(filtered.len(), 1);
        assert_eq!(filtered[0].title, "Good Result");
    }

    /// Partially overlapping titles yield a similarity in (0, 1].
    #[test]
    fn test_title_similarity() {
        let words1 = vec!["Python", "Documentation", "Guide"];
        let words2: Vec<String> = ["Python", "Docs", "Tutorial"]
            .iter()
            .map(|w| w.to_string())
            .collect();

        let similarity = title_similarity(&words1, &words2);
        assert!(similarity > 0.0 && similarity <= 1.0);
    }
}