// agentroot_core/search/snippet.rs

//! Snippet extraction for search results
2
/// A text excerpt cut out of a larger document, together with the byte range
/// of the source content it was taken from.
#[derive(Debug, Clone)]
pub struct Snippet {
    /// The excerpt text. When produced by `extract_snippet` it may carry a
    /// leading and/or trailing `...` and have whitespace trimmed at the cut
    /// edges, so its length need not equal `end_pos - start_pos`.
    pub snippet: String,
    /// Byte offset in the source content where the excerpt begins (inclusive).
    pub start_pos: usize,
    /// Byte offset in the source content where the excerpt ends (exclusive).
    pub end_pos: usize,
}
10
11/// Extract a relevant snippet from content
12pub fn extract_snippet(
13    content: &str,
14    query: &str,
15    max_length: Option<usize>,
16    chunk_pos: Option<usize>,
17) -> Snippet {
18    let max_len = max_length.unwrap_or(500);
19
20    // If content is short enough, return it all
21    if content.len() <= max_len {
22        return Snippet {
23            snippet: content.to_string(),
24            start_pos: 0,
25            end_pos: content.len(),
26        };
27    }
28
29    // Start from chunk position if available
30    let center = chunk_pos.unwrap_or_else(|| find_query_position(content, query));
31
32    // Calculate window
33    let half_len = max_len / 2;
34    let start = center.saturating_sub(half_len);
35    let end = (start + max_len).min(content.len());
36    let start = if end == content.len() {
37        end.saturating_sub(max_len)
38    } else {
39        start
40    };
41
42    // Adjust to word boundaries
43    let (start, end) = adjust_to_word_boundaries(content, start, end);
44
45    let mut snippet = content[start..end].to_string();
46
47    // Add ellipsis
48    if start > 0 {
49        snippet = format!("...{}", snippet.trim_start());
50    }
51    if end < content.len() {
52        snippet = format!("{}...", snippet.trim_end());
53    }
54
55    Snippet {
56        snippet,
57        start_pos: start,
58        end_pos: end,
59    }
60}
61
/// Find the position of query terms in content
///
/// The search is case-insensitive. The whole query is tried first; if it does
/// not occur, each whitespace-separated term of at least 3 bytes is tried in
/// query order and the first one that occurs wins.
///
/// Returns a byte offset into `content` (always on a character boundary), or
/// `0` when nothing matches.
fn find_query_position(content: &str, query: &str) -> usize {
    let content_lower = content.to_lowercase();
    let query_lower = query.to_lowercase();

    // Exact phrase first, then individual terms.
    let lowered_hit = content_lower.find(&query_lower).or_else(|| {
        query_lower
            .split_whitespace()
            .filter(|term| term.len() >= 3)
            .find_map(|term| content_lower.find(term))
    });

    match lowered_hit {
        // The hit is an offset into the lowercased copy; translate it back,
        // because lowercasing can change the byte length of a character.
        Some(pos) => lowered_to_original_offset(content, pos),
        // Default to start
        None => 0,
    }
}

/// Map a byte offset in `content.to_lowercase()` back to the byte offset of
/// the corresponding character in `content`.
///
/// Walks the original characters while summing the UTF-8 length of each
/// character's lowercase form; the character whose lowercase form covers
/// `lowered_pos` is the answer. Offsets past the end map to `content.len()`.
fn lowered_to_original_offset(content: &str, lowered_pos: usize) -> usize {
    let mut lowered_len = 0usize;
    for (idx, ch) in content.char_indices() {
        lowered_len += ch.to_lowercase().map(char::len_utf8).sum::<usize>();
        if lowered_len > lowered_pos {
            return idx;
        }
    }
    content.len()
}
87
/// Widen a byte range so that neither edge falls inside a word.
///
/// `start` is moved back to just after the nearest ASCII whitespace byte that
/// precedes it (or to 0 if there is none); `end` is moved forward to the next
/// ASCII whitespace byte at or after it (or to the end of the content). An
/// offset that lies beyond the content is returned unchanged.
///
/// Only ASCII whitespace counts as a separator, and the widening has no upper
/// bound: content without any ASCII whitespace expands to the full range.
fn adjust_to_word_boundaries(content: &str, start: usize, end: usize) -> (usize, usize) {
    let raw = content.as_bytes();

    // Left edge: one past the last whitespace byte before `start`.
    let left = match raw.get(..start) {
        Some(prefix) => prefix
            .iter()
            .rposition(|byte| byte.is_ascii_whitespace())
            .map_or(0, |ws_idx| ws_idx + 1),
        None => start,
    };

    // Right edge: the first whitespace byte at or after `end`.
    let right = match raw.get(end..) {
        Some(suffix) => suffix
            .iter()
            .position(|byte| byte.is_ascii_whitespace())
            .map_or(raw.len(), |ws_off| end + ws_off),
        None => end,
    };

    (left, right)
}
116
#[cfg(test)]
mod tests {
    use super::*;

    /// Content that fits within the default limit comes back unchanged.
    #[test]
    fn test_short_content() {
        let result = extract_snippet("Hello world", "hello", None, None);
        assert_eq!(result.snippet, "Hello world");
    }

    /// Content over the limit is cut down to roughly the requested size.
    #[test]
    fn test_long_content() {
        let haystack = "a ".repeat(500);
        let result = extract_snippet(haystack.as_str(), "test", Some(100), None);
        // Allow for ellipsis
        assert!(result.snippet.len() <= 110);
    }
}
131        assert!(snippet.snippet.len() <= 110); // Allow for ellipsis
132    }
133}