#[derive(Debug, Clone)]
pub struct Snippet {
pub snippet: String,
pub start_pos: usize,
pub end_pos: usize,
}
pub fn extract_snippet(
content: &str,
query: &str,
max_length: Option<usize>,
chunk_pos: Option<usize>,
) -> Snippet {
let max_len = max_length.unwrap_or(500);
if content.len() <= max_len {
return Snippet {
snippet: content.to_string(),
start_pos: 0,
end_pos: content.len(),
};
}
let center = chunk_pos.unwrap_or_else(|| find_query_position(content, query));
let half_len = max_len / 2;
let start = center.saturating_sub(half_len);
let end = (start + max_len).min(content.len());
let start = if end == content.len() {
end.saturating_sub(max_len)
} else {
start
};
let (start, end) = adjust_to_word_boundaries(content, start, end);
let mut snippet = content[start..end].to_string();
if start > 0 {
snippet = format!("...{}", snippet.trim_start());
}
if end < content.len() {
snippet = format!("{}...", snippet.trim_end());
}
Snippet {
snippet,
start_pos: start,
end_pos: end,
}
}
fn find_query_position(content: &str, query: &str) -> usize {
let content_lower = content.to_lowercase();
let query_lower = query.to_lowercase();
if let Some(pos) = content_lower.find(&query_lower) {
return pos;
}
let terms: Vec<&str> = query_lower
.split_whitespace()
.filter(|t| t.len() >= 3)
.collect();
for term in terms {
if let Some(pos) = content_lower.find(term) {
return pos;
}
}
0
}
fn adjust_to_word_boundaries(content: &str, start: usize, end: usize) -> (usize, usize) {
let bytes = content.as_bytes();
let mut new_start = start;
while new_start > 0
&& bytes
.get(new_start - 1)
.map(|&b| !b.is_ascii_whitespace())
.unwrap_or(false)
{
new_start -= 1;
}
let mut new_end = end;
while new_end < bytes.len()
&& bytes
.get(new_end)
.map(|&b| !b.is_ascii_whitespace())
.unwrap_or(false)
{
new_end += 1;
}
(new_start, new_end)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_short_content() {
let snippet = extract_snippet("Hello world", "hello", None, None);
assert_eq!(snippet.snippet, "Hello world");
}
#[test]
fn test_long_content() {
let content = "a ".repeat(500);
let snippet = extract_snippet(&content, "test", Some(100), None);
assert!(snippet.snippet.len() <= 110); }
}