#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Snippet {
pub text: String,
pub highlight_ranges: Vec<std::ops::Range<usize>>,
pub truncated: bool,
}
pub const MAX_SNIPPET_CHARS: usize = 240;
pub const SNIPPET_LEAD_CHARS: usize = 40;
pub fn extract(claim: &str, query_terms: &[&str]) -> Snippet {
let claim_lower = claim.to_lowercase();
let first_match_byte = query_terms
.iter()
.filter(|term| !term.is_empty())
.filter_map(|term| {
let term_lower = term.to_lowercase();
claim_lower.find(term_lower.as_str())
})
.min();
let window_start_byte = match first_match_byte {
None => 0,
Some(match_byte) => {
let match_char_idx = claim[..match_byte].chars().count();
let lead_start_char = match_char_idx.saturating_sub(SNIPPET_LEAD_CHARS);
claim
.char_indices()
.nth(lead_start_char)
.map(|(b, _)| b)
.unwrap_or(0)
}
};
let window_slice = &claim[window_start_byte..];
let window_char_count = window_slice.chars().count();
let truncated_from_end = window_char_count > MAX_SNIPPET_CHARS;
let window_text: String = if truncated_from_end {
window_slice.chars().take(MAX_SNIPPET_CHARS).collect()
} else {
window_slice.to_string()
};
let truncated = window_start_byte > 0 || truncated_from_end;
let window_lower = window_text.to_lowercase();
let mut highlight_ranges: Vec<std::ops::Range<usize>> = Vec::new();
for term in query_terms {
if term.is_empty() {
continue;
}
let term_lower = term.to_lowercase();
let term_bytes = term_lower.len();
if term_bytes == 0 {
continue;
}
let mut search_start = 0usize;
while search_start <= window_lower.len().saturating_sub(term_bytes) {
match window_lower[search_start..].find(term_lower.as_str()) {
None => break,
Some(rel_offset) => {
let abs_start = search_start + rel_offset;
let abs_end = abs_start + term_bytes;
highlight_ranges.push(abs_start..abs_end);
search_start = abs_start + term_bytes.max(1);
}
}
}
}
highlight_ranges.sort_by_key(|r| r.start);
highlight_ranges.dedup_by(|later, earlier| {
if later.start < earlier.end {
earlier.end = earlier.end.max(later.end);
true
} else {
false
}
});
Snippet {
text: window_text,
highlight_ranges,
truncated,
}
}
pub fn tokenize_query(query: &str) -> Vec<String> {
let mut seen: Vec<String> = Vec::new();
for token in query
.split(|c: char| !c.is_alphanumeric())
.map(str::trim)
.filter(|t| t.len() >= 2)
.map(|t| t.to_lowercase())
{
if !seen.contains(&token) {
seen.push(token);
}
}
seen
}
pub fn snippet_plain_text(snippet: &Snippet) -> String {
if snippet.truncated {
format!("[…]{}", snippet.text)
} else {
snippet.text.clone()
}
}
const ANSI_BOLD_ON: &str = "\x1b[1m";
const ANSI_UNDERLINE_ON: &str = "\x1b[4m";
const ANSI_RESET: &str = "\x1b[0m";
pub fn snippet_ansi_highlighted(snippet: &Snippet) -> String {
if snippet.highlight_ranges.is_empty() {
return snippet.text.clone();
}
let text = &snippet.text;
let bytes = text.as_bytes();
let mut result = String::with_capacity(text.len() + snippet.highlight_ranges.len() * 20);
let mut cursor = 0usize;
for range in &snippet.highlight_ranges {
if range.start > cursor {
result.push_str(&text[cursor..range.start]);
}
result.push_str(ANSI_BOLD_ON);
result.push_str(ANSI_UNDERLINE_ON);
let end = range.end.min(bytes.len());
result.push_str(&text[range.start..end]);
result.push_str(ANSI_RESET);
cursor = end;
}
if cursor < text.len() {
result.push_str(&text[cursor..]);
}
result
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn extract_finds_first_match_and_centers_window() {
let padding = "word ".repeat(20); let claim = format!("{padding}the quick brown fox jumps over the lazy dog");
let snippet = extract(&claim, &["fox"]);
assert!(
snippet.text.contains("fox"),
"window must contain the matched term"
);
assert!(
!snippet.highlight_ranges.is_empty(),
"should have at least one highlight"
);
}
#[test]
fn extract_clamps_window_to_string_start_when_match_is_early() {
let claim = "fox is the very first word in this claim";
let snippet = extract(claim, &["fox"]);
assert!(snippet.text.starts_with("fox"));
assert!(!snippet.highlight_ranges.is_empty());
}
#[test]
fn extract_returns_truncated_true_when_claim_is_longer_than_max() {
let suffix = "x".repeat(300);
let claim = format!("needle {suffix}");
let snippet = extract(&claim, &["needle"]);
assert!(snippet.truncated);
assert!(snippet.text.len() <= claim.len());
assert!(snippet.text.chars().count() <= MAX_SNIPPET_CHARS);
}
#[test]
fn extract_marks_all_occurrences_of_all_terms_in_window() {
let claim = "rust and memory and rust again and memory everywhere";
let snippet = extract(claim, &["rust", "memory"]);
assert_eq!(snippet.highlight_ranges.len(), 4);
for range in &snippet.highlight_ranges {
let frag = &snippet.text[range.clone()];
assert!(
frag == "rust" || frag == "memory",
"unexpected fragment: {frag:?}"
);
}
}
#[test]
fn extract_case_insensitive_match() {
let claim = "The QUICK Brown Fox Jumps OVER the lazy dog";
let snippet = extract(claim, &["quick", "FOX"]);
assert!(!snippet.highlight_ranges.is_empty());
for range in &snippet.highlight_ranges {
let frag = snippet.text[range.clone()].to_lowercase();
assert!(frag == "quick" || frag == "fox");
}
}
#[test]
fn extract_handles_multi_byte_utf8_without_splitting() {
let claim = "🦀 Rust is memory-safe 🦀 and fast 🦀 by design";
let snippet = extract(claim, &["rust"]);
assert!(std::str::from_utf8(snippet.text.as_bytes()).is_ok());
assert!(!snippet.highlight_ranges.is_empty());
}
#[test]
fn extract_no_match_returns_first_window_with_empty_highlights() {
let claim = "no matching term anywhere in this sentence at all";
let snippet = extract(claim, &["zzz_absent"]);
assert!(snippet.highlight_ranges.is_empty());
assert_eq!(snippet.text, claim);
assert!(!snippet.truncated);
}
#[test]
fn tokenize_query_lowercases_and_deduplicates() {
let terms = tokenize_query("Rust RUST rust Memory memory");
assert_eq!(terms, vec!["rust", "memory"]);
}
#[test]
fn tokenize_query_filters_short_terms() {
let terms = tokenize_query("a to the memory");
assert!(terms.contains(&"memory".to_string()));
assert!(terms.contains(&"to".to_string()));
assert!(!terms.contains(&"a".to_string()));
}
#[test]
fn snippet_plain_text_prepends_ellipsis_when_truncated() {
let snippet = Snippet {
text: "some window text".to_string(),
highlight_ranges: vec![],
truncated: true,
};
let plain = snippet_plain_text(&snippet);
assert!(plain.starts_with("[…]"));
assert!(plain.ends_with("some window text"));
}
#[test]
fn snippet_plain_text_no_prefix_when_not_truncated() {
let snippet = Snippet {
text: "full text".to_string(),
highlight_ranges: vec![],
truncated: false,
};
let plain = snippet_plain_text(&snippet);
assert_eq!(plain, "full text");
}
#[test]
fn snippet_ansi_highlighted_wraps_ranges() {
let claim = "the quick brown fox jumps over the lazy dog";
let snippet = extract(claim, &["fox"]);
let ansi = snippet_ansi_highlighted(&snippet);
assert!(
ansi.contains(ANSI_BOLD_ON),
"ANSI bold escape must be present"
);
assert!(
ansi.contains(ANSI_UNDERLINE_ON),
"ANSI underline escape must be present"
);
assert!(
ansi.contains(ANSI_RESET),
"ANSI reset escape must be present"
);
assert!(ansi.contains("fox"));
let bold_pos = ansi.find(ANSI_BOLD_ON).unwrap();
let fox_pos = ansi.find("fox").unwrap();
assert!(
bold_pos < fox_pos,
"bold escape should precede the match text"
);
}
}