Skip to main content

nodedb_fts/
highlight.rs

1//! Highlight and match offset utilities for search results.
2
3use crate::analyzer::pipeline::analyze;
4use crate::backend::FtsBackend;
5use crate::index::FtsIndex;
6use crate::posting::MatchOffset;
7
8impl<B: FtsBackend> FtsIndex<B> {
9    /// Generate highlighted text with matched query terms wrapped in tags.
10    ///
11    /// Returns the original text with each occurrence of a matched query term
12    /// surrounded by `prefix` and `suffix` (e.g., `<b>` and `</b>`).
13    pub fn highlight(&self, text: &str, query: &str, prefix: &str, suffix: &str) -> String {
14        let matches = self.find_query_matches(text, query);
15        if matches.is_empty() {
16            return text.to_string();
17        }
18
19        let mut result =
20            String::with_capacity(text.len() + matches.len() * (prefix.len() + suffix.len()) * 2);
21        let mut last_end = 0;
22
23        for m in &matches {
24            result.push_str(&text[last_end..m.start]);
25            result.push_str(prefix);
26            result.push_str(&text[m.start..m.end]);
27            result.push_str(suffix);
28            last_end = m.end;
29        }
30        result.push_str(&text[last_end..]);
31        result
32    }
33
34    /// Return byte offsets of matched query terms in the original text.
35    pub fn offsets(&self, text: &str, query: &str) -> Vec<MatchOffset> {
36        self.find_query_matches(text, query)
37    }
38
39    /// Find all query term matches in `text`, returning byte offsets and stemmed terms.
40    fn find_query_matches(&self, text: &str, query: &str) -> Vec<MatchOffset> {
41        let query_tokens = analyze(query);
42        if query_tokens.is_empty() {
43            return Vec::new();
44        }
45
46        let query_set: std::collections::HashSet<&str> =
47            query_tokens.iter().map(String::as_str).collect();
48        let stemmer = rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English);
49
50        let mut matches = Vec::new();
51        for (start, word) in WordBoundaryIter::new(text) {
52            let lower = word.to_lowercase();
53            let stemmed = stemmer.stem(&lower);
54            if query_set.contains(stemmed.as_ref()) {
55                matches.push(MatchOffset {
56                    start,
57                    end: start + word.len(),
58                    term: stemmed.into_owned(),
59                });
60            }
61        }
62        matches
63    }
64}
65
/// Iterator over word boundaries in text, yielding `(byte_offset, &str)` pairs.
///
/// Words are sequences of alphanumeric chars, hyphens, and underscores.
/// Leading and trailing hyphens/underscores are trimmed from each yielded
/// word; words consisting solely of hyphens/underscores are skipped.
struct WordBoundaryIter<'a> {
    // Text being scanned.
    text: &'a str,
    // Current byte position; always kept on a char boundary because it only
    // ever advances by `len_utf8()` of the char just examined.
    pos: usize,
}

impl<'a> WordBoundaryIter<'a> {
    /// Create an iterator positioned at the start of `text`.
    fn new(text: &'a str) -> Self {
        Self { text, pos: 0 }
    }
}

impl<'a> Iterator for WordBoundaryIter<'a> {
    type Item = (usize, &'a str);

    fn next(&mut self) -> Option<Self::Item> {
        // Returns `true` for chars that belong to a word.
        fn is_word_char(c: char) -> bool {
            c.is_alphanumeric() || c == '-' || c == '_'
        }

        // Loop instead of recursing: the previous implementation called
        // `self.next()` for every punctuation-only word, so input like
        // "-_ -_ -_ …" grew the stack without bound.
        loop {
            // Skip non-word characters.
            while let Some(c) = self.text[self.pos..].chars().next() {
                if is_word_char(c) {
                    break;
                }
                self.pos += c.len_utf8();
            }
            if self.pos >= self.text.len() {
                return None;
            }

            let start = self.pos;
            // Consume word characters.
            while let Some(c) = self.text[self.pos..].chars().next() {
                if !is_word_char(c) {
                    break;
                }
                self.pos += c.len_utf8();
            }

            // Trim leading/trailing '-'/'_'. Both are 1 byte in UTF-8, so
            // the length differences below are valid byte offsets.
            let word = &self.text[start..self.pos];
            let trimmed_start = start + (word.len() - word.trim_start_matches(['-', '_']).len());
            let trimmed_end = trimmed_start + word.trim_matches(['-', '_']).len();
            if trimmed_start < trimmed_end {
                return Some((trimmed_start, &self.text[trimmed_start..trimmed_end]));
            }
            // Word was entirely '-'/'_'; keep scanning from `self.pos`.
        }
    }
}
118
#[cfg(test)]
mod tests {
    use crate::backend::memory::MemoryBackend;
    use crate::index::FtsIndex;

    /// Build an empty in-memory index for highlight/offset tests.
    fn make_index() -> FtsIndex<MemoryBackend> {
        FtsIndex::new(MemoryBackend::new())
    }

    #[test]
    fn highlight_basic() {
        let highlighted = make_index().highlight(
            "The quick brown fox jumps over the lazy dog",
            "brown fox",
            "<b>",
            "</b>",
        );
        assert!(highlighted.contains("<b>brown</b>"));
        assert!(highlighted.contains("<b>fox</b>"));
    }

    #[test]
    fn highlight_no_match() {
        // With no matching terms the input must come back unchanged.
        let input = "hello world";
        assert_eq!(make_index().highlight(input, "xyz", "<b>", "</b>"), input);
    }

    #[test]
    fn offsets_basic() {
        let hits = make_index().offsets("The quick brown fox", "brown");
        assert_eq!(hits.len(), 1);
        assert_eq!((hits[0].start, hits[0].end), (10, 15));
    }

    #[test]
    fn word_boundary_iter() {
        let words: Vec<_> = super::WordBoundaryIter::new("hello, world! foo-bar").collect();
        assert!(words.iter().any(|(_, w)| *w == "hello"));
        assert!(words.iter().any(|(_, w)| *w == "world"));
        assert!(words.iter().any(|(_, w)| *w == "foo-bar"));
    }
}