// SPDX-License-Identifier: Apache-2.0

//! Highlight and match offset utilities for search results.

use crate::analyzer::pipeline::analyze;
use crate::backend::FtsBackend;
use crate::index::FtsIndex;
use crate::posting::MatchOffset;
10impl<B: FtsBackend> FtsIndex<B> {
11    /// Generate highlighted text with matched query terms wrapped in tags.
12    ///
13    /// Returns the original text with each occurrence of a matched query term
14    /// surrounded by `prefix` and `suffix` (e.g., `<b>` and `</b>`).
15    pub fn highlight(&self, text: &str, query: &str, prefix: &str, suffix: &str) -> String {
16        let matches = self.find_query_matches(text, query);
17        if matches.is_empty() {
18            return text.to_string();
19        }
20
21        let mut result =
22            String::with_capacity(text.len() + matches.len() * (prefix.len() + suffix.len()) * 2);
23        let mut last_end = 0;
24
25        for m in &matches {
26            result.push_str(&text[last_end..m.start]);
27            result.push_str(prefix);
28            result.push_str(&text[m.start..m.end]);
29            result.push_str(suffix);
30            last_end = m.end;
31        }
32        result.push_str(&text[last_end..]);
33        result
34    }
35
36    /// Return byte offsets of matched query terms in the original text.
37    pub fn offsets(&self, text: &str, query: &str) -> Vec<MatchOffset> {
38        self.find_query_matches(text, query)
39    }
40
41    /// Find all query term matches in `text`, returning byte offsets and stemmed terms.
42    fn find_query_matches(&self, text: &str, query: &str) -> Vec<MatchOffset> {
43        let query_tokens = analyze(query);
44        if query_tokens.is_empty() {
45            return Vec::new();
46        }
47
48        let query_set: std::collections::HashSet<&str> =
49            query_tokens.iter().map(String::as_str).collect();
50        let stemmer = rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English);
51
52        let mut matches = Vec::new();
53        for (start, word) in WordBoundaryIter::new(text) {
54            let lower = word.to_lowercase();
55            let stemmed = stemmer.stem(&lower);
56            if query_set.contains(stemmed.as_ref()) {
57                matches.push(MatchOffset {
58                    start,
59                    end: start + word.len(),
60                    term: stemmed.into_owned(),
61                });
62            }
63        }
64        matches
65    }
66}
67
/// Iterator over word boundaries in text, yielding `(byte_offset, &str)` pairs.
///
/// A raw token is a maximal run of alphanumeric characters, hyphens, and
/// underscores. Leading and trailing `-`/`_` are stripped from each token
/// before it is yielded, and tokens consisting only of `-`/`_` are skipped.
struct WordBoundaryIter<'a> {
    // Text being scanned.
    text: &'a str,
    // Byte position of the scan cursor (always on a char boundary).
    pos: usize,
}

impl<'a> WordBoundaryIter<'a> {
    /// Create an iterator positioned at the start of `text`.
    fn new(text: &'a str) -> Self {
        Self { text, pos: 0 }
    }
}

impl<'a> Iterator for WordBoundaryIter<'a> {
    type Item = (usize, &'a str);

    fn next(&mut self) -> Option<Self::Item> {
        // Iterate instead of recursing: the previous implementation called
        // `self.next()` whenever a token trimmed down to nothing, so input
        // like "-- -- -- ..." grew the call stack by one frame per junk
        // token and could overflow on pathological text.
        loop {
            // Skip separator characters.
            while let Some(c) = self.text[self.pos..].chars().next() {
                if c.is_alphanumeric() || c == '-' || c == '_' {
                    break;
                }
                self.pos += c.len_utf8();
            }
            if self.pos >= self.text.len() {
                return None;
            }

            // Consume the raw token (word chars, hyphens, underscores).
            let start = self.pos;
            while let Some(c) = self.text[self.pos..].chars().next() {
                if c.is_alphanumeric() || c == '-' || c == '_' {
                    self.pos += c.len_utf8();
                } else {
                    break;
                }
            }
            let raw = &self.text[start..self.pos];

            // Strip surrounding '-'/'_'; a token that vanishes entirely is
            // junk — move on to the next token.
            let word = raw.trim_matches(['-', '_']);
            if word.is_empty() {
                continue;
            }
            // '-' and '_' are 1 byte each, so the trimmed word's byte offset
            // is `start` plus the number of leading characters removed.
            let offset = start + (raw.len() - raw.trim_start_matches(['-', '_']).len());
            return Some((offset, &self.text[offset..offset + word.len()]));
        }
    }
}
120
#[cfg(test)]
mod tests {
    use crate::backend::memory::MemoryBackend;
    use crate::index::FtsIndex;

    /// Convenience constructor for an empty in-memory index.
    fn index() -> FtsIndex<MemoryBackend> {
        FtsIndex::new(MemoryBackend::new())
    }

    #[test]
    fn highlight_basic() {
        let highlighted = index().highlight(
            "The quick brown fox jumps over the lazy dog",
            "brown fox",
            "<b>",
            "</b>",
        );
        assert!(highlighted.contains("<b>brown</b>"));
        assert!(highlighted.contains("<b>fox</b>"));
    }

    #[test]
    fn highlight_no_match() {
        let input = "hello world";
        assert_eq!(index().highlight(input, "xyz", "<b>", "</b>"), input);
    }

    #[test]
    fn offsets_basic() {
        let found = index().offsets("The quick brown fox", "brown");
        assert_eq!(found.len(), 1);
        assert_eq!((found[0].start, found[0].end), (10, 15));
    }

    #[test]
    fn word_boundary_iter() {
        let collected: Vec<_> = super::WordBoundaryIter::new("hello, world! foo-bar").collect();
        for expected in ["hello", "world", "foo-bar"] {
            assert!(collected.iter().any(|(_, w)| *w == expected));
        }
    }
}