1use crate::analyzer::pipeline::analyze;
4use crate::backend::FtsBackend;
5use crate::index::FtsIndex;
6use crate::posting::MatchOffset;
7
8impl<B: FtsBackend> FtsIndex<B> {
9 pub fn highlight(&self, text: &str, query: &str, prefix: &str, suffix: &str) -> String {
14 let matches = self.find_query_matches(text, query);
15 if matches.is_empty() {
16 return text.to_string();
17 }
18
19 let mut result =
20 String::with_capacity(text.len() + matches.len() * (prefix.len() + suffix.len()) * 2);
21 let mut last_end = 0;
22
23 for m in &matches {
24 result.push_str(&text[last_end..m.start]);
25 result.push_str(prefix);
26 result.push_str(&text[m.start..m.end]);
27 result.push_str(suffix);
28 last_end = m.end;
29 }
30 result.push_str(&text[last_end..]);
31 result
32 }
33
34 pub fn offsets(&self, text: &str, query: &str) -> Vec<MatchOffset> {
36 self.find_query_matches(text, query)
37 }
38
39 fn find_query_matches(&self, text: &str, query: &str) -> Vec<MatchOffset> {
41 let query_tokens = analyze(query);
42 if query_tokens.is_empty() {
43 return Vec::new();
44 }
45
46 let query_set: std::collections::HashSet<&str> =
47 query_tokens.iter().map(String::as_str).collect();
48 let stemmer = rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English);
49
50 let mut matches = Vec::new();
51 for (start, word) in WordBoundaryIter::new(text) {
52 let lower = word.to_lowercase();
53 let stemmed = stemmer.stem(&lower);
54 if query_set.contains(stemmed.as_ref()) {
55 matches.push(MatchOffset {
56 start,
57 end: start + word.len(),
58 term: stemmed.into_owned(),
59 });
60 }
61 }
62 matches
63 }
64}
65
66struct WordBoundaryIter<'a> {
70 text: &'a str,
71 pos: usize,
72}
73
74impl<'a> WordBoundaryIter<'a> {
75 fn new(text: &'a str) -> Self {
76 Self { text, pos: 0 }
77 }
78}
79
80impl<'a> Iterator for WordBoundaryIter<'a> {
81 type Item = (usize, &'a str);
82
83 fn next(&mut self) -> Option<Self::Item> {
84 let bytes = self.text.as_bytes();
85 while self.pos < bytes.len() {
87 let c = self.text[self.pos..].chars().next()?;
88 if c.is_alphanumeric() || c == '-' || c == '_' {
89 break;
90 }
91 self.pos += c.len_utf8();
92 }
93 if self.pos >= bytes.len() {
94 return None;
95 }
96
97 let start = self.pos;
98 while self.pos < bytes.len() {
100 let Some(c) = self.text[self.pos..].chars().next() else {
101 break;
102 };
103 if c.is_alphanumeric() || c == '-' || c == '_' {
104 self.pos += c.len_utf8();
105 } else {
106 break;
107 }
108 }
109 let word = &self.text[start..self.pos];
110 let trimmed_start = start + word.len() - word.trim_start_matches(['-', '_']).len();
111 let trimmed_end = trimmed_start + word.trim_matches(['-', '_']).len();
112 if trimmed_start >= trimmed_end {
113 return self.next();
114 }
115 Some((trimmed_start, &self.text[trimmed_start..trimmed_end]))
116 }
117}
118
119#[cfg(test)]
120mod tests {
121 use crate::backend::memory::MemoryBackend;
122 use crate::index::FtsIndex;
123
124 #[test]
125 fn highlight_basic() {
126 let idx: FtsIndex<MemoryBackend> = FtsIndex::new(MemoryBackend::new());
127 let result = idx.highlight(
128 "The quick brown fox jumps over the lazy dog",
129 "brown fox",
130 "<b>",
131 "</b>",
132 );
133 assert!(result.contains("<b>brown</b>"));
134 assert!(result.contains("<b>fox</b>"));
135 }
136
137 #[test]
138 fn highlight_no_match() {
139 let idx: FtsIndex<MemoryBackend> = FtsIndex::new(MemoryBackend::new());
140 let text = "hello world";
141 let result = idx.highlight(text, "xyz", "<b>", "</b>");
142 assert_eq!(result, text);
143 }
144
145 #[test]
146 fn offsets_basic() {
147 let idx: FtsIndex<MemoryBackend> = FtsIndex::new(MemoryBackend::new());
148 let offsets = idx.offsets("The quick brown fox", "brown");
149 assert_eq!(offsets.len(), 1);
150 assert_eq!(offsets[0].start, 10);
151 assert_eq!(offsets[0].end, 15);
152 }
153
154 #[test]
155 fn word_boundary_iter() {
156 let iter = super::WordBoundaryIter::new("hello, world! foo-bar");
157 let words: Vec<_> = iter.collect();
158 assert!(words.iter().any(|(_, w)| *w == "hello"));
159 assert!(words.iter().any(|(_, w)| *w == "world"));
160 assert!(words.iter().any(|(_, w)| *w == "foo-bar"));
161 }
162}