1use crate::analyzer::pipeline::analyze;
6use crate::backend::FtsBackend;
7use crate::index::FtsIndex;
8use crate::posting::MatchOffset;
9
10impl<B: FtsBackend> FtsIndex<B> {
11 pub fn highlight(&self, text: &str, query: &str, prefix: &str, suffix: &str) -> String {
16 let matches = self.find_query_matches(text, query);
17 if matches.is_empty() {
18 return text.to_string();
19 }
20
21 let mut result =
22 String::with_capacity(text.len() + matches.len() * (prefix.len() + suffix.len()) * 2);
23 let mut last_end = 0;
24
25 for m in &matches {
26 result.push_str(&text[last_end..m.start]);
27 result.push_str(prefix);
28 result.push_str(&text[m.start..m.end]);
29 result.push_str(suffix);
30 last_end = m.end;
31 }
32 result.push_str(&text[last_end..]);
33 result
34 }
35
36 pub fn offsets(&self, text: &str, query: &str) -> Vec<MatchOffset> {
38 self.find_query_matches(text, query)
39 }
40
41 fn find_query_matches(&self, text: &str, query: &str) -> Vec<MatchOffset> {
43 let query_tokens = analyze(query);
44 if query_tokens.is_empty() {
45 return Vec::new();
46 }
47
48 let query_set: std::collections::HashSet<&str> =
49 query_tokens.iter().map(String::as_str).collect();
50 let stemmer = rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English);
51
52 let mut matches = Vec::new();
53 for (start, word) in WordBoundaryIter::new(text) {
54 let lower = word.to_lowercase();
55 let stemmed = stemmer.stem(&lower);
56 if query_set.contains(stemmed.as_ref()) {
57 matches.push(MatchOffset {
58 start,
59 end: start + word.len(),
60 term: stemmed.into_owned(),
61 });
62 }
63 }
64 matches
65 }
66}
67
68struct WordBoundaryIter<'a> {
72 text: &'a str,
73 pos: usize,
74}
75
76impl<'a> WordBoundaryIter<'a> {
77 fn new(text: &'a str) -> Self {
78 Self { text, pos: 0 }
79 }
80}
81
82impl<'a> Iterator for WordBoundaryIter<'a> {
83 type Item = (usize, &'a str);
84
85 fn next(&mut self) -> Option<Self::Item> {
86 let bytes = self.text.as_bytes();
87 while self.pos < bytes.len() {
89 let c = self.text[self.pos..].chars().next()?;
90 if c.is_alphanumeric() || c == '-' || c == '_' {
91 break;
92 }
93 self.pos += c.len_utf8();
94 }
95 if self.pos >= bytes.len() {
96 return None;
97 }
98
99 let start = self.pos;
100 while self.pos < bytes.len() {
102 let Some(c) = self.text[self.pos..].chars().next() else {
103 break;
104 };
105 if c.is_alphanumeric() || c == '-' || c == '_' {
106 self.pos += c.len_utf8();
107 } else {
108 break;
109 }
110 }
111 let word = &self.text[start..self.pos];
112 let trimmed_start = start + word.len() - word.trim_start_matches(['-', '_']).len();
113 let trimmed_end = trimmed_start + word.trim_matches(['-', '_']).len();
114 if trimmed_start >= trimmed_end {
115 return self.next();
116 }
117 Some((trimmed_start, &self.text[trimmed_start..trimmed_end]))
118 }
119}
120
121#[cfg(test)]
122mod tests {
123 use crate::backend::memory::MemoryBackend;
124 use crate::index::FtsIndex;
125
126 #[test]
127 fn highlight_basic() {
128 let idx: FtsIndex<MemoryBackend> = FtsIndex::new(MemoryBackend::new());
129 let result = idx.highlight(
130 "The quick brown fox jumps over the lazy dog",
131 "brown fox",
132 "<b>",
133 "</b>",
134 );
135 assert!(result.contains("<b>brown</b>"));
136 assert!(result.contains("<b>fox</b>"));
137 }
138
139 #[test]
140 fn highlight_no_match() {
141 let idx: FtsIndex<MemoryBackend> = FtsIndex::new(MemoryBackend::new());
142 let text = "hello world";
143 let result = idx.highlight(text, "xyz", "<b>", "</b>");
144 assert_eq!(result, text);
145 }
146
147 #[test]
148 fn offsets_basic() {
149 let idx: FtsIndex<MemoryBackend> = FtsIndex::new(MemoryBackend::new());
150 let offsets = idx.offsets("The quick brown fox", "brown");
151 assert_eq!(offsets.len(), 1);
152 assert_eq!(offsets[0].start, 10);
153 assert_eq!(offsets[0].end, 15);
154 }
155
156 #[test]
157 fn word_boundary_iter() {
158 let iter = super::WordBoundaryIter::new("hello, world! foo-bar");
159 let words: Vec<_> = iter.collect();
160 assert!(words.iter().any(|(_, w)| *w == "hello"));
161 assert!(words.iter().any(|(_, w)| *w == "world"));
162 assert!(words.iter().any(|(_, w)| *w == "foo-bar"));
163 }
164}