1use regex::Regex;
4
5use crate::geometry::BBox;
6
/// Controls how the pattern given to [`search_chars`] is interpreted.
///
/// The [`Default`] value enables both flags.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SearchOptions {
    /// When `true` the pattern is compiled as a regular expression; when
    /// `false` it is escaped first so that it matches literally.
    pub regex: bool,
    /// When `false` the search ignores case (the pattern is compiled with the
    /// `(?i)` flag); when `true` the pattern is used as given.
    pub case_sensitive: bool,
}
17
18impl Default for SearchOptions {
19 fn default() -> Self {
20 Self {
21 regex: true,
22 case_sensitive: true,
23 }
24 }
25}
26
27#[derive(Debug, Clone, PartialEq)]
29#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
30pub struct SearchMatch {
31 pub text: String,
33 pub bbox: BBox,
35 pub page_number: usize,
37 pub char_indices: Vec<usize>,
39}
40
41pub fn search_chars(
60 chars: &[crate::text::Char],
61 pattern: &str,
62 options: &SearchOptions,
63 page_number: usize,
64) -> Vec<SearchMatch> {
65 if chars.is_empty() || pattern.is_empty() {
66 return Vec::new();
67 }
68
69 let mut full_text = String::new();
72 let mut byte_to_char_idx: Vec<usize> = Vec::new();
74
75 for (i, ch) in chars.iter().enumerate() {
76 let start = full_text.len();
77 full_text.push_str(&ch.text);
78 let end = full_text.len();
79 for _ in start..end {
80 byte_to_char_idx.push(i);
81 }
82 }
83
84 let regex_pattern = if options.regex {
86 if options.case_sensitive {
87 pattern.to_string()
88 } else {
89 format!("(?i){pattern}")
90 }
91 } else {
92 let escaped = regex::escape(pattern);
93 if options.case_sensitive {
94 escaped
95 } else {
96 format!("(?i){escaped}")
97 }
98 };
99
100 let re = match Regex::new(®ex_pattern) {
101 Ok(re) => re,
102 Err(_) => return Vec::new(),
103 };
104
105 let mut results = Vec::new();
106
107 for m in re.find_iter(&full_text) {
108 let match_start = m.start();
109 let match_end = m.end();
110
111 if match_start >= byte_to_char_idx.len() || match_end == 0 {
112 continue;
113 }
114
115 let mut char_indices: Vec<usize> = Vec::new();
117 for byte_offset in match_start..match_end {
118 if byte_offset < byte_to_char_idx.len() {
119 let idx = byte_to_char_idx[byte_offset];
120 if char_indices.last() != Some(&idx) {
121 char_indices.push(idx);
122 }
123 }
124 }
125
126 if char_indices.is_empty() {
127 continue;
128 }
129
130 let mut bbox = chars[char_indices[0]].bbox;
132 for &idx in &char_indices[1..] {
133 bbox = bbox.union(&chars[idx].bbox);
134 }
135
136 results.push(SearchMatch {
137 text: m.as_str().to_string(),
138 bbox,
139 page_number,
140 char_indices,
141 });
142 }
143
144 results
145}
146
#[cfg(test)]
mod tests {
    use super::*;
    use crate::text::{Char, TextDirection};

    /// Builds a test char with the given text and bounding box; every other
    /// field receives a fixed placeholder value.
    fn make_char(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> Char {
        Char {
            text: String::from(text),
            bbox: BBox::new(x0, top, x1, bottom),
            fontname: String::from("TestFont"),
            size: 12.0,
            doctop: top,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
            mcid: None,
            tag: None,
        }
    }

    /// Places `(text, x0, x1)` glyphs on a single line spanning
    /// `top = 100.0` to `bottom = 112.0`.
    fn line(glyphs: &[(&str, f64, f64)]) -> Vec<Char> {
        glyphs
            .iter()
            .map(|&(text, x0, x1)| make_char(text, x0, 100.0, x1, 112.0))
            .collect()
    }

    /// The chars of "Hello World"; tests that need a shorter text take a
    /// prefix of this slice.
    fn hello_world() -> Vec<Char> {
        line(&[
            ("H", 10.0, 18.0),
            ("e", 18.0, 26.0),
            ("l", 26.0, 30.0),
            ("l", 30.0, 34.0),
            ("o", 34.0, 42.0),
            (" ", 42.0, 46.0),
            ("W", 46.0, 56.0),
            ("o", 56.0, 64.0),
            ("r", 64.0, 70.0),
            ("l", 70.0, 74.0),
            ("d", 74.0, 82.0),
        ])
    }

    /// The chars of "Hi".
    fn hi() -> Vec<Char> {
        line(&[("H", 10.0, 18.0), ("i", 18.0, 26.0)])
    }

    /// The chars of "AB".
    fn ab_upper() -> Vec<Char> {
        line(&[("A", 10.0, 20.0), ("B", 20.0, 30.0)])
    }

    /// Default options with regex interpretation switched off.
    fn literal() -> SearchOptions {
        SearchOptions {
            regex: false,
            ..Default::default()
        }
    }

    #[test]
    fn search_options_defaults() {
        let defaults = SearchOptions::default();
        assert!(defaults.regex);
        assert!(defaults.case_sensitive);
    }

    #[test]
    fn simple_string_search() {
        let chars = hello_world();
        let found = search_chars(&chars, "World", &literal(), 0);

        assert_eq!(found.len(), 1);
        let hit = &found[0];
        assert_eq!(hit.text, "World");
        assert_eq!(hit.page_number, 0);
        assert_eq!(hit.char_indices, vec![6, 7, 8, 9, 10]);
        assert_eq!(hit.bbox, BBox::new(46.0, 100.0, 82.0, 112.0));
    }

    #[test]
    fn regex_search() {
        // "Hello W"
        let chars = hello_world();
        let found = search_chars(&chars[..7], "H.llo", &SearchOptions::default(), 0);

        assert_eq!(found.len(), 1);
        let hit = &found[0];
        assert_eq!(hit.text, "Hello");
        assert_eq!(hit.char_indices, vec![0, 1, 2, 3, 4]);
        assert_eq!(hit.bbox, BBox::new(10.0, 100.0, 42.0, 112.0));
    }

    #[test]
    fn case_insensitive_search() {
        // "Hello"
        let chars = hello_world();
        let opts = SearchOptions {
            regex: false,
            case_sensitive: false,
        };
        let found = search_chars(&chars[..5], "hello", &opts, 0);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "Hello");
    }

    #[test]
    fn case_sensitive_no_match() {
        let opts = SearchOptions {
            regex: false,
            case_sensitive: true,
        };
        let found = search_chars(&hi(), "hi", &opts, 0);

        assert!(found.is_empty());
    }

    #[test]
    fn multi_word_match_bbox() {
        // "Hello Wo"
        let chars = hello_world();
        let found = search_chars(&chars[..8], "lo Wo", &literal(), 0);

        assert_eq!(found.len(), 1);
        let hit = &found[0];
        assert_eq!(hit.text, "lo Wo");
        assert_eq!(hit.char_indices, vec![3, 4, 5, 6, 7]);
        assert_eq!(hit.bbox, BBox::new(30.0, 100.0, 64.0, 112.0));
    }

    #[test]
    fn no_matches_returns_empty() {
        let found = search_chars(&ab_upper(), "XYZ", &SearchOptions::default(), 0);

        assert!(found.is_empty());
    }

    #[test]
    fn empty_chars_returns_empty() {
        let found = search_chars(&[], "test", &SearchOptions::default(), 0);
        assert!(found.is_empty());
    }

    #[test]
    fn empty_pattern_returns_empty() {
        let chars = line(&[("A", 10.0, 20.0)]);
        let found = search_chars(&chars, "", &SearchOptions::default(), 0);
        assert!(found.is_empty());
    }

    #[test]
    fn multiple_matches() {
        let chars = line(&[
            ("a", 10.0, 18.0),
            ("b", 18.0, 26.0),
            ("a", 26.0, 34.0),
            ("b", 34.0, 42.0),
        ]);
        let found = search_chars(&chars, "ab", &literal(), 0);

        assert_eq!(found.len(), 2);
        assert_eq!(found[0].text, "ab");
        assert_eq!(found[0].char_indices, vec![0, 1]);
        assert_eq!(found[1].text, "ab");
        assert_eq!(found[1].char_indices, vec![2, 3]);
    }

    #[test]
    fn multiline_match_bbox() {
        // "AB" on the first line, "C" on a second line below it.
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 100.0, 30.0, 112.0),
            make_char("C", 10.0, 120.0, 20.0, 132.0),
        ];
        let found = search_chars(&chars, "ABC", &literal(), 0);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "ABC");
        assert_eq!(found[0].bbox, BBox::new(10.0, 100.0, 30.0, 132.0));
    }

    #[test]
    fn invalid_regex_returns_empty() {
        let chars = line(&[("A", 10.0, 20.0)]);
        let opts = SearchOptions {
            regex: true,
            ..Default::default()
        };
        let found = search_chars(&chars, "[invalid", &opts, 0);
        assert!(found.is_empty());
    }

    #[test]
    fn regex_case_insensitive() {
        let opts = SearchOptions {
            regex: true,
            case_sensitive: false,
        };
        let found = search_chars(&hi(), "h.", &opts, 0);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "Hi");
    }

    #[test]
    fn page_number_in_result() {
        let found = search_chars(&ab_upper(), "AB", &literal(), 5);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].page_number, 5);
    }
}