1use crate::text;
6
/// One match reported by a text search.
#[derive(Clone, Debug)]
pub struct SearchResult {
    /// Number of the page the match was found on (searches number pages from 1).
    pub page: u32,
    /// The matched span as it appears in the page text, i.e. with the page's
    /// own casing rather than the casing of the query.
    pub text: String,
    /// One box per matched character, copied from the positioned characters
    /// of the page. Empty when bounding boxes were skipped.
    // NOTE(review): the meaning of the four numbers (presumably two corner
    // points) is defined by the `text` module — confirm there.
    pub bounding_boxes: Vec<[f64; 4]>,
    /// Position of the start of the match within the page's extracted text.
    // NOTE(review): confirm with the search routine whether callers should
    // read this as a character index or a byte index.
    pub offset: usize,
}
19
/// Knobs that control how a text search is carried out.
#[derive(Clone, Debug)]
pub struct SearchOptions {
    /// When `true`, both the query and the page text are lowercased before
    /// they are compared.
    pub case_insensitive: bool,
    /// Upper bound on the number of results returned by `search_text`;
    /// `0` means "no limit".
    pub max_results: usize,
    /// Page numbers to search. An empty list means every page; numbers
    /// outside `1..=page_count` are silently dropped.
    pub pages: Vec<u32>,
    /// When `true`, `search_text` does not extract positioned characters and
    /// leaves `SearchResult::bounding_boxes` empty.
    pub skip_bounding_boxes: bool,
}
32
33impl Default for SearchOptions {
34 fn default() -> Self {
35 Self {
36 case_insensitive: true,
37 max_results: 0,
38 pages: Vec::new(),
39 skip_bounding_boxes: false,
40 }
41 }
42}
43
44pub fn search_text(
46 doc: &lopdf::Document,
47 query: &str,
48 options: &SearchOptions,
49) -> Vec<SearchResult> {
50 if query.is_empty() {
51 return Vec::new();
52 }
53
54 let pages = doc.get_pages();
55 let total = pages.len() as u32;
56 let mut results = Vec::new();
57
58 let page_nums: Vec<u32> = if options.pages.is_empty() {
60 (1..=total).collect()
61 } else {
62 options
63 .pages
64 .iter()
65 .copied()
66 .filter(|&p| p >= 1 && p <= total)
67 .collect()
68 };
69
70 let query_normalized = if options.case_insensitive {
71 query.to_lowercase()
72 } else {
73 query.to_string()
74 };
75
76 for page_num in &page_nums {
79 let page_text = text::extract_page_text(doc, *page_num).unwrap_or_default();
80
81 let haystack = if options.case_insensitive {
82 page_text.to_lowercase()
83 } else {
84 page_text.clone()
85 };
86
87 let positioned = if options.skip_bounding_boxes {
89 Vec::new()
90 } else {
91 text::extract_positioned_chars(doc, *page_num).unwrap_or_default()
92 };
93
94 let mut start = 0;
95 while let Some(pos) = haystack[start..].find(&query_normalized) {
96 let offset = start + pos;
97 let end = offset + query_normalized.len();
98
99 let bboxes: Vec<[f64; 4]> = if options.skip_bounding_boxes {
100 Vec::new()
101 } else {
102 positioned
103 .iter()
104 .skip(offset)
105 .take(end - offset)
106 .map(|c| c.bbox)
107 .collect()
108 };
109
110 let matched_text = page_text
111 .chars()
112 .skip(offset)
113 .take(query_normalized.len())
114 .collect::<String>();
115
116 results.push(SearchResult {
117 page: *page_num,
118 text: matched_text,
119 bounding_boxes: bboxes,
120 offset,
121 });
122
123 if options.max_results > 0 && results.len() >= options.max_results {
124 return results;
125 }
126
127 start = offset + 1;
128 }
129 }
130
131 results
132}
133
134pub fn count_occurrences(doc: &lopdf::Document, query: &str) -> usize {
136 count_text_only(doc, query, &SearchOptions::default())
137}
138
139pub fn count_text_only(doc: &lopdf::Document, query: &str, options: &SearchOptions) -> usize {
144 if query.is_empty() {
145 return 0;
146 }
147
148 let pages = doc.get_pages();
149 let total = pages.len() as u32;
150
151 let page_nums: Vec<u32> = if options.pages.is_empty() {
152 (1..=total).collect()
153 } else {
154 options
155 .pages
156 .iter()
157 .copied()
158 .filter(|&p| p >= 1 && p <= total)
159 .collect()
160 };
161
162 let query_normalized = if options.case_insensitive {
163 query.to_lowercase()
164 } else {
165 query.to_string()
166 };
167
168 let mut count = 0usize;
169 for page_num in &page_nums {
170 let page_text = text::extract_page_text(doc, *page_num).unwrap_or_default();
171 let haystack = if options.case_insensitive {
172 page_text.to_lowercase()
173 } else {
174 page_text
175 };
176
177 let mut start = 0;
178 while let Some(pos) = haystack[start..].find(&query_normalized) {
179 count += 1;
180 start += pos + 1;
181 }
182 }
183
184 count
185}
186
187pub fn pages_containing(doc: &lopdf::Document, query: &str) -> Vec<u32> {
189 if query.is_empty() {
190 return Vec::new();
191 }
192
193 let pages = doc.get_pages();
194 let total = pages.len() as u32;
195 let query_lower = query.to_lowercase();
196 let mut result = Vec::new();
197
198 for page_num in 1..=total {
199 let page_text = text::extract_page_text(doc, page_num).unwrap_or_default();
200 if page_text.to_lowercase().contains(&query_lower) {
201 result.push(page_num);
202 }
203 }
204
205 result
206}
207
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{dictionary, Document, Object, Stream};

    /// Content stream showing the string "Hello".
    const HELLO: &[u8] = b"BT /F1 12 Tf (Hello) Tj ET";
    /// Content stream showing the string "Hello World".
    const HELLO_WORLD: &[u8] = b"BT /F1 12 Tf (Hello World) Tj ET";
    /// Content stream showing the string "Hello again".
    const HELLO_AGAIN: &[u8] = b"BT /F1 12 Tf (Hello again) Tj ET";

    /// Builds a one-page document whose content stream is `content`.
    fn make_doc_with_text(content: &[u8]) -> Document {
        make_multi_page_doc(&[content])
    }

    /// Builds a document with one page per entry of `contents`; each entry
    /// becomes the content stream of its page.
    fn make_multi_page_doc(contents: &[&[u8]]) -> Document {
        let mut doc = Document::with_version("1.7");

        // One content stream followed by its page dictionary, per entry.
        let page_ids: Vec<_> = contents
            .iter()
            .map(|bytes| {
                let stream = Stream::new(dictionary! {}, bytes.to_vec());
                let stream_id = doc.add_object(Object::Stream(stream));
                let page = dictionary! {
                    "Type" => "Page",
                    "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
                    "Contents" => Object::Reference(stream_id),
                };
                doc.add_object(Object::Dictionary(page))
            })
            .collect();

        let kids: Vec<Object> = page_ids.iter().copied().map(Object::Reference).collect();
        let tree = dictionary! {
            "Type" => "Pages",
            "Kids" => kids,
            "Count" => Object::Integer(page_ids.len() as i64),
        };
        let tree_id = doc.add_object(Object::Dictionary(tree));

        // The page tree id is only known now, so link the pages back to it.
        for id in page_ids.iter().copied() {
            if let Ok(Object::Dictionary(page)) = doc.get_object_mut(id) {
                page.set("Parent", Object::Reference(tree_id));
            }
        }

        let root = dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(tree_id),
        };
        let root_id = doc.add_object(Object::Dictionary(root));
        doc.trailer.set("Root", Object::Reference(root_id));

        doc
    }

    #[test]
    fn search_single_page() {
        let doc = make_doc_with_text(HELLO_WORLD);
        let hits = search_text(&doc, "Hello", &SearchOptions::default());
        assert_eq!(hits.len(), 1);
        let first = &hits[0];
        assert_eq!(first.page, 1);
        assert_eq!(first.text, "Hello");
    }

    #[test]
    fn search_case_insensitive() {
        let doc = make_doc_with_text(HELLO_WORLD);
        let opts = SearchOptions {
            case_insensitive: true,
            ..Default::default()
        };
        let hits = search_text(&doc, "hello", &opts);
        assert_eq!(hits.len(), 1);
        // The reported text keeps the casing found on the page.
        assert_eq!(hits[0].text, "Hello");
    }

    #[test]
    fn search_case_sensitive() {
        let doc = make_doc_with_text(HELLO_WORLD);
        let opts = SearchOptions {
            case_insensitive: false,
            ..Default::default()
        };
        let hits = search_text(&doc, "hello", &opts);
        assert!(hits.is_empty());
    }

    #[test]
    fn search_multiple_pages() {
        let doc = make_multi_page_doc(&[HELLO, HELLO_AGAIN]);
        let hits = search_text(&doc, "Hello", &SearchOptions::default());
        assert_eq!(hits.len(), 2);
    }

    #[test]
    fn search_specific_pages() {
        let doc = make_multi_page_doc(&[HELLO, HELLO_AGAIN]);
        let opts = SearchOptions {
            pages: vec![1],
            ..Default::default()
        };
        let hits = search_text(&doc, "Hello", &opts);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].page, 1);
    }

    #[test]
    fn search_max_results() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (aaa) Tj ET");
        let opts = SearchOptions {
            max_results: 1,
            ..Default::default()
        };
        let hits = search_text(&doc, "a", &opts);
        assert_eq!(hits.len(), 1);
    }

    #[test]
    fn search_empty_query() {
        let doc = make_doc_with_text(HELLO);
        let hits = search_text(&doc, "", &SearchOptions::default());
        assert!(hits.is_empty());
    }

    #[test]
    fn search_no_match() {
        let doc = make_doc_with_text(HELLO);
        let hits = search_text(&doc, "xyz", &SearchOptions::default());
        assert!(hits.is_empty());
    }

    #[test]
    fn count_occurrences_basic() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (abcabc) Tj ET");
        assert_eq!(count_occurrences(&doc, "abc"), 2);
    }

    #[test]
    fn pages_containing_basic() {
        let doc = make_multi_page_doc(&[HELLO, b"BT /F1 12 Tf (World) Tj ET", HELLO_WORLD]);
        let found = pages_containing(&doc, "Hello");
        assert!(found.contains(&1));
        assert!(!found.contains(&2));
        assert!(found.contains(&3));
    }

    #[test]
    fn search_results_have_bounding_boxes() {
        let doc = make_doc_with_text(HELLO);
        let hits = search_text(&doc, "Hello", &SearchOptions::default());
        assert_eq!(hits.len(), 1);
        // One box per matched character of "Hello".
        assert_eq!(hits[0].bounding_boxes.len(), 5);
    }
}