Skip to main content

pdfluent_extract/
search.rs

1//! Full-text search across PDF documents.
2//!
3//! Provides search functionality using extracted text and positioned characters.
4
5use crate::text;
6
/// A single search match together with its location.
///
/// Derives `PartialEq` so results can be compared directly (for example with
/// `assert_eq!`). `Eq` is not derived because the bounding boxes hold `f64`
/// values.
#[derive(Debug, Clone, PartialEq)]
pub struct SearchResult {
    /// The page number (1-based) where the match was found.
    pub page: u32,
    /// The matched text, taken from the page text rather than from the query.
    pub text: String,
    /// Bounding boxes of the matched characters, each a `[f64; 4]`.
    /// Empty when bounding-box extraction was skipped.
    pub bounding_boxes: Vec<[f64; 4]>,
    /// Character offset within the page text.
    pub offset: usize,
}
19
/// Options controlling a text search.
///
/// Derives `PartialEq` and `Eq` so option sets can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SearchOptions {
    /// Whether to perform case-insensitive search.
    pub case_insensitive: bool,
    /// Maximum number of results to return (0 = unlimited).
    pub max_results: usize,
    /// Specific pages to search, 1-based (empty = all pages).
    pub pages: Vec<u32>,
    /// Skip bounding box extraction for faster search when only text matches are needed.
    pub skip_bounding_boxes: bool,
}
32
33impl Default for SearchOptions {
34    fn default() -> Self {
35        Self {
36            case_insensitive: true,
37            max_results: 0,
38            pages: Vec::new(),
39            skip_bounding_boxes: false,
40        }
41    }
42}
43
44/// Search for text across all pages of a document.
45pub fn search_text(
46    doc: &lopdf::Document,
47    query: &str,
48    options: &SearchOptions,
49) -> Vec<SearchResult> {
50    if query.is_empty() {
51        return Vec::new();
52    }
53
54    let pages = doc.get_pages();
55    let total = pages.len() as u32;
56    let mut results = Vec::new();
57
58    // Determine which pages to search.
59    let page_nums: Vec<u32> = if options.pages.is_empty() {
60        (1..=total).collect()
61    } else {
62        options
63            .pages
64            .iter()
65            .copied()
66            .filter(|&p| p >= 1 && p <= total)
67            .collect()
68    };
69
70    let query_normalized = if options.case_insensitive {
71        query.to_lowercase()
72    } else {
73        query.to_string()
74    };
75
76    // Process pages lazily — extract text (and optionally positioned chars)
77    // one page at a time to avoid upfront cost on large documents.
78    for page_num in &page_nums {
79        let page_text = text::extract_page_text(doc, *page_num).unwrap_or_default();
80
81        let haystack = if options.case_insensitive {
82            page_text.to_lowercase()
83        } else {
84            page_text.clone()
85        };
86
87        // Only extract positioned chars when bounding boxes are actually needed.
88        let positioned = if options.skip_bounding_boxes {
89            Vec::new()
90        } else {
91            text::extract_positioned_chars(doc, *page_num).unwrap_or_default()
92        };
93
94        let mut start = 0;
95        while let Some(pos) = haystack[start..].find(&query_normalized) {
96            let offset = start + pos;
97            let end = offset + query_normalized.len();
98
99            let bboxes: Vec<[f64; 4]> = if options.skip_bounding_boxes {
100                Vec::new()
101            } else {
102                positioned
103                    .iter()
104                    .skip(offset)
105                    .take(end - offset)
106                    .map(|c| c.bbox)
107                    .collect()
108            };
109
110            let matched_text = page_text
111                .chars()
112                .skip(offset)
113                .take(query_normalized.len())
114                .collect::<String>();
115
116            results.push(SearchResult {
117                page: *page_num,
118                text: matched_text,
119                bounding_boxes: bboxes,
120                offset,
121            });
122
123            if options.max_results > 0 && results.len() >= options.max_results {
124                return results;
125            }
126
127            start = offset + 1;
128        }
129    }
130
131    results
132}
133
134/// Count the total number of occurrences of a query across all pages.
135pub fn count_occurrences(doc: &lopdf::Document, query: &str) -> usize {
136    count_text_only(doc, query, &SearchOptions::default())
137}
138
139/// Fast text-only occurrence count — skips all bounding box extraction.
140///
141/// Accepts `SearchOptions` for page filtering and case sensitivity,
142/// but ignores `max_results` (always counts all occurrences).
143pub fn count_text_only(doc: &lopdf::Document, query: &str, options: &SearchOptions) -> usize {
144    if query.is_empty() {
145        return 0;
146    }
147
148    let pages = doc.get_pages();
149    let total = pages.len() as u32;
150
151    let page_nums: Vec<u32> = if options.pages.is_empty() {
152        (1..=total).collect()
153    } else {
154        options
155            .pages
156            .iter()
157            .copied()
158            .filter(|&p| p >= 1 && p <= total)
159            .collect()
160    };
161
162    let query_normalized = if options.case_insensitive {
163        query.to_lowercase()
164    } else {
165        query.to_string()
166    };
167
168    let mut count = 0usize;
169    for page_num in &page_nums {
170        let page_text = text::extract_page_text(doc, *page_num).unwrap_or_default();
171        let haystack = if options.case_insensitive {
172            page_text.to_lowercase()
173        } else {
174            page_text
175        };
176
177        let mut start = 0;
178        while let Some(pos) = haystack[start..].find(&query_normalized) {
179            count += 1;
180            start += pos + 1;
181        }
182    }
183
184    count
185}
186
187/// Return a list of page numbers that contain the query text.
188pub fn pages_containing(doc: &lopdf::Document, query: &str) -> Vec<u32> {
189    if query.is_empty() {
190        return Vec::new();
191    }
192
193    let pages = doc.get_pages();
194    let total = pages.len() as u32;
195    let query_lower = query.to_lowercase();
196    let mut result = Vec::new();
197
198    for page_num in 1..=total {
199        let page_text = text::extract_page_text(doc, page_num).unwrap_or_default();
200        if page_text.to_lowercase().contains(&query_lower) {
201            result.push(page_num);
202        }
203    }
204
205    result
206}
207
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{dictionary, Document, Object, Stream};

    /// Helper: build a one-page document whose content stream is `content`.
    ///
    /// Equivalent to a multi-page document with a single entry, so it shares
    /// the construction code of [`make_multi_page_doc`].
    fn make_doc_with_text(content: &[u8]) -> Document {
        make_multi_page_doc(&[content])
    }

    /// Helper: build a document with one page per entry of `contents`.
    ///
    /// Objects are added in this order: each page's content stream followed by
    /// its page dictionary, then the page tree node, then the catalog.
    fn make_multi_page_doc(contents: &[&[u8]]) -> Document {
        let mut doc = Document::with_version("1.7");

        let page_ids: Vec<_> = contents
            .iter()
            .map(|content| {
                let stream = Stream::new(dictionary! {}, content.to_vec());
                let content_id = doc.add_object(Object::Stream(stream));

                let page = dictionary! {
                    "Type" => "Page",
                    "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
                    "Contents" => Object::Reference(content_id),
                };
                doc.add_object(Object::Dictionary(page))
            })
            .collect();

        let kids: Vec<Object> = page_ids.iter().copied().map(Object::Reference).collect();
        let page_tree = dictionary! {
            "Type" => "Pages",
            "Kids" => kids,
            "Count" => Object::Integer(page_ids.len() as i64),
        };
        let pages_id = doc.add_object(Object::Dictionary(page_tree));

        // The page tree node only exists now, so link each page to it afterwards.
        for id in page_ids.iter().copied() {
            if let Ok(Object::Dictionary(page)) = doc.get_object_mut(id) {
                page.set("Parent", Object::Reference(pages_id));
            }
        }

        let root = dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        };
        let root_id = doc.add_object(Object::Dictionary(root));
        doc.trailer.set("Root", Object::Reference(root_id));

        doc
    }

    /// A query present once on a single page yields exactly one hit on page 1.
    #[test]
    fn search_single_page() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello World) Tj ET");
        let hits = search_text(&doc, "Hello", &SearchOptions::default());
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].page, 1);
        assert_eq!(hits[0].text, "Hello");
    }

    /// With case folding on, a lowercase query matches and the reported text
    /// keeps the page's original casing.
    #[test]
    fn search_case_insensitive() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello World) Tj ET");
        let opts = SearchOptions {
            case_insensitive: true,
            ..Default::default()
        };
        let hits = search_text(&doc, "hello", &opts);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].text, "Hello");
    }

    /// With case folding off, a query differing only in case finds nothing.
    #[test]
    fn search_case_sensitive() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello World) Tj ET");
        let opts = SearchOptions {
            case_insensitive: false,
            ..Default::default()
        };
        let hits = search_text(&doc, "hello", &opts);
        assert!(hits.is_empty());
    }

    /// Matches are collected from every page of the document.
    #[test]
    fn search_multiple_pages() {
        let doc = make_multi_page_doc(&[
            b"BT /F1 12 Tf (Hello) Tj ET",
            b"BT /F1 12 Tf (Hello again) Tj ET",
        ]);
        let hits = search_text(&doc, "Hello", &SearchOptions::default());
        assert_eq!(hits.len(), 2);
    }

    /// A page filter restricts the search to the listed pages.
    #[test]
    fn search_specific_pages() {
        let doc = make_multi_page_doc(&[
            b"BT /F1 12 Tf (Hello) Tj ET",
            b"BT /F1 12 Tf (Hello again) Tj ET",
        ]);
        let opts = SearchOptions {
            pages: vec![1],
            ..Default::default()
        };
        let hits = search_text(&doc, "Hello", &opts);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].page, 1);
    }

    /// `max_results` caps the number of hits returned.
    #[test]
    fn search_max_results() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (aaa) Tj ET");
        let opts = SearchOptions {
            max_results: 1,
            ..Default::default()
        };
        let hits = search_text(&doc, "a", &opts);
        assert_eq!(hits.len(), 1);
    }

    /// An empty query never matches.
    #[test]
    fn search_empty_query() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
        let hits = search_text(&doc, "", &SearchOptions::default());
        assert!(hits.is_empty());
    }

    /// A query absent from the page yields no hits.
    #[test]
    fn search_no_match() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
        let hits = search_text(&doc, "xyz", &SearchOptions::default());
        assert!(hits.is_empty());
    }

    /// Repeated occurrences on one page are all counted.
    #[test]
    fn count_occurrences_basic() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (abcabc) Tj ET");
        assert_eq!(count_occurrences(&doc, "abc"), 2);
    }

    /// Only the pages that contain the query are reported.
    #[test]
    fn pages_containing_basic() {
        let doc = make_multi_page_doc(&[
            b"BT /F1 12 Tf (Hello) Tj ET",
            b"BT /F1 12 Tf (World) Tj ET",
            b"BT /F1 12 Tf (Hello World) Tj ET",
        ]);
        let found = pages_containing(&doc, "Hello");
        assert!(found.contains(&1));
        assert!(!found.contains(&2));
        assert!(found.contains(&3));
    }

    /// With default options every matched character gets a bounding box.
    #[test]
    fn search_results_have_bounding_boxes() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
        let hits = search_text(&doc, "Hello", &SearchOptions::default());
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].bounding_boxes.len(), 5); // 5 chars in "Hello"
    }
}