Skip to main content

pdfplumber_core/
search.rs

1//! Text search with position — find text patterns and return matches with bounding boxes.
2
3use regex::Regex;
4
5use crate::geometry::BBox;
6
/// Settings that determine how a search pattern is interpreted and compared.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SearchOptions {
    /// `true` (the default) compiles the pattern as a regular expression;
    /// `false` matches the pattern text literally.
    pub regex: bool,
    /// `true` (the default) distinguishes upper case from lower case;
    /// `false` ignores case while matching.
    pub case_sensitive: bool,
}
17
18impl Default for SearchOptions {
19    fn default() -> Self {
20        Self {
21            regex: true,
22            case_sensitive: true,
23        }
24    }
25}
26
27/// A single text search match with its bounding box and position information.
28#[derive(Debug, Clone, PartialEq)]
29#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
30pub struct SearchMatch {
31    /// The matched text.
32    pub text: String,
33    /// Union bounding box of all constituent characters.
34    pub bbox: BBox,
35    /// Page number (0-indexed).
36    pub page_number: usize,
37    /// Indices into the page's char array for the matched characters.
38    pub char_indices: Vec<usize>,
39}
40
41/// Search for a pattern in a sequence of characters and return matches with bounding boxes.
42///
43/// The algorithm:
44/// 1. Concatenate all char texts into a single string, tracking byte-offset → char-index mapping.
45/// 2. Run the pattern (regex or literal) against the concatenated string.
46/// 3. For each match, determine which chars contribute and compute the union bbox.
47///
48/// # Arguments
49///
50/// * `chars` - The characters to search within (from a page).
51/// * `pattern` - The search pattern (regex or literal string).
52/// * `options` - Search options (regex mode, case sensitivity).
53/// * `page_number` - The page number for the returned matches.
54///
55/// # Returns
56///
57/// A vector of [`SearchMatch`] with bounding boxes computed from constituent chars.
58/// Returns an empty vector if the pattern is invalid or no matches are found.
59pub fn search_chars(
60    chars: &[crate::text::Char],
61    pattern: &str,
62    options: &SearchOptions,
63    page_number: usize,
64) -> Vec<SearchMatch> {
65    if chars.is_empty() || pattern.is_empty() {
66        return Vec::new();
67    }
68
69    // Build the concatenated text and mapping from byte offset to char index.
70    // Each char's text maps to a range of byte offsets in the concatenated string.
71    let mut full_text = String::new();
72    // byte_to_char_idx[byte_offset] = index into chars array
73    let mut byte_to_char_idx: Vec<usize> = Vec::new();
74
75    for (i, ch) in chars.iter().enumerate() {
76        let start = full_text.len();
77        full_text.push_str(&ch.text);
78        let end = full_text.len();
79        for _ in start..end {
80            byte_to_char_idx.push(i);
81        }
82    }
83
84    // Build the regex pattern
85    let regex_pattern = if options.regex {
86        if options.case_sensitive {
87            pattern.to_string()
88        } else {
89            format!("(?i){pattern}")
90        }
91    } else {
92        let escaped = regex::escape(pattern);
93        if options.case_sensitive {
94            escaped
95        } else {
96            format!("(?i){escaped}")
97        }
98    };
99
100    let re = match Regex::new(&regex_pattern) {
101        Ok(re) => re,
102        Err(_) => return Vec::new(),
103    };
104
105    let mut results = Vec::new();
106
107    for m in re.find_iter(&full_text) {
108        let match_start = m.start();
109        let match_end = m.end();
110
111        if match_start >= byte_to_char_idx.len() || match_end == 0 {
112            continue;
113        }
114
115        // Collect unique char indices for this match
116        let mut char_indices: Vec<usize> = Vec::new();
117        for byte_offset in match_start..match_end {
118            if byte_offset < byte_to_char_idx.len() {
119                let idx = byte_to_char_idx[byte_offset];
120                if char_indices.last() != Some(&idx) {
121                    char_indices.push(idx);
122                }
123            }
124        }
125
126        if char_indices.is_empty() {
127            continue;
128        }
129
130        // Compute the union bbox of matched chars
131        let mut bbox = chars[char_indices[0]].bbox;
132        for &idx in &char_indices[1..] {
133            bbox = bbox.union(&chars[idx].bbox);
134        }
135
136        results.push(SearchMatch {
137            text: m.as_str().to_string(),
138            bbox,
139            page_number,
140            char_indices,
141        });
142    }
143
144    results
145}
146
#[cfg(test)]
mod tests {
    use super::*;
    use crate::text::{Char, TextDirection};

    /// Build a test glyph with the given text and box; every other field gets a
    /// fixed placeholder value.
    fn make_char(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> Char {
        Char {
            text: text.to_string(),
            bbox: BBox::new(x0, top, x1, bottom),
            fontname: "TestFont".to_string(),
            size: 12.0,
            doctop: top,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
            mcid: None,
            tag: None,
        }
    }

    /// Glyphs spelling "Hello World" on a single line (top 100, bottom 112).
    /// Tests that need a shorter text take a prefix of this fixture.
    fn hello_world() -> Vec<Char> {
        vec![
            make_char("H", 10.0, 100.0, 18.0, 112.0),
            make_char("e", 18.0, 100.0, 26.0, 112.0),
            make_char("l", 26.0, 100.0, 30.0, 112.0),
            make_char("l", 30.0, 100.0, 34.0, 112.0),
            make_char("o", 34.0, 100.0, 42.0, 112.0),
            make_char(" ", 42.0, 100.0, 46.0, 112.0),
            make_char("W", 46.0, 100.0, 56.0, 112.0),
            make_char("o", 56.0, 100.0, 64.0, 112.0),
            make_char("r", 64.0, 100.0, 70.0, 112.0),
            make_char("l", 70.0, 100.0, 74.0, 112.0),
            make_char("d", 74.0, 100.0, 82.0, 112.0),
        ]
    }

    /// Options for literal (non-regex) matching with default case sensitivity.
    fn literal() -> SearchOptions {
        SearchOptions {
            regex: false,
            ..Default::default()
        }
    }

    #[test]
    fn search_options_defaults() {
        let SearchOptions {
            regex,
            case_sensitive,
        } = SearchOptions::default();
        assert!(regex);
        assert!(case_sensitive);
    }

    #[test]
    fn simple_string_search() {
        // Literal "World" inside "Hello World".
        let chars = hello_world();
        let matches = search_chars(&chars, "World", &literal(), 0);

        assert_eq!(matches.len(), 1);
        let hit = &matches[0];
        assert_eq!(hit.text, "World");
        assert_eq!(hit.page_number, 0);
        assert_eq!(hit.char_indices, vec![6, 7, 8, 9, 10]);
        // Union of W..d: x0=46, top=100, x1=82, bottom=112
        assert_eq!(hit.bbox, BBox::new(46.0, 100.0, 82.0, 112.0));
    }

    #[test]
    fn regex_search() {
        // Regex "H.llo" against "Hello W".
        let chars = hello_world();
        let opts = SearchOptions::default(); // regex=true
        let matches = search_chars(&chars[..7], "H.llo", &opts, 0);

        assert_eq!(matches.len(), 1);
        let hit = &matches[0];
        assert_eq!(hit.text, "Hello");
        assert_eq!(hit.char_indices, vec![0, 1, 2, 3, 4]);
        assert_eq!(hit.bbox, BBox::new(10.0, 100.0, 42.0, 112.0));
    }

    #[test]
    fn case_insensitive_search() {
        // Lowercase "hello" must find "Hello" once case is ignored.
        let chars = hello_world();
        let opts = SearchOptions {
            regex: false,
            case_sensitive: false,
        };
        let matches = search_chars(&chars[..5], "hello", &opts, 0);

        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].text, "Hello");
    }

    #[test]
    fn case_sensitive_no_match() {
        let chars = vec![
            make_char("H", 10.0, 100.0, 18.0, 112.0),
            make_char("i", 18.0, 100.0, 26.0, 112.0),
        ];
        let opts = SearchOptions {
            regex: false,
            case_sensitive: true,
        };

        assert!(search_chars(&chars, "hi", &opts, 0).is_empty());
    }

    #[test]
    fn multi_word_match_bbox() {
        // "lo Wo" straddles the space in "Hello Wo".
        let chars = hello_world();
        let matches = search_chars(&chars[..8], "lo Wo", &literal(), 0);

        assert_eq!(matches.len(), 1);
        let hit = &matches[0];
        assert_eq!(hit.text, "lo Wo");
        // chars[3] (l: 30-34), [4] (o: 34-42), [5] (space: 42-46), [6] (W: 46-56), [7] (o: 56-64)
        assert_eq!(hit.char_indices, vec![3, 4, 5, 6, 7]);
        assert_eq!(hit.bbox, BBox::new(30.0, 100.0, 64.0, 112.0));
    }

    #[test]
    fn no_matches_returns_empty() {
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 100.0, 30.0, 112.0),
        ];

        assert!(search_chars(&chars, "XYZ", &SearchOptions::default(), 0).is_empty());
    }

    #[test]
    fn empty_chars_returns_empty() {
        assert!(search_chars(&[], "test", &SearchOptions::default(), 0).is_empty());
    }

    #[test]
    fn empty_pattern_returns_empty() {
        let chars = vec![make_char("A", 10.0, 100.0, 20.0, 112.0)];

        assert!(search_chars(&chars, "", &SearchOptions::default(), 0).is_empty());
    }

    #[test]
    fn multiple_matches() {
        // "abab" contains the literal "ab" twice.
        let chars = vec![
            make_char("a", 10.0, 100.0, 18.0, 112.0),
            make_char("b", 18.0, 100.0, 26.0, 112.0),
            make_char("a", 26.0, 100.0, 34.0, 112.0),
            make_char("b", 34.0, 100.0, 42.0, 112.0),
        ];
        let matches = search_chars(&chars, "ab", &literal(), 0);

        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].text, "ab");
        assert_eq!(matches[0].char_indices, vec![0, 1]);
        assert_eq!(matches[1].text, "ab");
        assert_eq!(matches[1].char_indices, vec![2, 3]);
    }

    #[test]
    fn multiline_match_bbox() {
        // "C" sits on a lower line than "A" and "B"; the box must cover both lines.
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 100.0, 30.0, 112.0),
            make_char("C", 10.0, 120.0, 20.0, 132.0),
        ];
        let matches = search_chars(&chars, "ABC", &literal(), 0);

        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].text, "ABC");
        // Union: x0=10, top=100, x1=30, bottom=132
        assert_eq!(matches[0].bbox, BBox::new(10.0, 100.0, 30.0, 132.0));
    }

    #[test]
    fn invalid_regex_returns_empty() {
        let chars = vec![make_char("A", 10.0, 100.0, 20.0, 112.0)];
        let opts = SearchOptions {
            regex: true,
            ..Default::default()
        };

        // The unclosed character class cannot compile.
        assert!(search_chars(&chars, "[invalid", &opts, 0).is_empty());
    }

    #[test]
    fn regex_case_insensitive() {
        let chars = vec![
            make_char("H", 10.0, 100.0, 18.0, 112.0),
            make_char("i", 18.0, 100.0, 26.0, 112.0),
        ];
        let opts = SearchOptions {
            regex: true,
            case_sensitive: false,
        };
        let matches = search_chars(&chars, "h.", &opts, 0);

        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].text, "Hi");
    }

    #[test]
    fn page_number_in_result() {
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 100.0, 30.0, 112.0),
        ];
        let matches = search_chars(&chars, "AB", &literal(), 5);

        assert_eq!(matches.len(), 1);
        assert_eq!(matches[0].page_number, 5);
    }
}