Skip to main content

folio_pdf/text/
search.rs

//! PDF text search — find text patterns across document pages.
//!
//! Supports literal and regex matching, with optional case sensitivity,
//! whole-word matching, page-range limits, and a cap on the number of results.
4
5use super::TextExtractor;
6use crate::core::{FolioError, Result};
7use crate::cos::CosDoc;
8use crate::doc::PdfDoc;
9use regex::Regex;
10
/// Options controlling how a text search is performed.
///
/// Every field is public, so the struct can be built directly; the
/// builder-style methods on this type offer a chainable alternative.
#[derive(Clone, Debug)]
pub struct SearchOptions {
    /// Interpret the pattern as a regular expression rather than literal text.
    pub regex: bool,
    /// Match case exactly. `false` (the default) ignores case.
    pub case_sensitive: bool,
    /// Only report matches delimited by word boundaries.
    pub whole_word: bool,
    /// First page to search, 1-based (default: 1).
    pub start_page: u32,
    /// Last page to search, inclusive. `0` (the default) means "through the
    /// last page of the document".
    pub end_page: u32,
    /// Upper bound on the number of results returned; `0` (the default)
    /// means unlimited.
    pub max_results: usize,
}
27
28impl Default for SearchOptions {
29    fn default() -> Self {
30        Self {
31            regex: false,
32            case_sensitive: false,
33            whole_word: false,
34            start_page: 1,
35            end_page: 0,
36            max_results: 0,
37        }
38    }
39}
40
41impl SearchOptions {
42    pub fn new() -> Self {
43        Self::default()
44    }
45
46    pub fn regex(mut self, enabled: bool) -> Self {
47        self.regex = enabled;
48        self
49    }
50
51    pub fn case_sensitive(mut self, enabled: bool) -> Self {
52        self.case_sensitive = enabled;
53        self
54    }
55
56    pub fn whole_word(mut self, enabled: bool) -> Self {
57        self.whole_word = enabled;
58        self
59    }
60
61    pub fn pages(mut self, start: u32, end: u32) -> Self {
62        self.start_page = start;
63        self.end_page = end;
64        self
65    }
66
67    pub fn max_results(mut self, max: usize) -> Self {
68        self.max_results = max;
69        self
70    }
71}
72
/// One occurrence of the search pattern within a document.
#[derive(Clone, Debug)]
pub struct SearchResult {
    /// Page on which the match was found (1-based).
    pub page_num: u32,
    /// The exact text that matched the pattern.
    pub match_text: String,
    /// Byte offset at which the match starts within the page's extracted text.
    pub offset: usize,
    /// A snippet of the page text surrounding the match.
    pub context: String,
}
85
/// Entry point for searching the text of a PDF document.
///
/// This is a field-less unit type; its functionality is exposed through
/// associated functions.
pub struct TextSearch;
88
89impl TextSearch {
90    /// Search for a pattern in a document.
91    ///
92    /// Returns all matches across the specified page range.
93    pub fn search(
94        doc: &mut PdfDoc,
95        pattern: &str,
96        options: &SearchOptions,
97    ) -> Result<Vec<SearchResult>> {
98        let compiled = compile_pattern(pattern, options)?;
99        let page_count = doc.page_count()?;
100
101        let start = options.start_page.max(1);
102        let end = if options.end_page == 0 || options.end_page > page_count {
103            page_count
104        } else {
105            options.end_page
106        };
107
108        let mut results = Vec::new();
109
110        for page_num in start..=end {
111            let page = match doc.get_page(page_num) {
112                Ok(p) => p,
113                Err(_) => continue,
114            };
115
116            let text = match TextExtractor::extract_from_page(&page, doc.cos_mut()) {
117                Ok(t) => t,
118                Err(_) => continue,
119            };
120
121            if text.is_empty() {
122                continue;
123            }
124
125            for mat in compiled.find_iter(&text) {
126                let match_text = mat.as_str().to_string();
127                let offset = mat.start();
128
129                // Build context: ~40 chars before and after
130                let ctx_start = text[..offset]
131                    .char_indices()
132                    .rev()
133                    .nth(40)
134                    .map(|(i, _)| i)
135                    .unwrap_or(0);
136                let ctx_end = text[mat.end()..]
137                    .char_indices()
138                    .nth(40)
139                    .map(|(i, _)| mat.end() + i)
140                    .unwrap_or(text.len());
141                let context = text[ctx_start..ctx_end].to_string();
142
143                results.push(SearchResult {
144                    page_num,
145                    match_text,
146                    offset,
147                    context,
148                });
149
150                if options.max_results > 0 && results.len() >= options.max_results {
151                    return Ok(results);
152                }
153            }
154        }
155
156        Ok(results)
157    }
158
159    /// Quick check: does the pattern appear anywhere in the document?
160    pub fn contains(doc: &mut PdfDoc, pattern: &str) -> Result<bool> {
161        let options = SearchOptions::new().max_results(1);
162        let results = Self::search(doc, pattern, &options)?;
163        Ok(!results.is_empty())
164    }
165
166    /// Count total occurrences of pattern across all pages.
167    pub fn count(doc: &mut PdfDoc, pattern: &str) -> Result<usize> {
168        let results = Self::search(doc, pattern, &SearchOptions::new())?;
169        Ok(results.len())
170    }
171
172    /// Search with regex pattern.
173    pub fn search_regex(
174        doc: &mut PdfDoc,
175        pattern: &str,
176        options: &SearchOptions,
177    ) -> Result<Vec<SearchResult>> {
178        let mut opts = options.clone();
179        opts.regex = true;
180        Self::search(doc, pattern, &opts)
181    }
182}
183
184/// Compile the search pattern into a regex.
185fn compile_pattern(pattern: &str, options: &SearchOptions) -> Result<Regex> {
186    let regex_pattern = if options.regex {
187        pattern.to_string()
188    } else {
189        // Escape regex special characters for literal search
190        regex::escape(pattern)
191    };
192
193    // Apply whole-word matching
194    let regex_pattern = if options.whole_word {
195        format!(r"\b{}\b", regex_pattern)
196    } else {
197        regex_pattern
198    };
199
200    // Apply case sensitivity
201    let regex_pattern = if options.case_sensitive {
202        regex_pattern
203    } else {
204        format!("(?i){}", regex_pattern)
205    };
206
207    Regex::new(&regex_pattern)
208        .map_err(|e| FolioError::InvalidArgument(format!("Invalid search pattern: {}", e)))
209}
210
#[cfg(test)]
mod tests {
    use super::*;

    /// Default options give a literal, case-insensitive matcher.
    #[test]
    fn test_compile_literal() {
        let matcher = compile_pattern("hello world", &SearchOptions::new()).unwrap();
        // Upper-case input still matches because case is ignored by default.
        assert!(matcher.is_match("HELLO WORLD"));
        assert!(matcher.is_match("hello world"));
    }

    /// With case sensitivity enabled, only the exact casing matches.
    #[test]
    fn test_compile_case_sensitive() {
        let strict = SearchOptions::new().case_sensitive(true);
        let matcher = compile_pattern("Hello", &strict).unwrap();
        assert!(matcher.is_match("Hello"));
        assert!(!matcher.is_match("hello"));
    }

    /// Whole-word mode rejects occurrences embedded in a longer word.
    #[test]
    fn test_compile_whole_word() {
        let words_only = SearchOptions::new().whole_word(true);
        let matcher = compile_pattern("the", &words_only).unwrap();
        assert!(matcher.is_match("the cat"));
        assert!(!matcher.is_match("other"));
    }

    /// Regex mode passes the pattern through without escaping.
    #[test]
    fn test_compile_regex() {
        let as_regex = SearchOptions::new().regex(true);
        let matcher = compile_pattern(r"\d{3}-\d{4}", &as_regex).unwrap();
        assert!(matcher.is_match("Call 555-1234 now"));
        assert!(!matcher.is_match("no numbers here"));
    }

    /// Regex metacharacters in a literal pattern are matched verbatim.
    #[test]
    fn test_escape_special_chars() {
        let matcher = compile_pattern("price: $10.00", &SearchOptions::new()).unwrap();
        assert!(matcher.is_match("The price: $10.00 is final"));
    }
}