1#![deny(missing_docs)]
32
33pub mod engine;
34pub mod error;
35pub mod pdf;
36
37pub use error::PdfGrepError;
38
39use std::path::{Path, PathBuf};
40
/// A single search result produced while scanning a PDF.
///
/// The type is `#[non_exhaustive]`: code outside this crate cannot build it
/// with a struct literal and must use a `..` rest pattern when destructuring.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Match {
    /// Path of the searched file, as it was handed to the searcher.
    pub path: PathBuf,
    /// Page number the result belongs to.
    pub page: u32,
    /// The reported text.
    ///
    /// Depending on the searcher's options this is the matched substring
    /// (only-matching mode), the line(s) surrounding the hit, or, in inverted
    /// mode, the full text of a page on which the pattern did not match.
    pub text: String,
    /// Byte range `(start, end)` of the hit, relative to the start of `text`.
    ///
    /// Inverted-mode results carry `(0, 0)` because there is no hit to point at.
    pub byte_span: (usize, usize),
}
59
60pub struct PdfGrep {
62 engine: engine::Engine,
63 invert_match: bool,
64 only_matching: bool,
65 max_count: Option<usize>,
66 page_range: Option<(u32, u32)>,
67 passwords: Vec<String>,
68}
69
70impl PdfGrep {
71 pub fn search_file<'a>(&'a self, path: &Path) -> PageIterator<'a> {
77 PageIterator::new(self, path.to_path_buf())
78 }
79
80 pub fn search_file_collected(&self, path: &Path) -> Result<Vec<Match>, PdfGrepError> {
83 self.search_file(path).collect()
84 }
85
86 #[must_use]
88 pub fn invert_match(&self) -> bool {
89 self.invert_match
90 }
91
92 #[must_use]
94 pub fn only_matching(&self) -> bool {
95 self.only_matching
96 }
97
98 #[must_use]
100 pub fn max_count(&self) -> Option<usize> {
101 self.max_count
102 }
103
104 #[must_use]
106 pub fn page_range(&self) -> Option<(u32, u32)> {
107 self.page_range
108 }
109
110 #[must_use]
112 pub fn passwords(&self) -> &[String] {
113 &self.passwords
114 }
115}
116
117impl std::fmt::Debug for PdfGrep {
118 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
119 f.debug_struct("PdfGrep")
120 .field("invert_match", &self.invert_match)
121 .field("only_matching", &self.only_matching)
122 .field("max_count", &self.max_count)
123 .field("page_range", &self.page_range)
124 .field("passwords", &format!("<{} entries>", self.passwords.len()))
125 .finish()
126 }
127}
128
/// Builder for [`PdfGrep`].
///
/// Start from [`PdfGrepBuilder::new`] (or `Default`), chain the option
/// setters, and finish with [`PdfGrepBuilder::build`]. `Debug` is implemented
/// by hand so that password values are never printed.
#[derive(Clone, Default)]
pub struct PdfGrepBuilder {
    // Search pattern; `build` falls back to an empty pattern when unset.
    pattern: Option<String>,
    // The next three flags are forwarded unchanged to `engine::compile`.
    fixed_strings: bool,
    perl_regexp: bool,
    case_insensitive: bool,
    // Output options copied into the built `PdfGrep`.
    invert_match: bool,
    only_matching: bool,
    max_count: Option<usize>,
    page_range: Option<(u32, u32)>,
    // Passwords in the order they were added.
    passwords: Vec<String>,
}

// A derived `Debug` would print every password in clear text; show only the
// number of entries instead.
impl std::fmt::Debug for PdfGrepBuilder {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("PdfGrepBuilder")
            .field("pattern", &self.pattern)
            .field("fixed_strings", &self.fixed_strings)
            .field("perl_regexp", &self.perl_regexp)
            .field("case_insensitive", &self.case_insensitive)
            .field("invert_match", &self.invert_match)
            .field("only_matching", &self.only_matching)
            .field("max_count", &self.max_count)
            .field("page_range", &self.page_range)
            .field(
                "passwords",
                &format_args!("<{} entries>", self.passwords.len()),
            )
            .finish()
    }
}
144
145impl PdfGrepBuilder {
146 #[must_use]
148 pub fn new() -> Self {
149 Self::default()
150 }
151
152 #[must_use]
154 pub fn pattern(mut self, p: impl Into<String>) -> Self {
155 self.pattern = Some(p.into());
156 self
157 }
158
159 #[must_use]
161 pub fn fixed_strings(mut self, on: bool) -> Self {
162 self.fixed_strings = on;
163 self
164 }
165
166 #[must_use]
168 pub fn perl_regexp(mut self, on: bool) -> Self {
169 self.perl_regexp = on;
170 self
171 }
172
173 #[must_use]
175 pub fn case_insensitive(mut self, on: bool) -> Self {
176 self.case_insensitive = on;
177 self
178 }
179
180 #[must_use]
182 pub fn invert_match(mut self, on: bool) -> Self {
183 self.invert_match = on;
184 self
185 }
186
187 #[must_use]
189 pub fn only_matching(mut self, on: bool) -> Self {
190 self.only_matching = on;
191 self
192 }
193
194 #[must_use]
196 pub fn max_count(mut self, n: Option<usize>) -> Self {
197 self.max_count = n;
198 self
199 }
200
201 #[must_use]
203 pub fn page_range(mut self, range: Option<(u32, u32)>) -> Self {
204 self.page_range = range;
205 self
206 }
207
208 #[must_use]
211 pub fn password(mut self, pwd: impl Into<String>) -> Self {
212 self.passwords.push(pwd.into());
213 self
214 }
215
216 pub fn build(self) -> Result<PdfGrep, PdfGrepError> {
225 let pattern = self.pattern.unwrap_or_default();
226 let engine = engine::compile(
227 &pattern,
228 self.fixed_strings,
229 self.perl_regexp,
230 self.case_insensitive,
231 )?;
232 if let Some((start, end)) = self.page_range {
233 if start > end {
234 return Err(PdfGrepError::PageRange {
235 value: format!("{start}-{end}"),
236 });
237 }
238 }
239 Ok(PdfGrep {
240 engine,
241 invert_match: self.invert_match,
242 only_matching: self.only_matching,
243 max_count: self.max_count,
244 page_range: self.page_range,
245 passwords: self.passwords,
246 })
247 }
248}
249
250pub struct PageIterator<'a> {
257 grep: &'a PdfGrep,
258 path: PathBuf,
259 doc: Option<pdf::PdfDocument>,
260 init_error: Option<PdfGrepError>,
261 page_idx: usize,
262 page_numbers: Vec<u32>,
263 current_text: Option<String>,
264 current_matches: Vec<(usize, usize)>,
265 current_match_idx: usize,
266 yielded: usize,
267 started: bool,
268}
269
270impl<'a> PageIterator<'a> {
271 fn new(grep: &'a PdfGrep, path: PathBuf) -> Self {
272 PageIterator {
273 grep,
274 path,
275 doc: None,
276 init_error: None,
277 page_idx: 0,
278 page_numbers: Vec::new(),
279 current_text: None,
280 current_matches: Vec::new(),
281 current_match_idx: 0,
282 yielded: 0,
283 started: false,
284 }
285 }
286
287 fn ensure_started(&mut self) {
288 if self.started {
289 return;
290 }
291 self.started = true;
292 match pdf::PdfDocument::open(&self.path, &self.grep.passwords) {
293 Ok(doc) => {
294 let mut nums: Vec<u32> = doc.page_numbers().to_vec();
295 if let Some((start, end)) = self.grep.page_range {
296 nums.retain(|&n| n >= start && n <= end);
297 }
298 self.page_numbers = nums;
299 self.doc = Some(doc);
300 }
301 Err(e) => {
302 self.init_error = Some(e);
303 }
304 }
305 }
306}
307
308impl Iterator for PageIterator<'_> {
309 type Item = Result<Match, PdfGrepError>;
310
311 fn next(&mut self) -> Option<Self::Item> {
312 self.ensure_started();
313 if let Some(err) = self.init_error.take() {
314 return Some(Err(err));
315 }
316 let doc = self.doc.as_ref()?;
317
318 if let Some(cap) = self.grep.max_count {
320 if self.yielded >= cap {
321 return None;
322 }
323 }
324
325 loop {
326 if let Some(text) = &self.current_text {
328 if self.current_match_idx < self.current_matches.len() {
329 let (start, end) = self.current_matches[self.current_match_idx];
330 self.current_match_idx += 1;
331 let line = containing_line(text, start, end);
333 let (line_start, line_end) = line;
334 let line_text = text[line_start..line_end].to_string();
335 let span_in_line = (start - line_start, end - line_start);
336 let m = Match {
337 path: self.path.clone(),
338 page: self
339 .page_numbers
340 .get(self.page_idx - 1)
341 .copied()
342 .unwrap_or(0),
343 text: if self.grep.only_matching {
344 text[start..end].to_string()
345 } else {
346 line_text
347 },
348 byte_span: if self.grep.only_matching {
349 (0, end - start)
350 } else {
351 span_in_line
352 },
353 };
354 self.yielded += 1;
355 return Some(Ok(m));
356 }
357 self.current_text = None;
359 self.current_matches.clear();
360 self.current_match_idx = 0;
361 }
362
363 if self.page_idx >= self.page_numbers.len() {
365 return None;
366 }
367 let page = self.page_numbers[self.page_idx];
368 self.page_idx += 1;
369 match doc.extract_page(page) {
370 Ok(text) => {
371 let matches = self.grep.engine.find_all(&text);
372 if self.grep.invert_match {
373 if matches.is_empty() && !text.is_empty() {
377 let m = Match {
379 path: self.path.clone(),
380 page,
381 text: text.clone(),
382 byte_span: (0, 0),
383 };
384 self.yielded += 1;
385 return Some(Ok(m));
386 }
387 continue;
388 }
389 self.current_text = Some(text);
390 self.current_matches = matches;
391 self.current_match_idx = 0;
392 }
393 Err(msg) => {
394 eprintln!("rusty-pdfgrep: {}: {msg}", self.path.display());
395 continue;
396 }
397 }
398 }
399 }
400}
401
/// Returns the byte range `(line_start, line_end)` of the line(s) of `text`
/// that contain the span `match_start..match_end`.
///
/// `line_start` is the offset just after the last `'\n'` before the span (or
/// `0`), and `line_end` is the offset of the first `'\n'` at or after the end
/// of the span (or `text.len()`), so the range excludes the terminating
/// newline. A span that crosses newlines yields a range covering all of the
/// lines it touches.
///
/// The function never panics. Offsets beyond the end of `text` are clamped to
/// `text.len()`, and a reversed span is treated as empty at `match_start`.
/// The search runs on bytes, so an offset that falls inside a multi-byte
/// character is tolerated. Both returned offsets are `0`, `text.len()`, or
/// adjacent to a `'\n'`, so they are always valid `str` slice boundaries.
fn containing_line(text: &str, match_start: usize, match_end: usize) -> (usize, usize) {
    let bytes = text.as_bytes();
    let start = match_start.min(bytes.len());
    let end = match_end.clamp(start, bytes.len());
    // 0x0A never occurs inside a multi-byte UTF-8 sequence, so a byte-wise
    // search finds exactly the newlines that `str::find('\n')` would.
    let line_start = bytes[..start]
        .iter()
        .rposition(|&b| b == b'\n')
        .map_or(0, |i| i + 1);
    let line_end = bytes[end..]
        .iter()
        .position(|&b| b == b'\n')
        .map_or(bytes.len(), |i| end + i);
    (line_start, line_end)
}
413
#[cfg(test)]
mod tests {
    use super::*;
    use static_assertions::assert_impl_all;

    // Compile-time guarantees about the auto traits of the public types.
    assert_impl_all!(PdfGrep: Send);
    assert_impl_all!(PdfGrepBuilder: Send, Sync);
    assert_impl_all!(Match: Send, Sync);
    assert_impl_all!(PdfGrepError: Send, Sync);

    /// A builder without a pattern still builds.
    #[test]
    fn builder_requires_no_pattern_to_build() {
        assert!(PdfGrepBuilder::new().build().is_ok());
    }

    /// An unparsable pattern surfaces as a regex-compile error.
    #[test]
    fn builder_invalid_regex_returns_err() {
        let outcome = PdfGrepBuilder::new().pattern("[invalid").build();
        assert!(matches!(outcome, Err(PdfGrepError::RegexCompile { .. })));
    }

    /// A page range whose start lies after its end is rejected.
    #[test]
    fn builder_reverse_page_range_returns_err() {
        let outcome = PdfGrepBuilder::new()
            .pattern("x")
            .page_range(Some((5, 3)))
            .build();
        assert!(matches!(outcome, Err(PdfGrepError::PageRange { .. })));
    }

    /// Passwords are kept in the order in which they were added.
    #[test]
    fn builder_password_appends_in_order() {
        let mut builder = PdfGrepBuilder::new().pattern("x");
        for pwd in ["a", "b", "c"] {
            builder = builder.password(pwd);
        }
        let searcher = builder.build().unwrap();
        assert_eq!(searcher.passwords(), &["a", "b", "c"]);
    }

    /// The returned range covers exactly the line around the span.
    #[test]
    fn containing_line_extracts_correctly() {
        let text = "first line\nsecond match here\nthird line";
        let (from, to) = containing_line(text, 18, 23);
        assert_eq!(&text[from..to], "second match here");
    }

    /// Without any newline the whole text is the line.
    #[test]
    fn containing_line_no_newlines_returns_full_text() {
        let text = "single line no newlines";
        let range = containing_line(text, 7, 11);
        assert_eq!(range, (0, text.len()));
    }
}