Skip to main content

rusty_pdfgrep/
lib.rs

1//! # rusty-pdfgrep
2//!
3//! A Rust port of Hans-Peter Deifel's `pdfgrep(1)` — grep through PDF files
4//! using page-level text extraction and pluggable regex engines.
5//!
6//! ## Quick start
7//!
8//! ```no_run
9//! use rusty_pdfgrep::PdfGrepBuilder;
10//! use std::path::Path;
11//!
12//! let pdfgrep = PdfGrepBuilder::new()
13//!     .pattern("force majeure")
14//!     .case_insensitive(true)
15//!     .build()
16//!     .unwrap();
17//!
18//! for result in pdfgrep.search_file(Path::new("contract.pdf")) {
19//!     let m = result.unwrap();
20//!     println!("{}:{}: {}", m.path.display(), m.page, m.text);
21//! }
22//! ```
23//!
24//! ## Stability
25//!
//! Library and binary share a single crate version. `lopdf` is pinned to the
//! 0.36 minor; the `regex` and `fancy-regex` engines are SemVer-stable. The
//! `PdfGrepError` and `Match` types are `#[non_exhaustive]` — downstream code
//! MUST include a wildcard `_` arm when matching on `PdfGrepError`, and MUST
//! use a `..` rest pattern (never a struct literal) when destructuring `Match`.
30
31#![deny(missing_docs)]
32
33pub mod engine;
34pub mod error;
35pub mod pdf;
36
37pub use error::PdfGrepError;
38
39use std::path::{Path, PathBuf};
40
/// One occurrence of the pattern on a PDF page (FR-040).
///
/// `byte_span` is relative to `Match.text`: the slice
/// `&text[byte_span.0..byte_span.1]` is the matched substring, and both
/// offsets are aligned to UTF-8 codepoint boundaries.
#[derive(Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub struct Match {
    /// Path of the PDF file in which the match was found.
    pub path: PathBuf,
    /// Page on which the match was found, counted from 1.
    pub page: u32,
    /// The line of extracted text that contains the match, i.e. the stretch
    /// of page text between two neighbouring line breaks. With
    /// `-o`/`only_matching` enabled on the runner this holds just the matched
    /// span instead.
    pub text: String,
    /// `(start, end)` byte offsets of the matched substring inside `Match.text`.
    pub byte_span: (usize, usize),
}
59
60/// Configured pattern matcher. Construct via [`PdfGrepBuilder`].
61pub struct PdfGrep {
62    engine: engine::Engine,
63    invert_match: bool,
64    only_matching: bool,
65    max_count: Option<usize>,
66    page_range: Option<(u32, u32)>,
67    passwords: Vec<String>,
68}
69
70impl PdfGrep {
71    /// Search a single PDF file. Returns an iterator yielding matches lazily,
72    /// one page of extraction work per `.next()` call (FR-042).
73    ///
74    /// Peak memory is bounded by `O(one page of text + match buffer)` —
75    /// never `O(whole document)`.
76    pub fn search_file<'a>(&'a self, path: &Path) -> PageIterator<'a> {
77        PageIterator::new(self, path.to_path_buf())
78    }
79
80    /// Convenience: run [`search_file`](Self::search_file) and collect into a
81    /// `Vec<Match>`. Eager; for large documents, prefer the iterator.
82    pub fn search_file_collected(&self, path: &Path) -> Result<Vec<Match>, PdfGrepError> {
83        self.search_file(path).collect()
84    }
85
86    /// True when `-v`/`--invert-match` is active.
87    #[must_use]
88    pub fn invert_match(&self) -> bool {
89        self.invert_match
90    }
91
92    /// True when `-o`/`--only-matching` is active.
93    #[must_use]
94    pub fn only_matching(&self) -> bool {
95        self.only_matching
96    }
97
98    /// Configured `-m N` cap.
99    #[must_use]
100    pub fn max_count(&self) -> Option<usize> {
101        self.max_count
102    }
103
104    /// Configured `--page-range N-M`.
105    #[must_use]
106    pub fn page_range(&self) -> Option<(u32, u32)> {
107        self.page_range
108    }
109
110    /// Configured `--password PWD` list (in flag order).
111    #[must_use]
112    pub fn passwords(&self) -> &[String] {
113        &self.passwords
114    }
115}
116
117impl std::fmt::Debug for PdfGrep {
118    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
119        f.debug_struct("PdfGrep")
120            .field("invert_match", &self.invert_match)
121            .field("only_matching", &self.only_matching)
122            .field("max_count", &self.max_count)
123            .field("page_range", &self.page_range)
124            .field("passwords", &format!("<{} entries>", self.passwords.len()))
125            .finish()
126    }
127}
128
/// Builder that produces a [`PdfGrep`] (FR-039). The setters do not depend on
/// one another and may be called in any order; `password(...)` is the only
/// repeatable one — every call appends to the retry list.
#[derive(Clone, Debug, Default)]
pub struct PdfGrepBuilder {
    // Search pattern; `build` falls back to the empty string when unset.
    pattern: Option<String>,
    // `-F`/`--fixed-strings` flag.
    fixed_strings: bool,
    // `-P`/`--perl-regexp` flag.
    perl_regexp: bool,
    // `-i`/`--ignore-case` flag.
    case_insensitive: bool,
    // `-v`/`--invert-match` flag.
    invert_match: bool,
    // `-o`/`--only-matching` flag.
    only_matching: bool,
    // `-m N`/`--max-count` cap.
    max_count: Option<usize>,
    // `--page-range N-M` bounds (1-indexed, inclusive).
    page_range: Option<(u32, u32)>,
    // `--password` values in the order they were supplied.
    passwords: Vec<String>,
}
144
145impl PdfGrepBuilder {
146    /// Fresh builder with all defaults applied.
147    #[must_use]
148    pub fn new() -> Self {
149        Self::default()
150    }
151
152    /// Required pattern (PCRE or fixed string per `fixed_strings`/`perl_regexp`).
153    #[must_use]
154    pub fn pattern(mut self, p: impl Into<String>) -> Self {
155        self.pattern = Some(p.into());
156        self
157    }
158
159    /// `-F`/`--fixed-strings` — escape metacharacters in `pattern`.
160    #[must_use]
161    pub fn fixed_strings(mut self, on: bool) -> Self {
162        self.fixed_strings = on;
163        self
164    }
165
166    /// `-P`/`--perl-regexp` — use `fancy-regex` engine instead of `regex`.
167    #[must_use]
168    pub fn perl_regexp(mut self, on: bool) -> Self {
169        self.perl_regexp = on;
170        self
171    }
172
173    /// `-i`/`--ignore-case`.
174    #[must_use]
175    pub fn case_insensitive(mut self, on: bool) -> Self {
176        self.case_insensitive = on;
177        self
178    }
179
180    /// `-v`/`--invert-match`.
181    #[must_use]
182    pub fn invert_match(mut self, on: bool) -> Self {
183        self.invert_match = on;
184        self
185    }
186
187    /// `-o`/`--only-matching` — `Match.text` is the matched span only.
188    #[must_use]
189    pub fn only_matching(mut self, on: bool) -> Self {
190        self.only_matching = on;
191        self
192    }
193
194    /// `-m N`/`--max-count` — stop after N matches.
195    #[must_use]
196    pub fn max_count(mut self, n: Option<usize>) -> Self {
197        self.max_count = n;
198        self
199    }
200
201    /// `--page-range N-M` (1-indexed inclusive).
202    #[must_use]
203    pub fn page_range(mut self, range: Option<(u32, u32)>) -> Self {
204        self.page_range = range;
205        self
206    }
207
208    /// `--password PWD` — repeatable; each call APPENDS to the retry list
209    /// (FR-025, FR-039 + Clarifications Q4).
210    #[must_use]
211    pub fn password(mut self, pwd: impl Into<String>) -> Self {
212        self.passwords.push(pwd.into());
213        self
214    }
215
216    /// Build a configured [`PdfGrep`]. FALLIBLE — regex compile failure maps
217    /// to `PdfGrepError::RegexCompile`; invalid `page_range` maps to
218    /// `PdfGrepError::PageRange`.
219    ///
220    /// # Errors
221    ///
222    /// - `PdfGrepError::RegexCompile` if the pattern fails to compile.
223    /// - `PdfGrepError::PageRange` if the page range is reverse (start > end).
224    pub fn build(self) -> Result<PdfGrep, PdfGrepError> {
225        let pattern = self.pattern.unwrap_or_default();
226        let engine = engine::compile(
227            &pattern,
228            self.fixed_strings,
229            self.perl_regexp,
230            self.case_insensitive,
231        )?;
232        if let Some((start, end)) = self.page_range {
233            if start > end {
234                return Err(PdfGrepError::PageRange {
235                    value: format!("{start}-{end}"),
236                });
237            }
238        }
239        Ok(PdfGrep {
240            engine,
241            invert_match: self.invert_match,
242            only_matching: self.only_matching,
243            max_count: self.max_count,
244            page_range: self.page_range,
245            passwords: self.passwords,
246        })
247    }
248}
249
250/// Lazy per-page iterator returned by [`PdfGrep::search_file`].
251///
252/// State machine: opens the PDF on first `.next()`, then for each page in
253/// the configured range extracts text, finds matches, and yields them one
254/// at a time. Advances to the next page only when the current page's
255/// matches are exhausted.
256pub struct PageIterator<'a> {
257    grep: &'a PdfGrep,
258    path: PathBuf,
259    doc: Option<pdf::PdfDocument>,
260    init_error: Option<PdfGrepError>,
261    page_idx: usize,
262    page_numbers: Vec<u32>,
263    current_text: Option<String>,
264    current_matches: Vec<(usize, usize)>,
265    current_match_idx: usize,
266    yielded: usize,
267    started: bool,
268}
269
270impl<'a> PageIterator<'a> {
271    fn new(grep: &'a PdfGrep, path: PathBuf) -> Self {
272        PageIterator {
273            grep,
274            path,
275            doc: None,
276            init_error: None,
277            page_idx: 0,
278            page_numbers: Vec::new(),
279            current_text: None,
280            current_matches: Vec::new(),
281            current_match_idx: 0,
282            yielded: 0,
283            started: false,
284        }
285    }
286
287    fn ensure_started(&mut self) {
288        if self.started {
289            return;
290        }
291        self.started = true;
292        match pdf::PdfDocument::open(&self.path, &self.grep.passwords) {
293            Ok(doc) => {
294                let mut nums: Vec<u32> = doc.page_numbers().to_vec();
295                if let Some((start, end)) = self.grep.page_range {
296                    nums.retain(|&n| n >= start && n <= end);
297                }
298                self.page_numbers = nums;
299                self.doc = Some(doc);
300            }
301            Err(e) => {
302                self.init_error = Some(e);
303            }
304        }
305    }
306}
307
308impl Iterator for PageIterator<'_> {
309    type Item = Result<Match, PdfGrepError>;
310
311    fn next(&mut self) -> Option<Self::Item> {
312        self.ensure_started();
313        if let Some(err) = self.init_error.take() {
314            return Some(Err(err));
315        }
316        let doc = self.doc.as_ref()?;
317
318        // Respect -m N cap.
319        if let Some(cap) = self.grep.max_count {
320            if self.yielded >= cap {
321                return None;
322            }
323        }
324
325        loop {
326            // Try to yield a remaining match from the current page.
327            if let Some(text) = &self.current_text {
328                if self.current_match_idx < self.current_matches.len() {
329                    let (start, end) = self.current_matches[self.current_match_idx];
330                    self.current_match_idx += 1;
331                    // Construct the Match per FR-007/FR-040: containing line.
332                    let line = containing_line(text, start, end);
333                    let (line_start, line_end) = line;
334                    let line_text = text[line_start..line_end].to_string();
335                    let span_in_line = (start - line_start, end - line_start);
336                    let m = Match {
337                        path: self.path.clone(),
338                        page: self
339                            .page_numbers
340                            .get(self.page_idx - 1)
341                            .copied()
342                            .unwrap_or(0),
343                        text: if self.grep.only_matching {
344                            text[start..end].to_string()
345                        } else {
346                            line_text
347                        },
348                        byte_span: if self.grep.only_matching {
349                            (0, end - start)
350                        } else {
351                            span_in_line
352                        },
353                    };
354                    self.yielded += 1;
355                    return Some(Ok(m));
356                }
357                // Page exhausted; advance.
358                self.current_text = None;
359                self.current_matches.clear();
360                self.current_match_idx = 0;
361            }
362
363            // Advance to next page.
364            if self.page_idx >= self.page_numbers.len() {
365                return None;
366            }
367            let page = self.page_numbers[self.page_idx];
368            self.page_idx += 1;
369            match doc.extract_page(page) {
370                Ok(text) => {
371                    let matches = self.grep.engine.find_all(&text);
372                    if self.grep.invert_match {
373                        // -v semantics: emit lines that DON'T match. v0.1.0
374                        // simplification: skip the page if any match exists;
375                        // refine in iter-2 to per-line inversion.
376                        if matches.is_empty() && !text.is_empty() {
377                            // Emit one "page-as-line" Match with empty span.
378                            let m = Match {
379                                path: self.path.clone(),
380                                page,
381                                text: text.clone(),
382                                byte_span: (0, 0),
383                            };
384                            self.yielded += 1;
385                            return Some(Ok(m));
386                        }
387                        continue;
388                    }
389                    self.current_text = Some(text);
390                    self.current_matches = matches;
391                    self.current_match_idx = 0;
392                }
393                Err(msg) => {
394                    eprintln!("rusty-pdfgrep: {}: {msg}", self.path.display());
395                    continue;
396                }
397            }
398        }
399    }
400}
401
/// Locate the line of `text` that surrounds the match span
/// `match_start..match_end`, returned as the half-open byte range
/// `[start, end)`. Lines are separated by `\n`, and the `\n` that terminates
/// the line is not part of the returned range.
fn containing_line(text: &str, match_start: usize, match_end: usize) -> (usize, usize) {
    // The line begins just after the last newline before the match, or at the
    // start of the text when there is none.
    let line_start = match text[..match_start].rfind('\n') {
        Some(newline) => newline + 1,
        None => 0,
    };
    // The line ends at the first newline at or after the match end, or at the
    // end of the text when there is none.
    let line_end = match text[match_end..].find('\n') {
        Some(offset) => match_end + offset,
        None => text.len(),
    };
    (line_start, line_end)
}
413
#[cfg(test)]
mod tests {
    use super::*;
    use static_assertions::assert_impl_all;

    // Compile-time checks of the auto traits the public types must implement.
    assert_impl_all!(PdfGrep: Send);
    assert_impl_all!(PdfGrepBuilder: Send, Sync);
    assert_impl_all!(Match: Send, Sync);
    assert_impl_all!(PdfGrepError: Send, Sync);

    #[test]
    fn builder_requires_no_pattern_to_build() {
        // Building without a pattern falls back to the empty pattern, which
        // is not an error.
        assert!(PdfGrepBuilder::new().build().is_ok());
    }

    #[test]
    fn builder_invalid_regex_returns_err() {
        let outcome = PdfGrepBuilder::new().pattern("[invalid").build();
        assert!(matches!(outcome, Err(PdfGrepError::RegexCompile { .. })));
    }

    #[test]
    fn builder_reverse_page_range_returns_err() {
        let outcome = PdfGrepBuilder::new()
            .pattern("x")
            .page_range(Some((5, 3)))
            .build();
        assert!(matches!(outcome, Err(PdfGrepError::PageRange { .. })));
    }

    #[test]
    fn builder_password_appends_in_order() {
        let mut builder = PdfGrepBuilder::new().pattern("x");
        for pwd in ["a", "b", "c"] {
            builder = builder.password(pwd);
        }
        let grep = builder.build().unwrap();
        assert_eq!(grep.passwords(), &["a", "b", "c"]);
    }

    #[test]
    fn containing_line_extracts_correctly() {
        let text = "first line\nsecond match here\nthird line";
        let (line_start, line_end) = containing_line(text, 18, 23);
        assert_eq!(&text[line_start..line_end], "second match here");
    }

    #[test]
    fn containing_line_no_newlines_returns_full_text() {
        let text = "single line no newlines";
        let range = containing_line(text, 7, 11);
        assert_eq!(range, (0, text.len()));
    }
}