rusty-pdfgrep 0.1.0

Grep through PDF files — a Rust port of Hans-Peter Deifel's `pdfgrep(1)` with lopdf-backed text extraction, regex + fancy-regex pluggable engines, --password retry for encrypted PDFs, GNU-grep-compatible color output, recursive walking with fnmatch include/exclude, and a typed library API.
Documentation
//! Pluggable regex engine for pattern matching (FR-001..FR-005, AD-005).
//!
//! Default: `regex` crate (RE2-style, linear-time, no lookaround).
//! `-F`/`--fixed-strings`: `regex::escape` then compile as a literal.
//! `-P`/`--perl-regexp`: `fancy-regex` (pure-Rust PCRE-compat).

use crate::error::PdfGrepError;

/// Selected match engine. Constructed once at startup via [`compile`].
///
/// [`compile`] returns the `Fancy` variant when its `perl_regexp` argument is
/// set and the `Regex` variant otherwise. Both variants expose the same
/// matching API through the inherent methods on this type.
#[derive(Debug)]
pub enum Engine {
    /// Default RE2-style engine (`regex` crate).
    Regex(regex::Regex),
    /// PCRE-compatible engine (`fancy-regex` crate). Activated by `-P`.
    Fancy(fancy_regex::Regex),
}

/// Build the [`Engine`] that will be used to search for `pattern`.
///
/// * `fixed_strings` — escape every regex metacharacter in `pattern` so it is
///   matched literally.
/// * `perl_regexp` — compile with `fancy-regex` instead of the `regex` crate.
/// * `case_insensitive` — ignore case; applied through the builder for the
///   `regex` crate and through a leading `(?i)` flag for `fancy-regex`.
///
/// # Errors
///
/// Returns [`PdfGrepError::RegexCompile`] on a malformed pattern. The error
/// carries the pattern exactly as the caller supplied it (before escaping or
/// flag injection) together with the engine's own message.
pub fn compile(
    pattern: &str,
    fixed_strings: bool,
    perl_regexp: bool,
    case_insensitive: bool,
) -> Result<Engine, PdfGrepError> {
    // Both engines report failures the same way; only the message differs.
    let compile_error = |message: String| PdfGrepError::RegexCompile {
        pattern: pattern.to_string(),
        message,
    };

    let source = if fixed_strings {
        regex::escape(pattern)
    } else {
        pattern.to_owned()
    };

    if !perl_regexp {
        return regex::RegexBuilder::new(&source)
            .case_insensitive(case_insensitive)
            .build()
            .map(Engine::Regex)
            .map_err(|err| compile_error(err.to_string()));
    }

    // The case flag is injected inline for the `fancy-regex` engine.
    let source = if case_insensitive {
        format!("(?i){source}")
    } else {
        source
    };
    fancy_regex::Regex::new(&source)
        .map(Engine::Fancy)
        .map_err(|err| compile_error(err.to_string()))
}

impl Engine {
    /// Find all non-overlapping matches in `text`, scanning left to right.
    ///
    /// Returns `(start, end)` byte offsets into `text`, in order of
    /// occurrence. Both engines follow the same rules for empty matches:
    /// after an empty match the scan advances by one whole character, and an
    /// empty match that sits directly at the end of the previous match is not
    /// reported.
    ///
    /// For the `Fancy` engine the scan is best-effort: if `fancy-regex`
    /// reports a runtime error, the matches collected so far are returned.
    #[must_use]
    pub fn find_all(&self, text: &str) -> Vec<(usize, usize)> {
        match self {
            Engine::Regex(r) => r.find_iter(text).map(|m| (m.start(), m.end())).collect(),
            Engine::Fancy(r) => {
                let mut out = Vec::new();
                let mut start = 0;
                // End offset of the most recently reported match.
                let mut prev_end: Option<usize> = None;
                while start <= text.len() {
                    let m = match r.find_from_pos(text, start) {
                        Ok(Some(m)) => m,
                        // No further match, or a runtime error from the
                        // engine: stop and keep what has been found.
                        Ok(None) | Err(_) => break,
                    };
                    let (s, e) = (m.start(), m.end());
                    if s == e {
                        // Step over one full character, not one byte, so the
                        // next search position never falls inside a
                        // multi-byte UTF-8 sequence. At end of input there is
                        // no character left; stepping by 1 ends the loop.
                        let step = text
                            .get(e..)
                            .and_then(|rest| rest.chars().next())
                            .map_or(1, char::len_utf8);
                        start = e + step;
                        // Mirror the `Regex` arm's `find_iter`: an empty
                        // match directly after the previous match is skipped.
                        if prev_end == Some(e) {
                            continue;
                        }
                    } else {
                        start = e;
                    }
                    prev_end = Some(e);
                    out.push((s, e));
                }
                out
            }
        }
    }

    /// Quick yes/no check; used by `-q`, `-l`, `-L`.
    ///
    /// For the `Fancy` engine a runtime error from `fancy-regex` is treated
    /// as "no match" rather than being propagated.
    #[must_use]
    pub fn is_match(&self, text: &str) -> bool {
        match self {
            Engine::Regex(r) => r.is_match(text),
            Engine::Fancy(r) => r.is_match(text).unwrap_or(false),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// With no flags set, the `regex` crate engine is selected and match
    /// spans are reported as byte offsets.
    #[test]
    fn regex_default_engine() {
        let engine = compile("foo+", false, false, false).unwrap();
        assert!(matches!(engine, Engine::Regex(_)));

        let spans = engine.find_all("xfoooy");
        assert_eq!(spans, vec![(1, 5)]);
    }

    /// In fixed-string mode `(group)` matches the literal parentheses and is
    /// not interpreted as a capture group.
    #[test]
    fn fixed_strings_escapes_metacharacters() {
        let engine = compile("(group)", true, false, false).unwrap();

        let cases = [
            ("here is (group) literally", true),
            ("here is group without parens", false),
        ];
        for (haystack, expected) in cases {
            assert_eq!(engine.is_match(haystack), expected);
        }
    }

    /// Perl mode selects `fancy-regex`, which supports lookahead.
    #[test]
    fn perl_regexp_lookahead() {
        let engine = compile("foo(?=bar)", false, true, false).unwrap();
        assert!(matches!(engine, Engine::Fancy(_)));

        let cases = [("foobar", true), ("foobaz", false)];
        for (haystack, expected) in cases {
            assert_eq!(engine.is_match(haystack), expected);
        }
    }

    /// The case-insensitive flag is honoured by both engines.
    #[test]
    fn case_insensitive() {
        for perl_regexp in [false, true] {
            let engine = compile("HELLO", false, perl_regexp, true).unwrap();
            assert!(engine.is_match("hello world"));
        }
    }

    /// A malformed pattern is reported as `PdfGrepError::RegexCompile`.
    #[test]
    fn invalid_pattern_errors() {
        let outcome = compile("[invalid", false, false, false);
        assert!(matches!(outcome, Err(PdfGrepError::RegexCompile { .. })));
    }
}