//! Grep-style searching of the text content of PDF files.
//!
//! A [`PdfGrep`] searcher is configured through [`PdfGrepBuilder`];
//! [`PdfGrep::search_file`] then yields [`Match`] values page by page.
#![deny(missing_docs)]
/// Pattern compilation and matching (`compile`, `Engine`).
pub mod engine;
/// Error types of this crate, including [`PdfGrepError`].
pub mod error;
/// PDF document access (`PdfDocument`): opening files and extracting page text.
pub mod pdf;
pub use error::PdfGrepError;
use std::path::{Path, PathBuf};
/// One search result produced while iterating over a PDF file.
///
/// The struct is `#[non_exhaustive]`: code outside this crate cannot construct
/// it with a struct literal or destructure it exhaustively.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Match {
/// Path of the file that was searched (a copy of the path given to the search).
pub path: PathBuf,
/// Page number the result belongs to, as listed by the opened document.
pub page: u32,
/// Reported text: the line containing the match by default, only the matched
/// text in only-matching mode, or the whole page text in invert-match mode.
pub text: String,
/// Byte offsets `(start, end)` of the match inside `text`; `(0, 0)` for
/// invert-match results, which carry no match.
pub byte_span: (usize, usize),
}
/// A configured searcher, created by [`PdfGrepBuilder::build`].
///
/// Holds the compiled matching engine together with the output and filtering
/// options that [`PageIterator`] consults while walking a document.
pub struct PdfGrep {
/// Compiled matcher returned by `engine::compile`.
engine: engine::Engine,
/// When set, pages *without* any match are reported (whole page text).
invert_match: bool,
/// When set, results carry only the matched text instead of its line.
only_matching: bool,
/// Upper bound on the number of results one file iterator yields; `None` = no bound.
max_count: Option<usize>,
/// Inclusive `(start, end)` page-number filter; `None` = all pages.
page_range: Option<(u32, u32)>,
/// Passwords handed to `pdf::PdfDocument::open` when a file is opened.
passwords: Vec<String>,
}
impl PdfGrep {
pub fn search_file<'a>(&'a self, path: &Path) -> PageIterator<'a> {
PageIterator::new(self, path.to_path_buf())
}
pub fn search_file_collected(&self, path: &Path) -> Result<Vec<Match>, PdfGrepError> {
self.search_file(path).collect()
}
#[must_use]
pub fn invert_match(&self) -> bool {
self.invert_match
}
#[must_use]
pub fn only_matching(&self) -> bool {
self.only_matching
}
#[must_use]
pub fn max_count(&self) -> Option<usize> {
self.max_count
}
#[must_use]
pub fn page_range(&self) -> Option<(u32, u32)> {
self.page_range
}
#[must_use]
pub fn passwords(&self) -> &[String] {
&self.passwords
}
}
/// Hand-written `Debug`: the `engine` field is left out and the passwords are
/// summarised by count so their values never reach log output.
impl std::fmt::Debug for PdfGrep {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Only the number of passwords is shown, never their contents.
        let password_summary = format!("<{} entries>", self.passwords.len());

        let mut out = f.debug_struct("PdfGrep");
        out.field("invert_match", &self.invert_match);
        out.field("only_matching", &self.only_matching);
        out.field("max_count", &self.max_count);
        out.field("page_range", &self.page_range);
        out.field("passwords", &password_summary);
        out.finish()
    }
}
/// Builder for [`PdfGrep`].
///
/// Every option starts at its `Default` value (flags off, no pattern, no
/// limits, no passwords). `Debug` is implemented by hand so that password
/// values are never printed, matching the `Debug` output of [`PdfGrep`].
#[derive(Clone, Default)]
pub struct PdfGrepBuilder {
    /// Search pattern; a missing pattern is compiled as the empty string.
    pattern: Option<String>,
    /// Forwarded to `engine::compile`; by its name, selects literal matching (confirm in `engine`).
    fixed_strings: bool,
    /// Forwarded to `engine::compile`; by its name, selects Perl-style regex syntax (confirm in `engine`).
    perl_regexp: bool,
    /// Forwarded to `engine::compile`; by its name, ignores letter case (confirm in `engine`).
    case_insensitive: bool,
    /// Report pages without matches instead of matches.
    invert_match: bool,
    /// Report only the matched text instead of the containing line.
    only_matching: bool,
    /// Maximum number of results per searched file; `None` = unlimited.
    max_count: Option<usize>,
    /// Inclusive `(start, end)` page-number filter; `None` = all pages.
    page_range: Option<(u32, u32)>,
    /// Passwords collected so far, in insertion order.
    passwords: Vec<String>,
}

impl std::fmt::Debug for PdfGrepBuilder {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("PdfGrepBuilder")
            .field("pattern", &self.pattern)
            .field("fixed_strings", &self.fixed_strings)
            .field("perl_regexp", &self.perl_regexp)
            .field("case_insensitive", &self.case_insensitive)
            .field("invert_match", &self.invert_match)
            .field("only_matching", &self.only_matching)
            .field("max_count", &self.max_count)
            .field("page_range", &self.page_range)
            // Secrets must not leak into logs: show how many there are, not what they are.
            .field("passwords", &format!("<{} entries>", self.passwords.len()))
            .finish()
    }
}
impl PdfGrepBuilder {
    /// Creates a builder with every option at its default.
    #[must_use]
    pub fn new() -> Self {
        Default::default()
    }

    /// Sets the search pattern, replacing any previous one.
    #[must_use]
    pub fn pattern(self, p: impl Into<String>) -> Self {
        Self {
            pattern: Some(p.into()),
            ..self
        }
    }

    /// Sets the `fixed_strings` flag that is forwarded to the engine compiler.
    #[must_use]
    pub fn fixed_strings(self, on: bool) -> Self {
        Self {
            fixed_strings: on,
            ..self
        }
    }

    /// Sets the `perl_regexp` flag that is forwarded to the engine compiler.
    #[must_use]
    pub fn perl_regexp(self, on: bool) -> Self {
        Self {
            perl_regexp: on,
            ..self
        }
    }

    /// Sets the `case_insensitive` flag that is forwarded to the engine compiler.
    #[must_use]
    pub fn case_insensitive(self, on: bool) -> Self {
        Self {
            case_insensitive: on,
            ..self
        }
    }

    /// Chooses whether pages without matches are reported instead of matches.
    #[must_use]
    pub fn invert_match(self, on: bool) -> Self {
        Self {
            invert_match: on,
            ..self
        }
    }

    /// Chooses whether results carry only the matched text.
    #[must_use]
    pub fn only_matching(self, on: bool) -> Self {
        Self {
            only_matching: on,
            ..self
        }
    }

    /// Limits the number of results per searched file; `None` removes the limit.
    #[must_use]
    pub fn max_count(self, n: Option<usize>) -> Self {
        Self {
            max_count: n,
            ..self
        }
    }

    /// Restricts the search to an inclusive `(start, end)` page-number range;
    /// `None` searches every page. The range is validated by [`Self::build`].
    #[must_use]
    pub fn page_range(self, range: Option<(u32, u32)>) -> Self {
        Self {
            page_range: range,
            ..self
        }
    }

    /// Appends one password; repeated calls keep insertion order.
    #[must_use]
    pub fn password(mut self, pwd: impl Into<String>) -> Self {
        let secret: String = pwd.into();
        self.passwords.push(secret);
        self
    }

    /// Compiles the pattern and produces the configured [`PdfGrep`].
    ///
    /// A pattern that was never set is compiled as the empty string.
    ///
    /// # Errors
    ///
    /// Propagates whatever `engine::compile` reports for the pattern, and
    /// returns [`PdfGrepError::PageRange`] when the range start is greater
    /// than its end. The pattern is compiled first, so a compile error takes
    /// precedence over a bad page range.
    pub fn build(self) -> Result<PdfGrep, PdfGrepError> {
        let Self {
            pattern,
            fixed_strings,
            perl_regexp,
            case_insensitive,
            invert_match,
            only_matching,
            max_count,
            page_range,
            passwords,
        } = self;

        let pattern = pattern.unwrap_or_default();
        let engine = engine::compile(&pattern, fixed_strings, perl_regexp, case_insensitive)?;

        match page_range {
            Some((start, end)) if start > end => Err(PdfGrepError::PageRange {
                value: format!("{start}-{end}"),
            }),
            _ => Ok(PdfGrep {
                engine,
                invert_match,
                only_matching,
                max_count,
                page_range,
                passwords,
            }),
        }
    }
}
/// Iterator over the search results of a single file, created by
/// [`PdfGrep::search_file`].
///
/// The document is opened on the first call to `next`; pages are then
/// extracted one at a time and their results handed out one by one.
pub struct PageIterator<'a> {
/// Searcher whose engine and options drive this iteration.
grep: &'a PdfGrep,
/// File being searched; cloned into every produced `Match`.
path: PathBuf,
/// Opened document; `None` before the first `next` call or if opening failed.
doc: Option<pdf::PdfDocument>,
/// Error from opening the document, handed out once by `next`.
init_error: Option<PdfGrepError>,
/// Index into `page_numbers` of the next page to extract.
page_idx: usize,
/// Page numbers to visit, already filtered by the searcher's page range.
page_numbers: Vec<u32>,
/// Text of the page whose matches are currently being handed out.
current_text: Option<String>,
/// `(start, end)` byte spans the engine found in `current_text`.
current_matches: Vec<(usize, usize)>,
/// Index of the next span in `current_matches` to report.
current_match_idx: usize,
/// Number of results produced so far; compared against `max_count`.
yielded: usize,
/// Whether `ensure_started` has already run.
started: bool,
}
impl<'a> PageIterator<'a> {
    /// Creates an iterator for `path` that has not touched the file yet.
    fn new(grep: &'a PdfGrep, path: PathBuf) -> Self {
        Self {
            grep,
            path,
            // Filled in by `ensure_started`.
            doc: None,
            init_error: None,
            page_numbers: Vec::new(),
            started: false,
            // Per-page cursor state.
            page_idx: 0,
            current_text: None,
            current_matches: Vec::new(),
            current_match_idx: 0,
            yielded: 0,
        }
    }

    /// Opens the document on first use and records which pages to visit.
    ///
    /// Runs its body at most once. On success `doc` and `page_numbers` are
    /// set (the latter restricted to the searcher's inclusive page range, if
    /// any); on failure the error is parked in `init_error`.
    fn ensure_started(&mut self) {
        // `replace` returns the previous flag, so a second call exits here.
        if std::mem::replace(&mut self.started, true) {
            return;
        }

        let opened = pdf::PdfDocument::open(&self.path, &self.grep.passwords);
        let doc = match opened {
            Ok(doc) => doc,
            Err(e) => {
                self.init_error = Some(e);
                return;
            }
        };

        let mut selected: Vec<u32> = doc.page_numbers().to_vec();
        if let Some((first, last)) = self.grep.page_range {
            selected.retain(|n| (first..=last).contains(n));
        }
        self.page_numbers = selected;
        self.doc = Some(doc);
    }
}
/// Streams the results of one file, extracting pages only as they are needed.
impl Iterator for PageIterator<'_> {
    type Item = Result<Match, PdfGrepError>;

    /// Produces the next result.
    ///
    /// * A failure to open the document is yielded once as `Err`; afterwards
    ///   the iterator is exhausted.
    /// * Once `max_count` results have been produced, `None` is returned.
    /// * In normal mode each engine match on a page becomes one `Match`
    ///   (the containing line, or just the matched text in only-matching mode).
    /// * In invert mode a non-empty page with no match becomes one `Match`
    ///   carrying the whole page text and a `(0, 0)` span.
    /// * A page whose text cannot be extracted is reported on stderr and skipped.
    fn next(&mut self) -> Option<Self::Item> {
        self.ensure_started();
        if let Some(err) = self.init_error.take() {
            return Some(Err(err));
        }
        // `None` here means opening failed and the error was already handed out.
        let doc = self.doc.as_ref()?;
        if let Some(cap) = self.grep.max_count {
            if self.yielded >= cap {
                return None;
            }
        }
        loop {
            // First drain the pending matches of the page that is already loaded.
            if let Some(text) = &self.current_text {
                if let Some(&(start, end)) = self.current_matches.get(self.current_match_idx) {
                    self.current_match_idx += 1;
                    // Build only the text that is actually reported; the containing
                    // line is not computed or copied in only-matching mode.
                    let (reported, byte_span) = if self.grep.only_matching {
                        (text[start..end].to_string(), (0, end - start))
                    } else {
                        let (line_start, line_end) = containing_line(text, start, end);
                        (
                            text[line_start..line_end].to_string(),
                            (start - line_start, end - line_start),
                        )
                    };
                    let m = Match {
                        path: self.path.clone(),
                        // `page_idx` was advanced when this page was loaded, so the
                        // loaded page sits one slot behind it.
                        page: self
                            .page_numbers
                            .get(self.page_idx - 1)
                            .copied()
                            .unwrap_or(0),
                        text: reported,
                        byte_span,
                    };
                    self.yielded += 1;
                    return Some(Ok(m));
                }
                // Page exhausted: reset the per-page cursor.
                self.current_text = None;
                self.current_matches.clear();
                self.current_match_idx = 0;
            }

            // Load the next page, or finish when none are left.
            let page = *self.page_numbers.get(self.page_idx)?;
            self.page_idx += 1;
            match doc.extract_page(page) {
                Ok(text) => {
                    let matches = self.grep.engine.find_all(&text);
                    if self.grep.invert_match {
                        if matches.is_empty() && !text.is_empty() {
                            let m = Match {
                                path: self.path.clone(),
                                page,
                                // The page text is owned and unused afterwards: move it
                                // instead of cloning a whole page.
                                text,
                                byte_span: (0, 0),
                            };
                            self.yielded += 1;
                            return Some(Ok(m));
                        }
                        continue;
                    }
                    self.current_text = Some(text);
                    self.current_matches = matches;
                    self.current_match_idx = 0;
                }
                Err(msg) => {
                    // Best effort: one unreadable page must not abort the whole file.
                    eprintln!("rusty-pdfgrep: {}: {msg}", self.path.display());
                    continue;
                }
            }
        }
    }
}
/// Returns the byte range `(line_start, line_end)` of the line(s) of `text`
/// that enclose the span `match_start..match_end`.
///
/// `line_start` is the offset just after the last `'\n'` before the span (or
/// `0`), and `line_end` is the offset of the first `'\n'` at or after the end
/// of the span (or `text.len()`), so the range excludes the terminating newline.
///
/// # Panics
///
/// Panics if either offset is out of bounds or not on a character boundary.
fn containing_line(text: &str, match_start: usize, match_end: usize) -> (usize, usize) {
    let before = &text[..match_start];
    let after = &text[match_end..];

    let line_start = match before.rfind('\n') {
        Some(newline) => newline + 1,
        None => 0,
    };
    let line_end = match after.find('\n') {
        Some(offset) => match_end + offset,
        None => text.len(),
    };

    (line_start, line_end)
}
// Unit tests for the builder's validation and for `containing_line`.
#[cfg(test)]
mod tests {
use super::*;
use static_assertions::assert_impl_all;
// Compile-time checks of the auto traits the public types are expected to implement.
assert_impl_all!(PdfGrep: Send);
assert_impl_all!(PdfGrepBuilder: Send, Sync);
assert_impl_all!(Match: Send, Sync);
assert_impl_all!(PdfGrepError: Send, Sync);
// A builder without a pattern still builds (the pattern defaults to "").
#[test]
fn builder_requires_no_pattern_to_build() {
let g = PdfGrepBuilder::new().build();
assert!(g.is_ok());
}
// An unparsable pattern surfaces as `PdfGrepError::RegexCompile`.
#[test]
fn builder_invalid_regex_returns_err() {
let err = PdfGrepBuilder::new()
.pattern("[invalid")
.build()
.unwrap_err();
assert!(matches!(err, PdfGrepError::RegexCompile { .. }));
}
// A page range whose start exceeds its end is rejected by `build`.
#[test]
fn builder_reverse_page_range_returns_err() {
let err = PdfGrepBuilder::new()
.pattern("x")
.page_range(Some((5, 3)))
.build()
.unwrap_err();
assert!(matches!(err, PdfGrepError::PageRange { .. }));
}
// Passwords accumulate across calls and keep their insertion order.
#[test]
fn builder_password_appends_in_order() {
let g = PdfGrepBuilder::new()
.pattern("x")
.password("a")
.password("b")
.password("c")
.build()
.unwrap();
assert_eq!(g.passwords(), &["a", "b", "c"]);
}
// The span 18..23 ("match") lies on the second line, which is returned without newlines.
#[test]
fn containing_line_extracts_correctly() {
let text = "first line\nsecond match here\nthird line";
let (s, e) = containing_line(text, 18, 23);
assert_eq!(&text[s..e], "second match here");
}
// Without any newline the whole text is the containing line.
#[test]
fn containing_line_no_newlines_returns_full_text() {
let text = "single line no newlines";
let (s, e) = containing_line(text, 7, 11);
assert_eq!((s, e), (0, text.len()));
}
}