use std::io::{self, IsTerminal, Read, Write};
use std::path::PathBuf;
use std::process::ExitCode;
use clap::{CommandFactory, Parser, Subcommand};
use rusty_pdfgrep::{PdfGrep, PdfGrepBuilder, PdfGrepError};
// Command-line interface of the binary.
//
// The `///` comments on the fields below are picked up by clap's derive macro and shown as
// the per-option help text. This struct-level note is deliberately a plain `//` comment so
// the `about` text selected by the `#[command(...)]` attribute is left untouched.
#[derive(Parser, Debug)]
#[command(name = "rusty-pdfgrep", about, version, arg_required_else_help = true)]
struct Cli {
    /// Pattern to search for
    pattern: Option<String>,
    /// Files to search (directories with -r); when none are given a PDF is read from stdin
    paths: Vec<PathBuf>,
    /// Walk the given directories and search every file with a .pdf extension
    #[arg(short = 'r', long = "recursive")]
    recursive: bool,
    /// Prefix each match with its page number
    #[arg(short = 'n', long = "page-number")]
    page_number: bool,
    /// Print only a count of matches for each file
    #[arg(short = 'c', long = "count")]
    count: bool,
    /// Print the file name with each match, even when only one file is searched
    #[arg(short = 'H', long = "with-filename")]
    with_filename: bool,
    /// Do not prefix matches or counts with the file name
    #[arg(long = "no-filename", conflicts_with = "with_filename")]
    no_filename: bool,
    /// Match case-insensitively
    #[arg(short = 'i', long = "ignore-case")]
    ignore_case: bool,
    /// Invert the sense of matching
    #[arg(short = 'v', long = "invert-match")]
    invert_match: bool,
    /// Print only the names of files that contain a match
    #[arg(short = 'l', long = "files-with-matches")]
    files_with_matches: bool,
    /// Print only the names of files that contain no match
    #[arg(
        short = 'L',
        long = "files-without-match",
        conflicts_with = "files_with_matches"
    )]
    files_without_match: bool,
    /// Maximum number of matches to report
    #[arg(short = 'm', long = "max-count", value_name = "N")]
    max_count: Option<usize>,
    /// Treat PATTERN as a literal string
    #[arg(short = 'F', long = "fixed-strings", conflicts_with = "perl_regexp")]
    fixed_strings: bool,
    /// Treat PATTERN as a Perl-compatible regular expression
    #[arg(short = 'P', long = "perl-regexp")]
    perl_regexp: bool,
    /// Print only the matching part
    #[arg(short = 'o', long = "only-matching")]
    only_matching: bool,
    /// Print nothing; the exit status still tells whether a match was found
    #[arg(short = 'q', long = "quiet")]
    quiet: bool,
    /// With -l or -L, terminate file names with a NUL byte instead of a newline
    #[arg(short = 'Z', long = "null")]
    null: bool,
    /// When to use colors (accepted for compatibility; currently has no effect)
    #[arg(long = "color", default_value = "auto", value_parser = ["auto", "always", "never"])]
    color: String,
    /// Restrict the search to pages N through M
    #[arg(long = "page-range", value_name = "N-M")]
    page_range: Option<String>,
    /// With -r, only search files whose path matches GLOB
    #[arg(long = "include", value_name = "GLOB")]
    include: Option<String>,
    /// With -r, skip files whose path matches GLOB
    #[arg(long = "exclude", value_name = "GLOB")]
    exclude: Option<String>,
    /// Password to try when opening a PDF; may be given several times
    #[arg(long = "password", value_name = "PWD", action = clap::ArgAction::Append)]
    password: Vec<String>,
    /// Largest PDF, in bytes, accepted on stdin
    #[arg(long = "max-stdin-bytes", value_name = "BYTES", default_value_t = 512 * 1024 * 1024)]
    max_stdin_bytes: usize,
    /// Enable strict mode, which rejects a fixed set of unsupported options and subcommands
    #[arg(long = "strict")]
    strict: bool,
    /// Disable strict mode even when the environment or program name would enable it
    #[arg(long = "no-strict", conflicts_with = "strict")]
    no_strict: bool,
    // Optional subcommand; when absent the tool runs a search.
    #[command(subcommand)]
    subcommand: Option<PdfGrepSubcommand>,
}
// Subcommands understood besides the default search behaviour. The `///` comments below
// become the help text clap prints for the subcommand and its argument.
#[derive(Subcommand, Debug)]
enum PdfGrepSubcommand {
    /// Generate a shell completion script and print it to stdout
    Completions {
        /// Shell to generate the completion script for
        #[arg(value_enum)]
        shell: clap_complete::Shell,
    },
}
fn main() -> ExitCode {
let argv: Vec<String> = std::env::args().collect();
let strict_active = resolve_strict_mode(&argv);
if strict_active {
return run_strict(&argv);
}
let cli = Cli::parse();
if let Some(PdfGrepSubcommand::Completions { shell }) = cli.subcommand {
let mut cmd = Cli::command();
let name = cmd.get_name().to_string();
clap_complete::generate(shell, &mut cmd, name, &mut io::stdout());
return ExitCode::SUCCESS;
}
let Some(pattern) = cli.pattern.clone() else {
eprintln!("rusty-pdfgrep: no PATTERN provided");
return ExitCode::from(2);
};
let page_range = match cli.page_range.as_deref().map(parse_page_range).transpose() {
Ok(r) => r,
Err(e) => {
eprintln!("{e}");
return ExitCode::from(2);
}
};
let mut builder = PdfGrepBuilder::new()
.pattern(&pattern)
.fixed_strings(cli.fixed_strings)
.perl_regexp(cli.perl_regexp)
.case_insensitive(cli.ignore_case)
.invert_match(cli.invert_match)
.only_matching(cli.only_matching)
.max_count(cli.max_count)
.page_range(page_range);
for pwd in &cli.password {
builder = builder.password(pwd);
}
let grep = match builder.build() {
Ok(g) => g,
Err(e) => {
eprintln!("{e}");
return ExitCode::from(2);
}
};
let files = match gather_files(&cli) {
Ok(f) => f,
Err(code) => return code,
};
let mode = OutputMode::from_flags(&cli, files.len());
let mut stdout = io::stdout().lock();
let stderr = io::stderr();
let mut any_match = false;
let mut any_error = false;
if files.is_empty() {
if io::stdin().is_terminal() {
eprintln!(
"rusty-pdfgrep: no PATTERN provided AND stdin is a TTY (provide a file or pipe a PDF)"
);
return ExitCode::from(2);
}
let bytes = match read_stdin_capped(cli.max_stdin_bytes) {
Ok(b) => b,
Err(code) => return code,
};
let path = PathBuf::from("<stdin>");
match search_one(&grep, &path, Some(&bytes), &mode, &mut stdout, &stderr) {
SearchOutcome::Matched => any_match = true,
SearchOutcome::NoMatch => {}
SearchOutcome::Error => any_error = true,
}
} else {
for path in &files {
match search_one(&grep, path, None, &mode, &mut stdout, &stderr) {
SearchOutcome::Matched => any_match = true,
SearchOutcome::NoMatch => {}
SearchOutcome::Error => any_error = true,
}
}
}
if any_error {
ExitCode::from(2)
} else if any_match {
ExitCode::SUCCESS
} else {
ExitCode::from(1)
}
}
/// Output switches derived from the command line, consulted by the printing routines.
#[derive(Debug)]
struct OutputMode {
/// Suppress everything written to stdout.
quiet: bool,
/// Print the path of each input that has at least one match instead of the matches.
files_with_matches: bool,
/// Print the path of each input that has no match instead of the matches.
files_without_match: bool,
/// Print a per-input match count instead of the matches.
count: bool,
/// Prefix match lines and counts with the input's path.
show_filename: bool,
/// Prefix match lines with the page number of the match.
show_page_number: bool,
/// Terminate paths printed by the two file-listing modes with NUL instead of a newline.
null_separator: bool,
}
impl OutputMode {
fn from_flags(cli: &Cli, file_count: usize) -> Self {
let show_filename = if cli.no_filename {
false
} else {
cli.with_filename || file_count > 1
};
OutputMode {
quiet: cli.quiet,
files_with_matches: cli.files_with_matches,
files_without_match: cli.files_without_match,
count: cli.count,
show_filename,
show_page_number: cli.page_number,
null_separator: cli.null,
}
}
}
/// Result of searching a single input, as far as the exit status is concerned.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SearchOutcome {
    /// At least one match was found.
    Matched,
    /// The input was searched successfully but nothing matched.
    NoMatch,
    /// The search reported an error.
    Error,
}
/// Searches one input and classifies the result.
///
/// When `bytes` is `Some`, the in-memory PDF is searched and `path` is passed along with it;
/// otherwise the file at `path` is searched. Output and diagnostics are produced by the
/// routine that gets called; this function only maps its result to a [`SearchOutcome`].
fn search_one(
    grep: &PdfGrep,
    path: &std::path::Path,
    bytes: Option<&[u8]>,
    mode: &OutputMode,
    stdout: &mut io::StdoutLock<'_>,
    stderr: &io::Stderr,
) -> SearchOutcome {
    let tally = match bytes {
        Some(data) => count_or_emit_from_bytes(grep, path, data, mode, stdout, stderr),
        None => count_or_emit_from_path(grep, path, mode, stdout, stderr),
    };
    match tally {
        Ok(0) => SearchOutcome::NoMatch,
        Ok(_) => SearchOutcome::Matched,
        Err(_) => SearchOutcome::Error,
    }
}
fn count_or_emit_from_path(
grep: &PdfGrep,
path: &std::path::Path,
mode: &OutputMode,
stdout: &mut io::StdoutLock<'_>,
stderr: &io::Stderr,
) -> Result<usize, PdfGrepError> {
let mut count = 0;
let mut first_error: Option<PdfGrepError> = None;
for result in grep.search_file(path) {
match result {
Ok(m) => {
count += 1;
if !mode.quiet
&& !mode.count
&& !mode.files_with_matches
&& !mode.files_without_match
{
emit_match_line(stdout, &m, mode);
}
}
Err(e) => {
let mut s = stderr.lock();
let _ = writeln!(s, "{e}");
first_error = Some(e);
break;
}
}
}
if let Some(e) = first_error {
return Err(e);
}
if mode.count && !mode.quiet {
let sep = if mode.show_filename {
format!("{}:", path.display())
} else {
String::new()
};
let _ = writeln!(stdout, "{sep}{count}");
}
if mode.files_with_matches && count > 0 && !mode.quiet {
let term = if mode.null_separator { "\0" } else { "\n" };
let _ = write!(stdout, "{}{term}", path.display());
}
if mode.files_without_match && count == 0 && !mode.quiet {
let term = if mode.null_separator { "\0" } else { "\n" };
let _ = write!(stdout, "{}{term}", path.display());
}
Ok(count)
}
fn count_or_emit_from_bytes(
grep: &PdfGrep,
_path: &std::path::Path,
bytes: &[u8],
mode: &OutputMode,
stdout: &mut io::StdoutLock<'_>,
stderr: &io::Stderr,
) -> Result<usize, PdfGrepError> {
let tmp = match tempfile_path(bytes) {
Ok(p) => p,
Err(e) => {
let _ = writeln!(stderr.lock(), "rusty-pdfgrep: stdin temp write: {e}");
return Err(PdfGrepError::Io {
path: PathBuf::from("<stdin>"),
source: e,
});
}
};
let count = count_or_emit_from_path(grep, &tmp.path, mode, stdout, stderr)?;
Ok(count)
}
/// A file in the system temporary directory that is deleted when the value is dropped.
struct TempFile {
    /// Location of the temporary file.
    path: PathBuf,
}

impl Drop for TempFile {
    fn drop(&mut self) {
        // Best-effort cleanup: there is nothing useful to do if removal fails.
        let _ = std::fs::remove_file(&self.path);
    }
}

/// Writes `bytes` to a freshly created temporary file and returns a guard that removes it.
///
/// The file is created with `create_new`, so an existing file or symlink with the same name
/// is never followed or overwritten. On Unix it is created with mode 0600. If the name is
/// already taken, a few further names are tried.
///
/// # Errors
/// Returns the underlying I/O error when the file cannot be created or written. A partially
/// written file is removed before the error is returned.
fn tempfile_path(bytes: &[u8]) -> Result<TempFile, std::io::Error> {
    const MAX_ATTEMPTS: u32 = 16;

    let pid = std::process::id();
    for attempt in 0..MAX_ATTEMPTS {
        let ts = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_nanos())
            .unwrap_or(0);
        let mut path = std::env::temp_dir();
        path.push(format!("rusty-pdfgrep-stdin-{pid}-{ts}-{attempt}.pdf"));

        let mut options = std::fs::OpenOptions::new();
        options.write(true).create_new(true);
        #[cfg(unix)]
        {
            use std::os::unix::fs::OpenOptionsExt;
            // The spilled PDF may be sensitive; keep it private to the current user.
            options.mode(0o600);
        }

        match options.open(&path) {
            Ok(mut file) => {
                // Create the guard before writing so a failed write still removes the file.
                let guard = TempFile { path };
                std::io::Write::write_all(&mut file, bytes)?;
                return Ok(guard);
            }
            // Name collision: try again with a new timestamp and attempt number.
            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => continue,
            Err(e) => return Err(e),
        }
    }
    Err(std::io::Error::new(
        std::io::ErrorKind::AlreadyExists,
        "could not create a unique temporary file for stdin",
    ))
}
/// Writes one match to `stdout` as `[path:][page:]text` followed by a newline.
///
/// The path and page prefixes are included only when the corresponding switch in `mode` is
/// set. Write errors are ignored.
fn emit_match_line(stdout: &mut io::StdoutLock<'_>, m: &rusty_pdfgrep::Match, mode: &OutputMode) {
    let file_part = if mode.show_filename {
        format!("{}:", m.path.display())
    } else {
        String::new()
    };
    let page_part = if mode.show_page_number {
        format!("{}:", m.page)
    } else {
        String::new()
    };
    let _ = writeln!(stdout, "{file_part}{page_part}{}", m.text);
}
/// Resolves the list of files to search.
///
/// Without `-r` the positional paths are returned unchanged. With `-r` every given path is
/// walked without following symlinks, and regular files whose extension is exactly `pdf` are
/// collected, subject to the `--exclude` and `--include` globs. Both globs are matched
/// against the full walked path, not just the file name.
///
/// Entries that cannot be read during the walk are reported on stderr and skipped.
///
/// # Errors
/// Returns exit code 2, after printing a diagnostic, when `-r` is used without any path or
/// when a glob fails to compile.
fn gather_files(cli: &Cli) -> Result<Vec<PathBuf>, ExitCode> {
    if !cli.recursive {
        return Ok(cli.paths.clone());
    }
    if cli.paths.is_empty() {
        eprintln!("rusty-pdfgrep: -r requires at least one positional path");
        return Err(ExitCode::from(2));
    }
    let include = compile_glob_filter("include", cli.include.as_deref())?;
    let exclude = compile_glob_filter("exclude", cli.exclude.as_deref())?;

    let mut files = Vec::new();
    for root in &cli.paths {
        for entry in walkdir::WalkDir::new(root).follow_links(false) {
            let entry = match entry {
                Ok(entry) => entry,
                Err(e) => {
                    // Report unreadable entries instead of dropping them silently.
                    eprintln!("rusty-pdfgrep: {e}");
                    continue;
                }
            };
            let p = entry.path();
            let is_pdf = entry.file_type().is_file()
                && p.extension().and_then(|s| s.to_str()) == Some("pdf");
            if !is_pdf {
                continue;
            }
            if exclude.as_ref().is_some_and(|ex| ex.is_match(p)) {
                continue;
            }
            if include.as_ref().is_some_and(|inc| !inc.is_match(p)) {
                continue;
            }
            files.push(p.to_path_buf());
        }
    }
    Ok(files)
}

/// Compiles the glob given to `--<flag>`, if any, reporting an invalid glob with exit code 2.
fn compile_glob_filter(
    flag: &str,
    pattern: Option<&str>,
) -> Result<Option<globset::GlobMatcher>, ExitCode> {
    let Some(pattern) = pattern else {
        return Ok(None);
    };
    match globset::Glob::new(pattern) {
        Ok(glob) => Ok(Some(glob.compile_matcher())),
        Err(e) => {
            eprintln!("rusty-pdfgrep: invalid --{flag} glob: {e}");
            Err(ExitCode::from(2))
        }
    }
}
/// Reads all of stdin, refusing input larger than `max` bytes.
///
/// # Errors
/// Returns exit code 2, after printing a diagnostic, when reading fails or when stdin holds
/// more than `max` bytes.
fn read_stdin_capped(max: usize) -> Result<Vec<u8>, ExitCode> {
    match read_capped_from(io::stdin().lock(), max) {
        Ok(Some(buf)) => Ok(buf),
        Ok(None) => {
            eprintln!("rusty-pdfgrep: stdin too large: limit {max} bytes");
            Err(ExitCode::from(2))
        }
        Err(e) => {
            eprintln!("rusty-pdfgrep: stdin: {e}");
            Err(ExitCode::from(2))
        }
    }
}

/// Reads `reader` to the end, returning `Ok(None)` when it holds more than `max` bytes.
fn read_capped_from<R: Read>(reader: R, max: usize) -> io::Result<Option<Vec<u8>>> {
    // Read one byte past the cap so "exactly max" and "too large" can be told apart. The
    // addition saturates: with `max == usize::MAX` on a 64-bit target a plain `+ 1` would
    // overflow. (`usize` is at most 64 bits wide on supported targets, so the cast itself
    // does not truncate.)
    let limit = (max as u64).saturating_add(1);
    let mut buf = Vec::new();
    reader.take(limit).read_to_end(&mut buf)?;
    Ok(if buf.len() > max { None } else { Some(buf) })
}
/// Parses a `--page-range` value of the form `N-M` into `(start, end)`.
///
/// Whitespace around either number is ignored and a page number of 0 is raised to 1.
///
/// # Errors
/// Returns a ready-to-print message when the value has no `-`, when either side is not an
/// unsigned integer, or when the start page lies after the end page.
fn parse_page_range(s: &str) -> Result<(u32, u32), String> {
    fn invalid(s: &str) -> String {
        format!("rusty-pdfgrep: invalid page range '{s}'")
    }

    let (first, last) = s.split_once('-').ok_or_else(|| invalid(s))?;
    let start: u32 = first.trim().parse().map_err(|_| invalid(s))?;
    let end: u32 = last.trim().parse().map_err(|_| invalid(s))?;

    let (start, end) = (start.max(1), end.max(1));
    if start > end {
        // A reversed range can never select a page, so report it instead of passing it on.
        return Err(format!(
            "rusty-pdfgrep: invalid page range '{s}': start page is after end page"
        ));
    }
    Ok((start, end))
}
/// Decides whether the program should run in strict mode.
///
/// Precedence, highest first: a `--no-strict` option disables it; a `--strict` option
/// enables it; the environment variable `RUSTY_PDFGREP_STRICT=1` enables it; finally it is
/// enabled when the program was invoked under the name `pdfgrep` or `pdfgrep-alias`
/// (compared case-insensitively, after `basename`).
///
/// Only real options are inspected: `argv[0]` and everything after a `--` separator are
/// operands, so a pattern or file literally named `--strict` does not change the mode.
fn resolve_strict_mode(argv: &[String]) -> bool {
    let options = || argv.iter().skip(1).take_while(|arg| arg.as_str() != "--");

    if options().any(|arg| arg == "--no-strict") {
        return false;
    }
    if options().any(|arg| arg == "--strict") {
        return true;
    }
    if std::env::var("RUSTY_PDFGREP_STRICT").as_deref() == Ok("1") {
        return true;
    }
    argv.first().is_some_and(|program| {
        let base = basename(program);
        base.eq_ignore_ascii_case("pdfgrep") || base.eq_ignore_ascii_case("pdfgrep-alias")
    })
}
/// Returns the final component of `path` with a trailing `.exe` removed.
///
/// Both `/` and `\` are treated as separators. The `.exe` suffix is matched
/// ASCII-case-insensitively, so `PDFGREP.EXE` yields `PDFGREP`.
fn basename(path: &str) -> &str {
    const EXE_SUFFIX: &str = ".exe";

    let file_name = path
        .rsplit_once(['/', '\\'])
        .map_or(path, |(_, name)| name);
    match file_name.len().checked_sub(EXE_SUFFIX.len()) {
        // Check the char boundary first so slicing cannot panic on multi-byte names.
        Some(cut)
            if file_name.is_char_boundary(cut)
                && file_name[cut..].eq_ignore_ascii_case(EXE_SUFFIX) =>
        {
            &file_name[..cut]
        }
        _ => file_name,
    }
}
/// Runs the program in strict mode.
///
/// Before anything is parsed, the options are scanned for a fixed list of short and long
/// options that strict mode refuses; the first hit is reported and ends the run with exit
/// code 2. The `completions` subcommand is refused as well. Otherwise the `--strict` /
/// `--no-strict` mode flags are removed and the remaining arguments are handed to
/// `run_default_with_argv`.
///
/// Scanning and flag removal stop at a `--` separator: everything after it is an operand and
/// is passed through untouched.
fn run_strict(argv: &[String]) -> ExitCode {
    const EXCLUDED_SHORTS: &[char] = &['w', 'A', 'B', 'C', 'R'];
    const EXCLUDED_LONGS: &[&str] = &["cache", "unac", "password-list"];

    for arg in argv.iter().skip(1) {
        if arg == "--" {
            break;
        }
        if let Some(long) = arg.strip_prefix("--") {
            // Compare only the option name so `--cache=VALUE` is caught as well.
            let name = long.split_once('=').map_or(long, |(name, _)| name);
            if EXCLUDED_LONGS.contains(&name) {
                eprintln!("rusty-pdfgrep: unrecognized option '--{long}'");
                return ExitCode::from(2);
            }
        } else if let Some(shorts) = arg.strip_prefix('-') {
            if let Some(c) = shorts.chars().find(|c| EXCLUDED_SHORTS.contains(c)) {
                eprintln!("rusty-pdfgrep: invalid option -- '{c}'");
                return ExitCode::from(2);
            }
        }
    }

    // Strip the mode flags once, and only where they can actually be options.
    let mut operands_only = false;
    let effective_argv: Vec<String> = argv
        .iter()
        .filter(|arg| {
            if operands_only {
                return true;
            }
            if arg.as_str() == "--" {
                operands_only = true;
                return true;
            }
            arg.as_str() != "--strict" && arg.as_str() != "--no-strict"
        })
        .cloned()
        .collect();

    let cli = Cli::parse_from(&effective_argv);
    if let Some(PdfGrepSubcommand::Completions { .. }) = cli.subcommand {
        eprintln!("rusty-pdfgrep: invalid option -- 'c'");
        return ExitCode::from(2);
    }
    run_default_with_argv(effective_argv)
}
fn run_default_with_argv(argv: Vec<String>) -> ExitCode {
let cli = Cli::parse_from(argv);
drop(cli);
ExitCode::SUCCESS
}