use std::sync::mpsc;
use std::thread;
use std::time::Duration;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use crate::error::{Error, Result};
/// Default cap on the size of a PDF input, in bytes (50 MiB).
///
/// Used as `max_bytes` by `PdfOptions::default()`.
pub const DEFAULT_MAX_BYTES: usize = 50 * 1024 * 1024;
/// Default time budget for parsing a single PDF, in seconds.
///
/// Used as `timeout_secs` by `PdfOptions::default()`.
pub const DEFAULT_TIMEOUT_SECS: u64 = 30;
/// Limits applied when extracting text from a PDF.
///
/// `Default` yields `DEFAULT_MAX_BYTES` and `DEFAULT_TIMEOUT_SECS`.
#[derive(Copy, Clone, Debug)]
pub struct PdfOptions {
/// Largest accepted input length in bytes; `pdf_to_text_with` rejects
/// anything longer before parsing starts.
pub max_bytes: usize,
/// How many seconds `pdf_to_text_with` waits for the parser before
/// giving up with a timeout error.
pub timeout_secs: u64,
}
impl Default for PdfOptions {
fn default() -> Self {
Self {
max_bytes: DEFAULT_MAX_BYTES,
timeout_secs: DEFAULT_TIMEOUT_SECS,
}
}
}
/// Extracts the text of an in-memory PDF using `PdfOptions::default()`.
///
/// # Errors
///
/// Forwards whatever error `pdf_to_text_with` returns for the same input.
pub fn pdf_to_text(bytes: &[u8]) -> Result<String> {
    let defaults = PdfOptions::default();
    pdf_to_text_with(bytes, defaults)
}
pub fn pdf_to_text_with(bytes: &[u8], opts: PdfOptions) -> Result<String> {
if bytes.len() > opts.max_bytes {
return Err(Error::InvalidInput(alloc::format!(
"pdf input ({} bytes) exceeds {}-byte cap",
bytes.len(),
opts.max_bytes
)));
}
let raw = run_with_timeout(bytes, Duration::from_secs(opts.timeout_secs))?;
Ok(sanitize(&raw))
}
fn run_with_timeout(bytes: &[u8], timeout: Duration) -> Result<String> {
let buf: Vec<u8> = bytes.to_vec();
let (tx, rx) = mpsc::channel::<Result<String>>();
let _handle = thread::spawn(move || {
let r = pdf_extract::extract_text_from_mem(&buf)
.map_err(|e| Error::InvalidInput(alloc::format!("pdf parse error: {e}")));
let _ = tx.send(r);
});
match rx.recv_timeout(timeout) {
Ok(result) => result,
Err(mpsc::RecvTimeoutError::Timeout) => Err(Error::InvalidInput(alloc::format!(
"pdf parse exceeded {}-second timeout",
timeout.as_secs()
))),
Err(mpsc::RecvTimeoutError::Disconnected) => {
Err(Error::InvalidInput("pdf parser panicked".to_string()))
}
}
}
/// Returns a copy of `text` in which every NUL character (U+0000) has been
/// replaced by the Unicode replacement character (U+FFFD).
///
/// Text without any NUL is copied unchanged.
fn sanitize(text: &str) -> String {
    const NUL: char = '\u{0}';
    match text.find(NUL) {
        // Nothing to replace: a plain copy is enough.
        None => String::from(text),
        Some(_) => text.replace(NUL, "\u{FFFD}"),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // An input longer than `max_bytes` must be refused with `InvalidInput`.
    #[test]
    fn rejects_oversized_input() {
        let oversized = [0u8; 1024];
        let tight = PdfOptions {
            max_bytes: 100,
            timeout_secs: 30,
        };
        let outcome = pdf_to_text_with(&oversized, tight);
        assert!(matches!(outcome, Err(Error::InvalidInput(_))));
    }

    // Bytes that are not a PDF must produce an error.
    #[test]
    fn invalid_pdf_errors_cleanly() {
        assert!(pdf_to_text(b"not a pdf").is_err());
    }

    // NUL becomes U+FFFD; text without NUL is returned unchanged.
    #[test]
    fn sanitize_replaces_nul() {
        let cases = [("a\u{0}b", "a\u{FFFD}b"), ("plain", "plain")];
        for (input, expected) in cases {
            assert_eq!(sanitize(input), expected);
        }
    }

    // `PdfOptions::default()` must mirror the public default constants.
    #[test]
    fn defaults_are_documented_constants() {
        let PdfOptions {
            max_bytes,
            timeout_secs,
        } = PdfOptions::default();
        assert_eq!(max_bytes, DEFAULT_MAX_BYTES);
        assert_eq!(timeout_secs, DEFAULT_TIMEOUT_SECS);
    }
}