use futures_util::{stream::FuturesOrdered, TryStreamExt};
use std::process::Stdio;
use thiserror::Error;
use tokio::{io::AsyncWriteExt, process::Command};
use crate::{info::PdfInfo, shared::Password};
/// Form feed (U+000C), the character that terminates each page in the text
/// produced by `pdftotext`.
pub const PAGE_END_CHARACTER: char = '\x0C';
#[derive(Debug, Error)]
pub enum PdfTextError {
#[error("failed to spawn pdftotext: {0}")]
SpawnProcess(std::io::Error),
#[error("failed to write pdf bytes: {0}")]
WritePdf(std::io::Error),
#[error("failed to get output: {0}")]
WaitOutput(std::io::Error),
#[error("failed to get pdfinfo exit code: {0}")]
PdfTextFailure(String),
#[error("page {0} is outside the number of available pages {1}")]
PageOutOfBounds(u32, u32),
#[error("page info page count is missing or invalid, pdf likely invalid")]
PageCountUnknown,
#[error("pdf is encrypted and no password was provided")]
PdfEncrypted,
#[error("incorrect password was provided")]
IncorrectPassword,
#[error("file is not a pdf")]
NotPdfFile,
}
/// Optional settings applied to every `pdftotext` invocation.
#[derive(Debug, Default, Clone)]
pub struct PdfTextArgs {
    /// Password passed along to `pdftotext`, if one has been set.
    pub password: Option<Password>,
}

impl PdfTextArgs {
    /// Builder-style setter: stores `password` and hands the arguments back.
    pub fn set_password(mut self, password: Password) -> Self {
        self.password = Some(password);
        self
    }

    /// Produces the command-line arguments described by these settings.
    ///
    /// The list is empty when no password is set; otherwise it holds whatever
    /// the password pushes via `push_arg`.
    pub fn build_args(&self) -> Vec<String> {
        let mut cli_args: Vec<String> = Vec::new();
        if let Some(password) = &self.password {
            password.push_arg(&mut cli_args);
        }
        cli_args
    }
}
/// Extracts the text of the whole document as a single string.
///
/// Every [`PAGE_END_CHARACTER`] in the extracted text is replaced by a
/// newline.
///
/// # Errors
///
/// Propagates any [`PdfTextError`] reported by the underlying extraction.
pub async fn text_all_pages(data: &[u8], args: &PdfTextArgs) -> Result<String, PdfTextError> {
    pages_text(data, args)
        .await
        .map(|text| text.replace(PAGE_END_CHARACTER, "\n"))
}
/// Extracts the text of the whole document, returning one string per page.
///
/// Pages are delimited by [`PAGE_END_CHARACTER`] in the extracted text. The
/// delimiter that terminates the final page is removed before splitting so
/// the result does not end with a spurious empty "page".
///
/// # Errors
///
/// Propagates any [`PdfTextError`] reported by the underlying extraction.
pub async fn text_all_pages_split(
    data: &[u8],
    args: &PdfTextArgs,
) -> Result<Vec<String>, PdfTextError> {
    let output = pages_text(data, args).await?;

    // The output ends with a page terminator; splitting on it directly would
    // append an empty trailing entry. Only a single trailing terminator is
    // dropped, so a genuinely empty last page is still reported.
    let pages = output
        .strip_suffix(PAGE_END_CHARACTER)
        .unwrap_or(output.as_str());

    Ok(pages
        .split(PAGE_END_CHARACTER)
        .map(str::to_owned)
        .collect())
}
/// Extracts the text of each requested page, in the order given by `pages`.
///
/// Page numbers are 1-based. All pages are validated against the page count
/// from `info` before any extraction is started; the individual extractions
/// are then driven together and their results collected in request order.
///
/// # Errors
///
/// * [`PdfTextError::PageCountUnknown`] if `info` has no usable page count.
/// * [`PdfTextError::PageOutOfBounds`] for the first requested page that is
///   `0` or greater than the page count.
/// * Any error produced while extracting an individual page.
pub async fn text_pages(
    data: &[u8],
    info: &PdfInfo,
    pages: Vec<u32>,
    args: &PdfTextArgs,
) -> Result<Vec<String>, PdfTextError> {
    let page_count = info
        .pages()
        .ok_or(PdfTextError::PageCountUnknown)?
        .map_err(|_| PdfTextError::PageCountUnknown)?;

    // Page 0 must be rejected as well: it is not a valid 1-based page number
    // and must not be forwarded to `pdftotext` as a page range.
    if let Some(&page) = pages
        .iter()
        .find(|&&page| page == 0 || page > page_count)
    {
        return Err(PdfTextError::PageOutOfBounds(page, page_count));
    }

    pages
        .into_iter()
        .map(|page| page_text(data, page, args))
        .collect::<FuturesOrdered<_>>()
        .try_collect()
        .await
}
/// Extracts the text of a single page.
///
/// `page` is 1-based and is validated against the page count from `info`
/// before extraction is attempted.
///
/// # Errors
///
/// * [`PdfTextError::PageCountUnknown`] if `info` has no usable page count.
/// * [`PdfTextError::PageOutOfBounds`] if `page` is `0` or greater than the
///   page count.
/// * Any error produced while extracting the page.
pub async fn text_single_page(
    data: &[u8],
    info: &PdfInfo,
    page: u32,
    args: &PdfTextArgs,
) -> Result<String, PdfTextError> {
    let page_count = info
        .pages()
        .ok_or(PdfTextError::PageCountUnknown)?
        .map_err(|_| PdfTextError::PageCountUnknown)?;

    // Page 0 must be rejected as well: it is not a valid 1-based page number
    // and must not be forwarded to `pdftotext` as a page range.
    if page == 0 || page > page_count {
        return Err(PdfTextError::PageOutOfBounds(page, page_count));
    }

    page_text(data, page, args).await
}
/// Extracts the text of every page in `data` with a single `pdftotext` run.
///
/// The result is the tool's stdout, lossily decoded as UTF-8, with page
/// terminators ([`PAGE_END_CHARACTER`]) left in place.
async fn pages_text(data: &[u8], args: &PdfTextArgs) -> Result<String, PdfTextError> {
    run_pdftotext(data, &[], args).await
}

/// Extracts the text of the single page `page` from `data`.
///
/// The page is selected by passing it as both the first (`-f`) and last
/// (`-l`) page. A trailing [`PAGE_END_CHARACTER`] is removed from the result.
async fn page_text(data: &[u8], page: u32, args: &PdfTextArgs) -> Result<String, PdfTextError> {
    let page = page.to_string();
    let page_range = ["-f", page.as_str(), "-l", page.as_str()];

    let mut text = run_pdftotext(data, &page_range, args).await?;
    if text.ends_with(PAGE_END_CHARACTER) {
        text.pop();
    }
    Ok(text)
}

/// Runs `pdftotext - -`, feeding `data` through stdin and returning stdout.
///
/// `page_range_args` are inserted after the `- -` input/output arguments and
/// before the arguments built from `args`.
///
/// # Panics
///
/// Panics if the spawned child has no stdin handle, which would mean the
/// `Stdio::piped()` request was not honoured.
async fn run_pdftotext(
    data: &[u8],
    page_range_args: &[&str],
    args: &PdfTextArgs,
) -> Result<String, PdfTextError> {
    let mut child = Command::new("pdftotext")
        .args(["-", "-"])
        .args(page_range_args)
        .args(args.build_args())
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()
        .map_err(PdfTextError::SpawnProcess)?;

    child
        .stdin
        .as_mut()
        .expect("process missing stdin after being piped")
        .write_all(data)
        .await
        .map_err(PdfTextError::WritePdf)?;

    // `wait_with_output` closes the stdin handle before waiting, which is
    // what signals end-of-input to the child.
    let output = child
        .wait_with_output()
        .await
        .map_err(PdfTextError::WaitOutput)?;

    if !output.status.success() {
        return Err(classify_failure(&output.stderr, args));
    }

    Ok(String::from_utf8_lossy(&output.stdout).into_owned())
}

/// Maps the stderr of a failed `pdftotext` run to the most specific error.
fn classify_failure(stderr: &[u8], args: &PdfTextArgs) -> PdfTextError {
    let message = String::from_utf8_lossy(stderr);

    if message.contains("May not be a PDF file") {
        PdfTextError::NotPdfFile
    } else if message.contains("Incorrect password") {
        // The tool prints the same message whether the password was wrong or
        // absent; tell the two apart by whether one was supplied.
        if args.password.is_none() {
            PdfTextError::PdfEncrypted
        } else {
            PdfTextError::IncorrectPassword
        }
    } else {
        PdfTextError::PdfTextFailure(message.into_owned())
    }
}
#[cfg(test)]
mod test {
    use crate::text::{page_text, pages_text, PdfTextArgs, PdfTextError};
    use tokio::fs::read;

    /// Input that is not a PDF is reported as `NotPdfFile`.
    #[tokio::test]
    async fn test_invalid_file() {
        let result = pages_text(&[b'A'], &PdfTextArgs::default()).await;
        let err = result.unwrap_err();
        assert!(matches!(err, PdfTextError::NotPdfFile));
    }

    /// Whole-document extraction keeps the trailing page terminator.
    #[tokio::test]
    async fn test_all_content() {
        let pdf = read("./tests/samples/test-pdf.pdf").await.unwrap();
        let text = pages_text(&pdf, &PdfTextArgs::default()).await.unwrap();
        assert_eq!(text.as_str(), "Test pdf with text in it\n\n\u{c}");
    }

    /// Single-page extraction returns only that page, without the terminator.
    #[tokio::test]
    async fn test_specific_page() {
        let pdf = read("./tests/samples/test-pdf-2-pages.pdf").await.unwrap();
        let cases = [
            (1, "Test pdf with text in it\n\n"),
            (2, "Test page 2\n\n"),
        ];
        for (page, expected) in cases {
            let text = page_text(&pdf, page, &PdfTextArgs::default())
                .await
                .unwrap();
            assert_eq!(text.as_str(), expected);
        }
    }
}