use futures::{future::BoxFuture, stream::FuturesOrdered, TryStreamExt};
use std::process::Stdio;
use tokio::io::AsyncWriteExt;
use tokio::process::Command;
use crate::error::{PDF2ImageError, Result};
use crate::render_options::RenderOptions;
/// Basic facts about a PDF document: how many pages it has and whether it is
/// encrypted.
///
/// Values are obtained through [`PdfInfo::read`] and exposed read-only via
/// [`PdfInfo::page_count`] and [`PdfInfo::is_encrypted`].
#[derive(Debug, Clone)]
pub struct PdfInfo {
    /// Number of pages in the document.
    page_count: u32,
    /// `true` when the document is reported as encrypted.
    encrypted: bool,
}
impl PdfInfo {
    /// Inspects the PDF bytes in `data` via [`extract_pdf_info`] and wraps the
    /// resulting page count and encryption flag.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`extract_pdf_info`].
    pub async fn read(data: &[u8]) -> Result<Self> {
        extract_pdf_info(data)
            .await
            .map(|(page_count, encrypted)| Self {
                page_count,
                encrypted,
            })
    }

    /// Number of pages in the document.
    pub fn page_count(&self) -> u32 {
        self.page_count
    }

    /// Whether the document was reported as encrypted.
    pub fn is_encrypted(&self) -> bool {
        self.encrypted
    }
}
/// Selects which pages a multi-page operation should process.
///
/// The multi-page functions in this file drop any requested page number that
/// falls outside the range they consider valid for the document instead of
/// reporting an error. Page numbers are forwarded unchanged to the poppler
/// tools' `-f` / `-l` options.
#[derive(Debug, Clone)]
pub enum Pages {
/// Every page of the document.
All,
/// An inclusive, contiguous range of page numbers.
Range(std::ops::RangeInclusive<u32>),
/// An explicit list of page numbers, processed in the order given.
Specific(Vec<u32>),
}
/// Renders one page of the PDF in `data` to an image.
///
/// `info` is consulted only for its encryption flag; `page` is passed straight
/// to the renderer without being checked against the page count.
///
/// # Errors
///
/// * [`PDF2ImageError::NoPasswordForEncryptedPDF`] when the document is
///   encrypted and `options.password` is `None`.
/// * Any error produced by `render_page`.
pub async fn render_pdf_single_page<'data, 'options: 'data>(
    data: &'data [u8],
    info: &'options PdfInfo,
    page: u32,
    options: &'options RenderOptions,
) -> Result<image::DynamicImage> {
    let password_missing = info.encrypted && options.password.is_none();
    if password_missing {
        Err(PDF2ImageError::NoPasswordForEncryptedPDF)
    } else {
        render_page(data, page, options).await
    }
}
/// Renders several pages of the PDF in `data` to images.
///
/// The requested `pages` are first narrowed to the pages that exist in the
/// document (`1..=info.page_count()`); anything outside that range is silently
/// skipped. One render future is created per remaining page and all of them
/// are driven through a [`FuturesOrdered`], so the returned vector follows the
/// order in which the pages were requested.
///
/// # Errors
///
/// * [`PDF2ImageError::NoPasswordForEncryptedPDF`] when the document is
///   encrypted and `options.password` is `None`.
/// * The first error produced while rendering any page.
pub async fn render_pdf_multi_page<'data, 'options: 'data>(
    data: &'data [u8],
    info: &'options PdfInfo,
    pages: Pages,
    options: &'options RenderOptions,
) -> Result<Vec<image::DynamicImage>> {
    if info.encrypted && options.password.is_none() {
        return Err(PDF2ImageError::NoPasswordForEncryptedPDF);
    }

    // Poppler numbers pages starting at 1. Page 0 does not exist, so it must
    // not be part of the valid range (otherwise `Pages::All` would request one
    // page too many).
    let valid_range = 1..=info.page_count;
    let pages_range: Vec<u32> = match pages {
        Pages::All => valid_range.collect(),
        Pages::Range(range) => range
            .filter(|value| valid_range.contains(value))
            .collect(),
        Pages::Specific(list) => list
            .into_iter()
            .filter(|value| valid_range.contains(value))
            .collect(),
    };

    pages_range
        .into_iter()
        .map(|page| -> BoxFuture<'data, Result<image::DynamicImage>> {
            Box::pin(render_page(data, page, options))
        })
        .collect::<FuturesOrdered<BoxFuture<'data, Result<image::DynamicImage>>>>()
        .try_collect()
        .await
}
/// Renders a single page to a JPEG by piping the PDF through a poppler tool
/// and decoding what the tool writes to stdout.
///
/// `options.pdftocairo` selects between `pdftocairo` and `pdftoppm`; the
/// executable location is resolved with [`get_executable_path`]. The page is
/// selected by passing the same number to both `-f` and `-l`, followed by the
/// arguments produced by `options.to_cli_args()`.
///
/// # Errors
///
/// Fails when the process cannot be spawned, when writing to its stdin or
/// collecting its output fails, or when the captured stdout cannot be decoded
/// as a JPEG image.
async fn render_page<'data, 'options: 'data>(
    data: &'data [u8],
    page: u32,
    options: &'options RenderOptions,
) -> Result<image::DynamicImage> {
    let extra_args = options.to_cli_args();

    let use_cairo = options.pdftocairo;
    let tool = if use_cairo { "pdftocairo" } else { "pdftoppm" };
    // `pdftocairo` is given explicit "-" placeholders for input and output;
    // `pdftoppm` is invoked without them.
    let base_flags: &[&str] = match use_cairo {
        true => &["-", "-", "-jpeg", "-singlefile"],
        false => &["-jpeg", "-singlefile"],
    };
    let page_arg = page.to_string();

    let mut process = Command::new(get_executable_path(tool))
        .args(base_flags)
        .arg("-f")
        .arg(&page_arg)
        .arg("-l")
        .arg(&page_arg)
        .args(extra_args)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;

    // Feed the whole document to the tool before collecting its output.
    let stdin = process.stdin.as_mut().unwrap();
    stdin.write_all(data).await?;

    let captured = process.wait_with_output().await?;
    Ok(image::load_from_memory_with_format(
        &captured.stdout,
        image::ImageFormat::Jpeg,
    )?)
}
/// Extracts the text of one page of the PDF in `data`.
///
/// `info` is consulted only for its encryption flag; `page` is passed straight
/// to the extractor without being checked against the page count.
///
/// # Errors
///
/// * [`PDF2ImageError::NoPasswordForEncryptedPDF`] when the document is
///   encrypted and `options.password` is `None`.
/// * Any error produced by `render_page_text`.
pub async fn pdftext_single_page<'data, 'options: 'data>(
    data: &'data [u8],
    info: &'options PdfInfo,
    page: u32,
    options: &'options RenderOptions,
) -> Result<String> {
    let locked = info.encrypted && options.password.is_none();
    if !locked {
        return render_page_text(data, page, options).await;
    }
    Err(PDF2ImageError::NoPasswordForEncryptedPDF)
}
/// Extracts the text of several pages of the PDF in `data`, one extraction per
/// page, and concatenates the results.
///
/// The requested `pages` are first narrowed to the pages that exist in the
/// document (`1..=info.page_count()`); anything outside that range is silently
/// skipped. The per-page futures are driven through a [`FuturesOrdered`], so
/// the page texts are appended to the returned string in the order requested.
///
/// # Errors
///
/// * [`PDF2ImageError::NoPasswordForEncryptedPDF`] when the document is
///   encrypted and `options.password` is `None`.
/// * The first error produced while extracting any page.
pub async fn pdftext_multi_page<'data, 'options: 'data>(
    data: &'data [u8],
    info: &'options PdfInfo,
    pages: Pages,
    options: &'options RenderOptions,
) -> Result<String> {
    if info.encrypted && options.password.is_none() {
        return Err(PDF2ImageError::NoPasswordForEncryptedPDF);
    }

    // Poppler numbers pages starting at 1. Page 0 does not exist, so it must
    // not be part of the valid range (otherwise `Pages::All` would issue an
    // extra `-f 0 -l 0` extraction and duplicate text).
    let valid_range = 1..=info.page_count;
    let pages_range: Vec<u32> = match pages {
        Pages::All => valid_range.collect(),
        Pages::Range(range) => range
            .filter(|value| valid_range.contains(value))
            .collect(),
        Pages::Specific(list) => list
            .into_iter()
            .filter(|value| valid_range.contains(value))
            .collect(),
    };

    pages_range
        .into_iter()
        .map(|page| -> BoxFuture<'data, Result<String>> {
            Box::pin(render_page_text(data, page, options))
        })
        .collect::<FuturesOrdered<BoxFuture<'data, Result<String>>>>()
        .try_collect()
        .await
}
/// Extracts text from the PDF in `data`, using a single whole-document
/// extraction when every page is requested.
///
/// * [`Pages::All`] delegates to `render_all_pages_text`, which runs the
///   extractor once over the entire document.
/// * [`Pages::Range`] and [`Pages::Specific`] are narrowed to the pages that
///   exist in the document (`1..=info.page_count()`), extracted one page at a
///   time, and concatenated in the order requested.
///
/// # Errors
///
/// * [`PDF2ImageError::NoPasswordForEncryptedPDF`] when the document is
///   encrypted and `options.password` is `None`.
/// * The first error produced by the underlying extraction.
pub async fn pdftext_all_pages<'data, 'options: 'data>(
    data: &'data [u8],
    info: &'options PdfInfo,
    pages: Pages,
    options: &'options RenderOptions,
) -> Result<String> {
    if info.encrypted && options.password.is_none() {
        return Err(PDF2ImageError::NoPasswordForEncryptedPDF);
    }

    // Poppler numbers pages starting at 1, so page 0 is never a valid request.
    let valid_range = 1..=info.page_count;
    let pages_range: Vec<u32> = match pages {
        Pages::All => return render_all_pages_text(data, options).await,
        Pages::Range(range) => range
            .filter(|value| valid_range.contains(value))
            .collect(),
        Pages::Specific(list) => list
            .into_iter()
            .filter(|value| valid_range.contains(value))
            .collect(),
    };

    pages_range
        .into_iter()
        .map(|page| -> BoxFuture<'data, Result<String>> {
            Box::pin(render_page_text(data, page, options))
        })
        .collect::<FuturesOrdered<BoxFuture<'data, Result<String>>>>()
        .try_collect()
        .await
}
async fn render_page_text<'data, 'options: 'data>(
data: &'data [u8],
page: u32,
options: &'options RenderOptions,
) -> Result<String> {
let cli_options = options.to_cli_args();
let mut child = Command::new("pdftotext")
.args(["-", "-"])
.args([
"-f".to_string(),
format!("{page}"),
"-l".to_string(),
format!("{page}"),
])
.args(cli_options)
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.spawn()?;
child.stdin.as_mut().unwrap().write_all(data).await?;
let output = child.wait_with_output().await?;
let value = String::from_utf8_lossy(&output.stdout);
Ok(value.into_owned())
}
async fn render_all_pages_text<'data, 'options: 'data>(
data: &'data [u8],
options: &'options RenderOptions,
) -> Result<String> {
let cli_options = options.to_cli_args();
let mut child = Command::new("pdftotext")
.args(["-", "-"])
.args(cli_options)
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.spawn()?;
child.stdin.as_mut().unwrap().write_all(data).await?;
let output = child.wait_with_output().await?;
let value = String::from_utf8_lossy(&output.stdout);
Ok(value.into_owned())
}
/// Builds the path used to launch the poppler tool named `command`.
///
/// On Windows the file name is `command` with `.exe` appended; elsewhere it is
/// `command` unchanged. When the `PDF2IMAGE_POPPLER_PATH` environment variable
/// can be read, its value is prepended, joined with `\` on Windows and `/` on
/// other systems. Otherwise the bare file name is returned.
pub fn get_executable_path(command: &str) -> String {
    let on_windows = cfg!(target_os = "windows");

    let file_name = if on_windows {
        format!("{}.exe", command)
    } else {
        command.to_string()
    };

    match std::env::var("PDF2IMAGE_POPPLER_PATH") {
        Ok(directory) => {
            let separator = if on_windows { "\\" } else { "/" };
            format!("{}{}{}", directory, separator, file_name)
        }
        Err(_) => file_name,
    }
}
/// Runs poppler's `pdfinfo` on the PDF bytes in `pdf` and returns
/// `(page_count, encrypted)` parsed from its output.
///
/// The document is piped to the tool on stdin. The output is scanned line by
/// line for the `Pages:` and `Encrypted:` entries.
///
/// # Errors
///
/// * An I/O error when the process cannot be spawned, written to, or waited on.
/// * [`PDF2ImageError::UnableToExtractPageCount`] when no `Pages:` line is
///   present or its value is not a valid `u32`.
/// * [`PDF2ImageError::UnableToExtractEncryptionStatus`] when no `Encrypted:`
///   line is present or its value is neither `yes` nor `no`.
/// * A UTF-8 conversion error when a matched line is not valid UTF-8.
pub async fn extract_pdf_info(pdf: &[u8]) -> Result<(u32, bool)> {
    let mut child = Command::new(get_executable_path("pdfinfo"))
        .args(["-"])
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;
    child
        .stdin
        .as_mut()
        .expect("stdin was configured as piped")
        .write_all(pdf)
        .await?;
    let output = child.wait_with_output().await?;

    let mut splits = output.stdout.split(|&x| x == b'\n');

    // Search a clone so the second lookup below still starts from the first
    // line, regardless of the order in which the two entries are printed.
    let page_count: u32 = splits
        .clone()
        .find(|line| line.starts_with(b"Pages:"))
        .map(|line| {
            let line = std::str::from_utf8(line)?;
            let pg_str = line
                .split_whitespace()
                .last()
                .ok_or(PDF2ImageError::UnableToExtractPageCount)?;
            pg_str
                .parse::<u32>()
                .map_err(|_| PDF2ImageError::UnableToExtractPageCount)
        })
        .ok_or(PDF2ImageError::UnableToExtractPageCount)??;

    let encrypted = splits
        .find(|line| line.starts_with(b"Encrypted:"))
        .map(|line| {
            let line = std::str::from_utf8(line)?;
            Ok(
                match line
                    .split_whitespace()
                    // For encrypted files `pdfinfo` prints
                    // `Encrypted: yes (print:yes copy:yes ... algorithm:AES)`,
                    // so the flag is the first token after the label, not the
                    // last token on the line.
                    .nth(1)
                    .ok_or(PDF2ImageError::UnableToExtractEncryptionStatus)?
                {
                    "yes" => true,
                    "no" => false,
                    _ => return Err(PDF2ImageError::UnableToExtractEncryptionStatus),
                },
            )
        })
        .ok_or(PDF2ImageError::UnableToExtractEncryptionStatus)??;

    Ok((page_count, encrypted))
}