pdf2image_alt/
pdf.rs

1use futures::{future::BoxFuture, stream::FuturesOrdered, TryStreamExt};
2use std::process::Stdio;
3use tokio::io::AsyncWriteExt;
4use tokio::process::Command;
5
6use crate::error::{PDF2ImageError, Result};
7use crate::render_options::RenderOptions;
8
/// Basic metadata about a PDF document: its page count and whether it is
/// encrypted.
///
/// Instances are normally obtained through [`PdfInfo::read`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PdfInfo {
    /// The page count within the pdf
    page_count: u32,
    /// Whether the PDF is encrypted
    encrypted: bool,
}
15
16impl PdfInfo {
17    pub async fn read(data: &[u8]) -> Result<Self> {
18        let (page_count, encrypted) = extract_pdf_info(data).await?;
19
20        Ok(Self {
21            page_count,
22            encrypted,
23        })
24    }
25
26    /// Returns the number of pages in the PDF.
27    pub fn page_count(&self) -> u32 {
28        self.page_count
29    }
30
31    /// Returns whether the PDF is encrypted.
32    pub fn is_encrypted(&self) -> bool {
33        self.encrypted
34    }
35}
36
/// Specifies which pages to render.
///
/// Page numbers are 1-based: they are forwarded to Poppler's `-f`/`-l`
/// options, which count the first page as page 1.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Pages {
    /// Every page of the document.
    All,
    /// An inclusive range of page numbers.
    Range(std::ops::RangeInclusive<u32>),
    /// An explicit list of page numbers, processed in the given order.
    Specific(Vec<u32>),
}
44
45/// Renders the PDF to images.
46pub async fn render_pdf_single_page<'data, 'options: 'data>(
47    data: &'data [u8],
48    info: &'options PdfInfo,
49    page: u32,
50    options: &'options RenderOptions,
51) -> Result<image::DynamicImage> {
52    if info.encrypted && options.password.is_none() {
53        return Err(PDF2ImageError::NoPasswordForEncryptedPDF);
54    }
55
56    let image = render_page(data, page, options).await?;
57
58    Ok(image)
59}
60
61/// Renders the PDF to images.
62pub async fn render_pdf_multi_page<'data, 'options: 'data>(
63    data: &'data [u8],
64    info: &'options PdfInfo,
65    pages: Pages,
66    options: &'options RenderOptions,
67) -> Result<Vec<image::DynamicImage>> {
68    if info.encrypted && options.password.is_none() {
69        return Err(PDF2ImageError::NoPasswordForEncryptedPDF);
70    }
71
72    let valid_range = 0..=info.page_count;
73
74    let pages_range: Vec<u32> = match pages {
75        Pages::All => valid_range.collect(),
76        Pages::Range(range) => range // Filter only valid pages
77            .filter(|value| valid_range.contains(value))
78            .collect(),
79        Pages::Specific(pages) => pages // Filter only valid pages
80            .into_iter()
81            .filter(|value| valid_range.contains(value))
82            .collect(),
83    };
84
85    pages_range
86        .into_iter()
87        .map(|page| -> BoxFuture<'data, Result<image::DynamicImage>> {
88            Box::pin(render_page(data, page, options))
89        })
90        .collect::<FuturesOrdered<BoxFuture<'data, Result<image::DynamicImage>>>>()
91        .try_collect()
92        .await
93}
94
95/// Renders a specific page from the pdf file
96async fn render_page<'data, 'options: 'data>(
97    data: &'data [u8],
98    page: u32,
99    options: &'options RenderOptions,
100) -> Result<image::DynamicImage> {
101    let cli_options = options.to_cli_args();
102
103    let executable = get_executable_path(if options.pdftocairo {
104        "pdftocairo"
105    } else {
106        "pdftoppm"
107    });
108
109    let poppler_args: &[&str] = if options.pdftocairo {
110        &["-", "-", "-jpeg", "-singlefile"]
111    } else {
112        &["-jpeg", "-singlefile"]
113    };
114
115    let mut child = Command::new(&executable)
116        // Add the poppler args
117        .args(poppler_args)
118        // Add the page args
119        .args([
120            "-f".to_string(),
121            format!("{page}"),
122            "-l".to_string(),
123            format!("{page}"),
124        ])
125        // Add the cli options
126        .args(cli_options)
127        // Pipe input and output for use
128        .stdin(Stdio::piped())
129        .stdout(Stdio::piped())
130        .spawn()?;
131
132    // UNWRAP SAFETY: The child process is guaranteed to have a stdin as .stdin(Stdio::piped()) was called
133    child.stdin.as_mut().unwrap().write_all(data).await?;
134
135    let output = child.wait_with_output().await?;
136    let image = image::load_from_memory_with_format(&output.stdout, image::ImageFormat::Jpeg)?;
137
138    Ok(image)
139}
140
141/// Extracts the text contents of a pdf file from a single page
142pub async fn pdftext_single_page<'data, 'options: 'data>(
143    data: &'data [u8],
144    info: &'options PdfInfo,
145    page: u32,
146    options: &'options RenderOptions,
147) -> Result<String> {
148    if info.encrypted && options.password.is_none() {
149        return Err(PDF2ImageError::NoPasswordForEncryptedPDF);
150    }
151
152    let image = render_page_text(data, page, options).await?;
153
154    Ok(image)
155}
156
157/// Extracts the text contents of a pdf file from multiple page
158///
159/// If you want all pages as a string it will likely be more performant
160/// to use [pdftext_multi_page]
161pub async fn pdftext_multi_page<'data, 'options: 'data>(
162    data: &'data [u8],
163    info: &'options PdfInfo,
164    pages: Pages,
165    options: &'options RenderOptions,
166) -> Result<String> {
167    if info.encrypted && options.password.is_none() {
168        return Err(PDF2ImageError::NoPasswordForEncryptedPDF);
169    }
170
171    let valid_range = 0..=info.page_count;
172
173    let pages_range: Vec<u32> = match pages {
174        Pages::All => valid_range.collect(),
175        Pages::Range(range) => range // Filter only valid pages
176            .filter(|value| valid_range.contains(value))
177            .collect(),
178        Pages::Specific(pages) => pages // Filter only valid pages
179            .into_iter()
180            .filter(|value| valid_range.contains(value))
181            .collect(),
182    };
183
184    pages_range
185        .into_iter()
186        .map(|page| -> BoxFuture<'data, Result<String>> {
187            Box::pin(render_page_text(data, page, options))
188        })
189        .collect::<FuturesOrdered<BoxFuture<'data, Result<String>>>>()
190        .try_collect()
191        .await
192}
193
194/// Extracts the text contents of a pdf file from all pages as
195/// one big string, use [pdftext_multi_page] to get a separate
196/// string for each page
197pub async fn pdftext_all_pages<'data, 'options: 'data>(
198    data: &'data [u8],
199    info: &'options PdfInfo,
200    pages: Pages,
201    options: &'options RenderOptions,
202) -> Result<String> {
203    if info.encrypted && options.password.is_none() {
204        return Err(PDF2ImageError::NoPasswordForEncryptedPDF);
205    }
206
207    let valid_range = 0..=info.page_count;
208
209    let pages_range: Vec<u32> = match pages {
210        Pages::All => return render_all_pages_text(data, options).await,
211        Pages::Range(range) => range // Filter only valid pages
212            .filter(|value| valid_range.contains(value))
213            .collect(),
214        Pages::Specific(pages) => pages // Filter only valid pages
215            .into_iter()
216            .filter(|value| valid_range.contains(value))
217            .collect(),
218    };
219
220    pages_range
221        .into_iter()
222        .map(|page| -> BoxFuture<'data, Result<String>> {
223            Box::pin(render_page_text(data, page, options))
224        })
225        .collect::<FuturesOrdered<BoxFuture<'data, Result<String>>>>()
226        .try_collect()
227        .await
228}
229
230/// Renders a specific page from the pdf file
231async fn render_page_text<'data, 'options: 'data>(
232    data: &'data [u8],
233    page: u32,
234    options: &'options RenderOptions,
235) -> Result<String> {
236    let cli_options = options.to_cli_args();
237
238    let mut child = Command::new("pdftotext")
239        // Take input from stdin and provide to stdout
240        .args(["-", "-"])
241        // Add the page args
242        .args([
243            "-f".to_string(),
244            format!("{page}"),
245            "-l".to_string(),
246            format!("{page}"),
247        ])
248        // Add the cli options
249        .args(cli_options)
250        // Pipe input and output for use
251        .stdin(Stdio::piped())
252        .stdout(Stdio::piped())
253        .spawn()?;
254
255    // UNWRAP SAFETY: The child process is guaranteed to have a stdin as .stdin(Stdio::piped()) was called
256    child.stdin.as_mut().unwrap().write_all(data).await?;
257
258    let output = child.wait_with_output().await?;
259    let value = String::from_utf8_lossy(&output.stdout);
260
261    Ok(value.into_owned())
262}
263/// Renders a specific page from the pdf file
264async fn render_all_pages_text<'data, 'options: 'data>(
265    data: &'data [u8],
266    options: &'options RenderOptions,
267) -> Result<String> {
268    let cli_options = options.to_cli_args();
269
270    let mut child = Command::new("pdftotext")
271        // Take input from stdin and provide to stdout
272        .args(["-", "-"])
273        // Add the cli options
274        .args(cli_options)
275        // Pipe input and output for use
276        .stdin(Stdio::piped())
277        .stdout(Stdio::piped())
278        .spawn()?;
279
280    // UNWRAP SAFETY: The child process is guaranteed to have a stdin as .stdin(Stdio::piped()) was called
281    child.stdin.as_mut().unwrap().write_all(data).await?;
282
283    let output = child.wait_with_output().await?;
284    let value = String::from_utf8_lossy(&output.stdout);
285
286    Ok(value.into_owned())
287}
288
/// Resolves the path used to launch the Poppler tool named `command`.
///
/// On Windows the tool name gets an `.exe` suffix. When the
/// `PDF2IMAGE_POPPLER_PATH` environment variable is set, its value is
/// prepended as the containing directory, joined with the platform's path
/// separator; otherwise the bare tool name is returned.
pub fn get_executable_path(command: &str) -> String {
    let on_windows = cfg!(target_os = "windows");

    let binary = if on_windows {
        format!("{command}.exe")
    } else {
        command.to_string()
    };

    match std::env::var("PDF2IMAGE_POPPLER_PATH") {
        Ok(directory) => {
            let separator = if on_windows { '\\' } else { '/' };
            format!("{directory}{separator}{binary}")
        }
        Err(_) => binary,
    }
}
304
305pub async fn extract_pdf_info(pdf: &[u8]) -> Result<(u32, bool)> {
306    let mut child = Command::new(get_executable_path("pdfinfo"))
307        .args(["-"])
308        .stdin(Stdio::piped())
309        .stdout(Stdio::piped())
310        .spawn()?;
311
312    // UNWRAP SAFETY: The child process is guaranteed to have a stdin as .stdin(Stdio::piped()) was called
313    child.stdin.as_mut().unwrap().write_all(pdf).await?;
314    let output = child.wait_with_output().await?;
315    let mut splits = output.stdout.split(|&x| x == b'\n');
316
317    let page_count: u32 = splits
318        .clone()
319        .find(|line| line.starts_with(b"Pages:"))
320        .map(|line| {
321            let line = std::str::from_utf8(line)?;
322            let pg_str = line
323                .split_whitespace()
324                .last()
325                .ok_or(PDF2ImageError::UnableToExtractPageCount)?;
326            pg_str
327                .parse::<u32>()
328                .map_err(|_| PDF2ImageError::UnableToExtractPageCount)
329        })
330        .ok_or(PDF2ImageError::UnableToExtractPageCount)??;
331
332    let encrypted = splits
333        .find(|line| line.starts_with(b"Encrypted:"))
334        .map(|line| {
335            let line = std::str::from_utf8(line)?;
336            Ok(
337                match line
338                    .split_whitespace()
339                    .last()
340                    .ok_or(PDF2ImageError::UnableToExtractEncryptionStatus)?
341                {
342                    "yes" => true,
343                    "no" => false,
344                    _ => return Err(PDF2ImageError::UnableToExtractEncryptionStatus),
345                },
346            )
347        })
348        .ok_or(PDF2ImageError::UnableToExtractEncryptionStatus)??;
349
350    Ok((page_count, encrypted))
351}