docbox_processing/
pdf.rs

1use crate::{
2    ProcessingError, ProcessingIndexMetadata, ProcessingOutput, QueuedUpload,
3    image::create_img_bytes,
4};
5use docbox_database::models::generated_file::GeneratedFileType;
6use docbox_search::models::DocumentPage;
7use futures::TryFutureExt;
8use image::{DynamicImage, ImageError, ImageFormat, ImageResult};
9use mime::Mime;
10use pdf_process::{
11    OutputFormat, PdfInfo, PdfInfoArgs, PdfInfoError, PdfRenderError, PdfTextArgs, RenderArgs,
12    pdf_info, render_single_page, text::PAGE_END_CHARACTER, text_all_pages_split,
13};
14use std::str::Split;
15use thiserror::Error;
16use tokio::task::JoinError;
17
/// JPEG encoded images generated from the first page of a PDF
pub struct GeneratedPdfImages {
    /// Rendered full sized first page
    pub cover_page_jpeg: Vec<u8>,
    /// Small file thumbnail, scaled down to fit within 64x64
    pub thumbnail_jpeg: Vec<u8>,
    /// Smaller version of the first page, scaled to fit within 512x512
    /// (Not actually 512x512 fits whatever the image aspect ratio inside those dimensions)
    pub large_thumbnail_jpeg: Vec<u8>,
}
27
28#[derive(Debug, Error)]
29pub enum GeneratePdfImagesError {
30    /// PDF rendering error
31    #[error("failed to render pdf: {0}")]
32    PdfRender(#[from] PdfRenderError),
33
34    /// Image processing error
35    #[error("error processing image: {0}")]
36    ImageError(#[from] ImageError),
37
38    /// Failed to join the image processing thread output
39    #[error("error waiting for image processing")]
40    Threading(#[from] JoinError),
41}
42
43/// Processes a PDF compatible file producing index data and generated files such as
44/// thumbnails and a converted pdf version
45///
46/// Extracts text from the PDF and creates multiple thumbnail preview images
47/// of the first page at various sizes
48pub async fn process_pdf(file_bytes: &[u8]) -> Result<ProcessingOutput, ProcessingError> {
49    let pdf_info_args = PdfInfoArgs::default();
50
51    // Load the pdf information
52    let pdf_info = match pdf_info(file_bytes, &pdf_info_args).await {
53        Ok(value) => value,
54        // Skip processing encrypted pdf files
55        Err(PdfInfoError::PdfEncrypted) => {
56            return Ok(ProcessingOutput {
57                encrypted: true,
58                ..Default::default()
59            });
60        }
61        // Handle invalid file
62        Err(PdfInfoError::NotPdfFile) => {
63            return Err(ProcessingError::MalformedFile(
64                "file was not a pdf file".to_string(),
65            ));
66        }
67
68        // Handle other errors
69        Err(cause) => {
70            tracing::error!(?cause, "failed to get pdf file info");
71            return Err(ProcessingError::ReadPdfInfo(cause));
72        }
73    };
74
75    let page_count = pdf_info
76        .pages()
77        .ok_or_else(|| {
78            ProcessingError::MalformedFile("failed to determine page count".to_string())
79        })?
80        .map_err(|err| {
81            ProcessingError::MalformedFile(format!(
82                "failed to convert pages number to integer: {err}"
83            ))
84        })?;
85
86    // For processing the pdf file must have minimum 1 page
87    if page_count < 1 {
88        tracing::debug!("skipping processing on pdf with no pages");
89        return Ok(ProcessingOutput::default());
90    }
91
92    tracing::debug!("generating file thumbnails & extracting text content");
93
94    let text_args = PdfTextArgs::default();
95
96    // Extract pdf text
97    let pages_text_future = text_all_pages_split(file_bytes, &text_args)
98        // Match outer result type with inner type
99        .map_err(ProcessingError::ExtractFileText);
100
101    // Generate pdf thumbnails
102    let thumbnail_future = generate_pdf_images_async(&pdf_info, file_bytes)
103        .map_err(ProcessingError::GeneratePdfThumbnail);
104
105    let (pages, generated) = tokio::try_join!(pages_text_future, thumbnail_future)?;
106
107    // Create a combined text content using the PDF page end character
108    let page_end = PAGE_END_CHARACTER.to_string();
109    let combined_text_content = pages.join(&page_end).as_bytes().to_vec();
110
111    let index_metadata = ProcessingIndexMetadata {
112        pages: Some(
113            pages
114                .into_iter()
115                .enumerate()
116                .map(|(page, content)| DocumentPage {
117                    page: page as u64,
118                    content,
119                })
120                .collect(),
121        ),
122    };
123
124    let upload_queue = vec![
125        QueuedUpload::new(
126            mime::IMAGE_JPEG,
127            GeneratedFileType::CoverPage,
128            generated.cover_page_jpeg.into(),
129        ),
130        QueuedUpload::new(
131            mime::IMAGE_JPEG,
132            GeneratedFileType::LargeThumbnail,
133            generated.large_thumbnail_jpeg.into(),
134        ),
135        QueuedUpload::new(
136            mime::IMAGE_JPEG,
137            GeneratedFileType::SmallThumbnail,
138            generated.thumbnail_jpeg.into(),
139        ),
140        QueuedUpload::new(
141            mime::TEXT_PLAIN,
142            GeneratedFileType::TextContent,
143            combined_text_content.into(),
144        ),
145    ];
146
147    Ok(ProcessingOutput {
148        encrypted: false,
149        additional_files: Default::default(),
150        index_metadata: Some(index_metadata),
151        upload_queue,
152    })
153}
154
155/// Check if the provided mime type is for a PDF
156#[inline]
157pub fn is_pdf_file(mime: &Mime) -> bool {
158    if mime.eq(&mime::APPLICATION_PDF) {
159        return true;
160    }
161
162    // Some outdated clients use application/x-pdf for pdf
163    if mime.type_() == mime::APPLICATION && mime.subtype().as_str() == "x-pdf" {
164        return true;
165    }
166
167    false
168}
169
170/// Renders the cover page for a PDF file
171async fn render_pdf_cover(pdf_info: &PdfInfo, pdf: &[u8]) -> Result<DynamicImage, PdfRenderError> {
172    let args = RenderArgs::default();
173    let page = render_single_page(pdf, pdf_info, OutputFormat::Jpeg, 1, &args).await?;
174
175    Ok(page)
176}
177
178/// Asynchronously generate pdf cover image and its variants
179async fn generate_pdf_images_async(
180    pdf_info: &PdfInfo,
181    pdf: &[u8],
182) -> Result<GeneratedPdfImages, GeneratePdfImagesError> {
183    tracing::debug!("rendering pdf cover");
184    let page = render_pdf_cover(pdf_info, pdf).await?;
185
186    tracing::debug!("rendering pdf image variants");
187    let result = generate_pdf_images_variants_async(page).await?;
188    Ok(result)
189}
190
191/// Async wrapper around [generate_pdf_images_variants]
192async fn generate_pdf_images_variants_async(
193    cover_page: DynamicImage,
194) -> Result<GeneratedPdfImages, GeneratePdfImagesError> {
195    let result =
196        tokio::task::spawn_blocking(move || generate_pdf_images_variants(cover_page)).await??;
197    Ok(result)
198}
199
200/// Generates the various versions of the PDF cover images
201fn generate_pdf_images_variants(cover_page: DynamicImage) -> ImageResult<GeneratedPdfImages> {
202    tracing::debug!("rendering pdf image variants");
203    let cover_page_jpeg = create_img_bytes(&cover_page, ImageFormat::Jpeg)?;
204
205    let thumbnail_jpeg = {
206        let thumbnail = cover_page.thumbnail(64, 64);
207        create_img_bytes(&thumbnail, ImageFormat::Jpeg)?
208    };
209
210    let large_thumbnail_jpeg = {
211        let cover_page_preview = cover_page.resize(512, 512, image::imageops::FilterType::Triangle);
212        create_img_bytes(&cover_page_preview, ImageFormat::Jpeg)?
213    };
214
215    Ok(GeneratedPdfImages {
216        cover_page_jpeg,
217        thumbnail_jpeg,
218        large_thumbnail_jpeg,
219    })
220}
221
222/// Split the merged text content of a PDF based on the page end character (Split by page)
223pub fn split_pdf_text_pages(text: &str) -> Split<'_, char> {
224    text.split(PAGE_END_CHARACTER)
225}