docbox_core/processing/
pdf.rs

1use crate::{
2    files::generated::QueuedUpload,
3    processing::{
4        ProcessingError, ProcessingIndexMetadata, ProcessingOutput, image::create_img_bytes,
5    },
6};
7use anyhow::Context;
8use docbox_database::models::generated_file::GeneratedFileType;
9use docbox_search::models::DocumentPage;
10use futures::TryFutureExt;
11use image::{DynamicImage, ImageFormat};
12use mime::Mime;
13use pdf_process::{
14    OutputFormat, PdfInfo, PdfInfoArgs, PdfInfoError, PdfTextArgs, RenderArgs, pdf_info,
15    render_single_page, text_all_pages_split,
16};
17
18pub use pdf_process::text::PAGE_END_CHARACTER;
19
20/// Processes a PDF compatible file producing index data and generated files such as
21/// thumbnails and a converted pdf version
22///
23/// Extracts text from the PDF and creates multiple thumbnail preview images
24/// of the first page at various sizes
25pub async fn process_pdf(file_bytes: &[u8]) -> Result<ProcessingOutput, ProcessingError> {
26    let pdf_info_args = PdfInfoArgs::default();
27
28    // Load the pdf information
29    let pdf_info = match pdf_info(file_bytes, &pdf_info_args).await {
30        Ok(value) => value,
31        // Skip processing encrypted pdf files
32        Err(PdfInfoError::PdfEncrypted) => {
33            return Ok(ProcessingOutput {
34                encrypted: true,
35                ..Default::default()
36            });
37        }
38        // Handle invalid file
39        Err(PdfInfoError::NotPdfFile) => {
40            return Err(ProcessingError::MalformedFile(
41                "file was not a pdf file".to_string(),
42            ));
43        }
44
45        // Handle other errors
46        Err(cause) => {
47            tracing::error!(?cause, "failed to get pdf file info");
48            return Err(ProcessingError::ReadPdfInfo(cause));
49        }
50    };
51
52    let page_count = pdf_info
53        .pages()
54        .ok_or_else(|| {
55            ProcessingError::MalformedFile("failed to determine page count".to_string())
56        })?
57        .map_err(|err| {
58            ProcessingError::MalformedFile(format!(
59                "failed to convert pages number to integer: {err}"
60            ))
61        })?;
62
63    // For processing the pdf file must have minimum 1 page
64    if page_count < 1 {
65        tracing::debug!("skipping processing on pdf with no pages");
66        return Ok(ProcessingOutput::default());
67    }
68
69    tracing::debug!("generating file thumbnails & extracting text content");
70
71    let text_args = PdfTextArgs::default();
72
73    // Extract pdf text
74    let pages_text_future = text_all_pages_split(file_bytes, &text_args)
75        // Match outer result type with inner type
76        .map_err(ProcessingError::ExtractFileText);
77
78    // Generate pdf thumbnails
79    let thumbnail_future = generate_pdf_images_async(&pdf_info, file_bytes)
80        .map_err(ProcessingError::GenerateThumbnail);
81
82    let (pages, generated) = tokio::try_join!(pages_text_future, thumbnail_future)?;
83
84    // Create a combined text content using the PDF page end character
85    let page_end = PAGE_END_CHARACTER.to_string();
86    let combined_text_content = pages.join(&page_end).as_bytes().to_vec();
87
88    let index_metadata = ProcessingIndexMetadata {
89        pages: Some(
90            pages
91                .into_iter()
92                .enumerate()
93                .map(|(page, content)| DocumentPage {
94                    page: page as u64,
95                    content,
96                })
97                .collect(),
98        ),
99    };
100
101    let upload_queue = vec![
102        QueuedUpload::new(
103            mime::IMAGE_JPEG,
104            GeneratedFileType::CoverPage,
105            generated.cover_page_jpeg.into(),
106        ),
107        QueuedUpload::new(
108            mime::IMAGE_JPEG,
109            GeneratedFileType::LargeThumbnail,
110            generated.large_thumbnail_jpeg.into(),
111        ),
112        QueuedUpload::new(
113            mime::IMAGE_JPEG,
114            GeneratedFileType::SmallThumbnail,
115            generated.thumbnail_jpeg.into(),
116        ),
117        QueuedUpload::new(
118            mime::TEXT_PLAIN,
119            GeneratedFileType::TextContent,
120            combined_text_content.into(),
121        ),
122    ];
123
124    Ok(ProcessingOutput {
125        encrypted: false,
126        additional_files: Default::default(),
127        index_metadata: Some(index_metadata),
128        upload_queue,
129    })
130}
131
132#[inline]
133pub fn is_pdf_file(mime: &Mime) -> bool {
134    if mime.eq(&mime::APPLICATION_PDF) {
135        return true;
136    }
137
138    // Some outdated clients use application/x-pdf for pdfs
139    if mime.type_() == mime::APPLICATION && mime.subtype().as_str() == "x-pdf" {
140        return true;
141    }
142
143    false
144}
145
146/// Renders the cover page for a PDF file
147async fn render_pdf_cover(pdf_info: &PdfInfo, pdf: &[u8]) -> anyhow::Result<DynamicImage> {
148    let args = RenderArgs::default();
149
150    // Render the pdf cover page
151    let page = render_single_page(pdf, pdf_info, OutputFormat::Jpeg, 1, &args)
152        .await
153        .context("failed to render pdf page")?;
154
155    Ok(page)
156}
157
/// JPEG encoded images generated from the cover (first) page of a PDF
pub struct GeneratedPdfImages {
    /// Rendered full sized first page
    pub cover_page_jpeg: Vec<u8>,
    /// Small thumbnail of the first page, scaled to fit within 64x64
    pub thumbnail_jpeg: Vec<u8>,
    /// Smaller version of the first page, scaled to fit within 512x512
    /// (Not actually 512x512 fits whatever the image aspect ratio inside those dimensions)
    pub large_thumbnail_jpeg: Vec<u8>,
}
167
168async fn generate_pdf_images_async(
169    pdf_info: &PdfInfo,
170    pdf: &[u8],
171) -> anyhow::Result<GeneratedPdfImages> {
172    tracing::debug!("rendering pdf cover");
173    let page = render_pdf_cover(pdf_info, pdf).await?;
174
175    tracing::debug!("rendering pdf image variants");
176    tokio::task::spawn_blocking(move || generate_pdf_images_variants(page))
177        .await
178        .context("failed to process image preview")
179        .and_then(|value| value)
180}
181
182/// Generates the various versions of the PDF cover images
183fn generate_pdf_images_variants(cover_page: DynamicImage) -> anyhow::Result<GeneratedPdfImages> {
184    tracing::debug!("rendering pdf image variants");
185    let cover_page_jpeg = create_img_bytes(&cover_page, ImageFormat::Jpeg)?;
186
187    let thumbnail_jpeg = {
188        let thumbnail = cover_page.thumbnail(64, 64);
189        create_img_bytes(&thumbnail, ImageFormat::Jpeg)?
190    };
191
192    let large_thumbnail_jpeg = {
193        let cover_page_preview = cover_page.resize(512, 512, image::imageops::FilterType::Triangle);
194        create_img_bytes(&cover_page_preview, ImageFormat::Jpeg)?
195    };
196
197    Ok(GeneratedPdfImages {
198        cover_page_jpeg,
199        thumbnail_jpeg,
200        large_thumbnail_jpeg,
201    })
202}