1use crate::{
2 ProcessingError, ProcessingIndexMetadata, ProcessingOutput, QueuedUpload,
3 image::create_img_bytes,
4};
5use docbox_database::models::generated_file::GeneratedFileType;
6use docbox_search::models::DocumentPage;
7use futures::TryFutureExt;
8use image::{DynamicImage, ImageError, ImageFormat, ImageResult};
9use mime::Mime;
10use pdf_process::{
11 OutputFormat, PdfInfo, PdfInfoArgs, PdfInfoError, PdfRenderError, PdfTextArgs, RenderArgs,
12 pdf_info, render_single_page, text::PAGE_END_CHARACTER, text_all_pages_split,
13};
14use std::str::Split;
15use thiserror::Error;
16use tokio::task::JoinError;
17
/// JPEG-encoded images derived from the first page of a PDF.
///
/// All three buffers hold encoded JPEG bytes, ready to be queued for upload.
pub struct GeneratedPdfImages {
    /// The rendered cover page, encoded at its rendered size.
    pub cover_page_jpeg: Vec<u8>,
    /// Small thumbnail of the cover page (uploaded as the small thumbnail).
    pub thumbnail_jpeg: Vec<u8>,
    /// Larger preview of the cover page (uploaded as the large thumbnail).
    pub large_thumbnail_jpeg: Vec<u8>,
}
27
28#[derive(Debug, Error)]
29pub enum GeneratePdfImagesError {
30 #[error("failed to render pdf: {0}")]
32 PdfRender(#[from] PdfRenderError),
33
34 #[error("error processing image: {0}")]
36 ImageError(#[from] ImageError),
37
38 #[error("error waiting for image processing")]
40 Threading(#[from] JoinError),
41}
42
43pub async fn process_pdf(file_bytes: &[u8]) -> Result<ProcessingOutput, ProcessingError> {
49 let pdf_info_args = PdfInfoArgs::default();
50
51 let pdf_info = match pdf_info(file_bytes, &pdf_info_args).await {
53 Ok(value) => value,
54 Err(PdfInfoError::PdfEncrypted) => {
56 return Ok(ProcessingOutput {
57 encrypted: true,
58 ..Default::default()
59 });
60 }
61 Err(PdfInfoError::NotPdfFile) => {
63 return Err(ProcessingError::MalformedFile(
64 "file was not a pdf file".to_string(),
65 ));
66 }
67
68 Err(cause) => {
70 tracing::error!(?cause, "failed to get pdf file info");
71 return Err(ProcessingError::ReadPdfInfo(cause));
72 }
73 };
74
75 let page_count = pdf_info
76 .pages()
77 .ok_or_else(|| {
78 ProcessingError::MalformedFile("failed to determine page count".to_string())
79 })?
80 .map_err(|err| {
81 ProcessingError::MalformedFile(format!(
82 "failed to convert pages number to integer: {err}"
83 ))
84 })?;
85
86 if page_count < 1 {
88 tracing::debug!("skipping processing on pdf with no pages");
89 return Ok(ProcessingOutput::default());
90 }
91
92 tracing::debug!("generating file thumbnails & extracting text content");
93
94 let text_args = PdfTextArgs::default();
95
96 let pages_text_future = text_all_pages_split(file_bytes, &text_args)
98 .map_err(ProcessingError::ExtractFileText);
100
101 let thumbnail_future = generate_pdf_images_async(&pdf_info, file_bytes)
103 .map_err(ProcessingError::GeneratePdfThumbnail);
104
105 let (pages, generated) = tokio::try_join!(pages_text_future, thumbnail_future)?;
106
107 let page_end = PAGE_END_CHARACTER.to_string();
109 let combined_text_content = pages.join(&page_end).as_bytes().to_vec();
110
111 let index_metadata = ProcessingIndexMetadata {
112 pages: Some(
113 pages
114 .into_iter()
115 .enumerate()
116 .map(|(page, content)| DocumentPage {
117 page: page as u64,
118 content,
119 })
120 .collect(),
121 ),
122 };
123
124 let upload_queue = vec![
125 QueuedUpload::new(
126 mime::IMAGE_JPEG,
127 GeneratedFileType::CoverPage,
128 generated.cover_page_jpeg.into(),
129 ),
130 QueuedUpload::new(
131 mime::IMAGE_JPEG,
132 GeneratedFileType::LargeThumbnail,
133 generated.large_thumbnail_jpeg.into(),
134 ),
135 QueuedUpload::new(
136 mime::IMAGE_JPEG,
137 GeneratedFileType::SmallThumbnail,
138 generated.thumbnail_jpeg.into(),
139 ),
140 QueuedUpload::new(
141 mime::TEXT_PLAIN,
142 GeneratedFileType::TextContent,
143 combined_text_content.into(),
144 ),
145 ];
146
147 Ok(ProcessingOutput {
148 encrypted: false,
149 additional_files: Default::default(),
150 index_metadata: Some(index_metadata),
151 upload_queue,
152 })
153}
154
155#[inline]
157pub fn is_pdf_file(mime: &Mime) -> bool {
158 if mime.eq(&mime::APPLICATION_PDF) {
159 return true;
160 }
161
162 if mime.type_() == mime::APPLICATION && mime.subtype().as_str() == "x-pdf" {
164 return true;
165 }
166
167 false
168}
169
170async fn render_pdf_cover(pdf_info: &PdfInfo, pdf: &[u8]) -> Result<DynamicImage, PdfRenderError> {
172 let args = RenderArgs::default();
173 let page = render_single_page(pdf, pdf_info, OutputFormat::Jpeg, 1, &args).await?;
174
175 Ok(page)
176}
177
178async fn generate_pdf_images_async(
180 pdf_info: &PdfInfo,
181 pdf: &[u8],
182) -> Result<GeneratedPdfImages, GeneratePdfImagesError> {
183 tracing::debug!("rendering pdf cover");
184 let page = render_pdf_cover(pdf_info, pdf).await?;
185
186 tracing::debug!("rendering pdf image variants");
187 let result = generate_pdf_images_variants_async(page).await?;
188 Ok(result)
189}
190
191async fn generate_pdf_images_variants_async(
193 cover_page: DynamicImage,
194) -> Result<GeneratedPdfImages, GeneratePdfImagesError> {
195 let result =
196 tokio::task::spawn_blocking(move || generate_pdf_images_variants(cover_page)).await??;
197 Ok(result)
198}
199
200fn generate_pdf_images_variants(cover_page: DynamicImage) -> ImageResult<GeneratedPdfImages> {
202 tracing::debug!("rendering pdf image variants");
203 let cover_page_jpeg = create_img_bytes(&cover_page, ImageFormat::Jpeg)?;
204
205 let thumbnail_jpeg = {
206 let thumbnail = cover_page.thumbnail(64, 64);
207 create_img_bytes(&thumbnail, ImageFormat::Jpeg)?
208 };
209
210 let large_thumbnail_jpeg = {
211 let cover_page_preview = cover_page.resize(512, 512, image::imageops::FilterType::Triangle);
212 create_img_bytes(&cover_page_preview, ImageFormat::Jpeg)?
213 };
214
215 Ok(GeneratedPdfImages {
216 cover_page_jpeg,
217 thumbnail_jpeg,
218 large_thumbnail_jpeg,
219 })
220}
221
222pub fn split_pdf_text_pages(text: &str) -> Split<'_, char> {
224 text.split(PAGE_END_CHARACTER)
225}