docbox_core/processing/
pdf.rs1use crate::{
2 files::generated::QueuedUpload,
3 processing::{
4 ProcessingError, ProcessingIndexMetadata, ProcessingOutput, image::create_img_bytes,
5 },
6};
7use anyhow::Context;
8use docbox_database::models::generated_file::GeneratedFileType;
9use docbox_search::models::DocumentPage;
10use futures::TryFutureExt;
11use image::{DynamicImage, ImageFormat};
12use mime::Mime;
13use pdf_process::{
14 OutputFormat, PdfInfo, PdfInfoArgs, PdfInfoError, PdfTextArgs, RenderArgs, pdf_info,
15 render_single_page, text_all_pages_split,
16};
17
18pub use pdf_process::text::PAGE_END_CHARACTER;
19
20pub async fn process_pdf(file_bytes: &[u8]) -> Result<ProcessingOutput, ProcessingError> {
26 let pdf_info_args = PdfInfoArgs::default();
27
28 let pdf_info = match pdf_info(file_bytes, &pdf_info_args).await {
30 Ok(value) => value,
31 Err(PdfInfoError::PdfEncrypted) => {
33 return Ok(ProcessingOutput {
34 encrypted: true,
35 ..Default::default()
36 });
37 }
38 Err(PdfInfoError::NotPdfFile) => {
40 return Err(ProcessingError::MalformedFile(
41 "file was not a pdf file".to_string(),
42 ));
43 }
44
45 Err(cause) => {
47 tracing::error!(?cause, "failed to get pdf file info");
48 return Err(ProcessingError::ReadPdfInfo(cause));
49 }
50 };
51
52 let page_count = pdf_info
53 .pages()
54 .ok_or_else(|| {
55 ProcessingError::MalformedFile("failed to determine page count".to_string())
56 })?
57 .map_err(|err| {
58 ProcessingError::MalformedFile(format!(
59 "failed to convert pages number to integer: {err}"
60 ))
61 })?;
62
63 if page_count < 1 {
65 tracing::debug!("skipping processing on pdf with no pages");
66 return Ok(ProcessingOutput::default());
67 }
68
69 tracing::debug!("generating file thumbnails & extracting text content");
70
71 let text_args = PdfTextArgs::default();
72
73 let pages_text_future = text_all_pages_split(file_bytes, &text_args)
75 .map_err(ProcessingError::ExtractFileText);
77
78 let thumbnail_future = generate_pdf_images_async(&pdf_info, file_bytes)
80 .map_err(ProcessingError::GenerateThumbnail);
81
82 let (pages, generated) = tokio::try_join!(pages_text_future, thumbnail_future)?;
83
84 let page_end = PAGE_END_CHARACTER.to_string();
86 let combined_text_content = pages.join(&page_end).as_bytes().to_vec();
87
88 let index_metadata = ProcessingIndexMetadata {
89 pages: Some(
90 pages
91 .into_iter()
92 .enumerate()
93 .map(|(page, content)| DocumentPage {
94 page: page as u64,
95 content,
96 })
97 .collect(),
98 ),
99 };
100
101 let upload_queue = vec![
102 QueuedUpload::new(
103 mime::IMAGE_JPEG,
104 GeneratedFileType::CoverPage,
105 generated.cover_page_jpeg.into(),
106 ),
107 QueuedUpload::new(
108 mime::IMAGE_JPEG,
109 GeneratedFileType::LargeThumbnail,
110 generated.large_thumbnail_jpeg.into(),
111 ),
112 QueuedUpload::new(
113 mime::IMAGE_JPEG,
114 GeneratedFileType::SmallThumbnail,
115 generated.thumbnail_jpeg.into(),
116 ),
117 QueuedUpload::new(
118 mime::TEXT_PLAIN,
119 GeneratedFileType::TextContent,
120 combined_text_content.into(),
121 ),
122 ];
123
124 Ok(ProcessingOutput {
125 encrypted: false,
126 additional_files: Default::default(),
127 index_metadata: Some(index_metadata),
128 upload_queue,
129 })
130}
131
132#[inline]
133pub fn is_pdf_file(mime: &Mime) -> bool {
134 if mime.eq(&mime::APPLICATION_PDF) {
135 return true;
136 }
137
138 if mime.type_() == mime::APPLICATION && mime.subtype().as_str() == "x-pdf" {
140 return true;
141 }
142
143 false
144}
145
146async fn render_pdf_cover(pdf_info: &PdfInfo, pdf: &[u8]) -> anyhow::Result<DynamicImage> {
148 let args = RenderArgs::default();
149
150 let page = render_single_page(pdf, pdf_info, OutputFormat::Jpeg, 1, &args)
152 .await
153 .context("failed to render pdf page")?;
154
155 Ok(page)
156}
157
/// JPEG image variants generated from the cover page of a PDF.
pub struct GeneratedPdfImages {
    /// Cover page encoded as JPEG at its rendered size
    pub cover_page_jpeg: Vec<u8>,
    /// Small thumbnail of the cover page encoded as JPEG
    pub thumbnail_jpeg: Vec<u8>,
    /// Larger preview thumbnail of the cover page encoded as JPEG
    pub large_thumbnail_jpeg: Vec<u8>,
}
167
168async fn generate_pdf_images_async(
169 pdf_info: &PdfInfo,
170 pdf: &[u8],
171) -> anyhow::Result<GeneratedPdfImages> {
172 tracing::debug!("rendering pdf cover");
173 let page = render_pdf_cover(pdf_info, pdf).await?;
174
175 tracing::debug!("rendering pdf image variants");
176 tokio::task::spawn_blocking(move || generate_pdf_images_variants(page))
177 .await
178 .context("failed to process image preview")
179 .and_then(|value| value)
180}
181
182fn generate_pdf_images_variants(cover_page: DynamicImage) -> anyhow::Result<GeneratedPdfImages> {
184 tracing::debug!("rendering pdf image variants");
185 let cover_page_jpeg = create_img_bytes(&cover_page, ImageFormat::Jpeg)?;
186
187 let thumbnail_jpeg = {
188 let thumbnail = cover_page.thumbnail(64, 64);
189 create_img_bytes(&thumbnail, ImageFormat::Jpeg)?
190 };
191
192 let large_thumbnail_jpeg = {
193 let cover_page_preview = cover_page.resize(512, 512, image::imageops::FilterType::Triangle);
194 create_img_bytes(&cover_page_preview, ImageFormat::Jpeg)?
195 };
196
197 Ok(GeneratedPdfImages {
198 cover_page_jpeg,
199 thumbnail_jpeg,
200 large_thumbnail_jpeg,
201 })
202}