use crate::{
ProcessingError, ProcessingIndexMetadata, ProcessingOutput, QueuedUpload,
image::create_img_bytes,
};
use docbox_database::models::generated_file::GeneratedFileType;
use docbox_search::models::DocumentPage;
use futures::TryFutureExt;
use image::{DynamicImage, ImageError, ImageFormat, ImageResult};
use mime::Mime;
use pdf_process::{
OutputFormat, PdfInfo, PdfInfoArgs, PdfInfoError, PdfRenderError, PdfTextArgs, RenderArgs,
pdf_info, render_single_page, text::PAGE_END_CHARACTER, text_all_pages_split,
};
use std::str::Split;
use thiserror::Error;
use tokio::task::JoinError;
/// JPEG encoded images generated from the rendered cover page of a PDF
pub struct GeneratedPdfImages {
/// JPEG encoding of the rendered cover page itself
pub cover_page_jpeg: Vec<u8>,
/// JPEG encoding of the cover page after `DynamicImage::thumbnail(64, 64)`
pub thumbnail_jpeg: Vec<u8>,
/// JPEG encoding of the cover page after `DynamicImage::resize(512, 512, ..)`
/// using the triangle filter
pub large_thumbnail_jpeg: Vec<u8>,
}
/// Errors that can occur while generating the preview images for a PDF
#[derive(Debug, Error)]
pub enum GeneratePdfImagesError {
/// Rendering the PDF page into an image failed
#[error("failed to render pdf: {0}")]
PdfRender(#[from] PdfRenderError),
/// Resizing or encoding one of the image variants failed
#[error("error processing image: {0}")]
ImageError(#[from] ImageError),
/// The blocking task performing the image processing could not be joined
#[error("error waiting for image processing")]
Threading(#[from] JoinError),
}
pub async fn process_pdf(file_bytes: &[u8]) -> Result<ProcessingOutput, ProcessingError> {
let pdf_info_args = PdfInfoArgs::default();
let pdf_info = match pdf_info(file_bytes, &pdf_info_args).await {
Ok(value) => value,
Err(PdfInfoError::PdfEncrypted) => {
return Ok(ProcessingOutput {
encrypted: true,
..Default::default()
});
}
Err(PdfInfoError::NotPdfFile) => {
return Err(ProcessingError::MalformedFile(
"file was not a pdf file".to_string(),
));
}
Err(error) => {
tracing::error!(?error, "failed to get pdf file info");
return Err(ProcessingError::ReadPdfInfo(error));
}
};
let page_count = pdf_info
.pages()
.ok_or_else(|| {
ProcessingError::MalformedFile("failed to determine page count".to_string())
})?
.map_err(|err| {
ProcessingError::MalformedFile(format!(
"failed to convert pages number to integer: {err}"
))
})?;
if page_count < 1 {
tracing::debug!("skipping processing on pdf with no pages");
return Ok(ProcessingOutput::default());
}
tracing::debug!("generating file thumbnails & extracting text content");
let text_args = PdfTextArgs::default();
let pages_text_future = text_all_pages_split(file_bytes, &text_args)
.map_err(ProcessingError::ExtractFileText);
let thumbnail_future = generate_pdf_images_async(&pdf_info, file_bytes)
.map_err(ProcessingError::GeneratePdfThumbnail);
let (pages, generated) = tokio::try_join!(pages_text_future, thumbnail_future)?;
let page_end = PAGE_END_CHARACTER.to_string();
let combined_text_content = pages.join(&page_end).as_bytes().to_vec();
let index_metadata = ProcessingIndexMetadata {
pages: Some(
pages
.into_iter()
.enumerate()
.map(|(page, content)| DocumentPage {
page: page as u64,
content,
})
.collect(),
),
};
let upload_queue = vec![
QueuedUpload::new(
mime::IMAGE_JPEG,
GeneratedFileType::CoverPage,
generated.cover_page_jpeg.into(),
),
QueuedUpload::new(
mime::IMAGE_JPEG,
GeneratedFileType::LargeThumbnail,
generated.large_thumbnail_jpeg.into(),
),
QueuedUpload::new(
mime::IMAGE_JPEG,
GeneratedFileType::SmallThumbnail,
generated.thumbnail_jpeg.into(),
),
QueuedUpload::new(
mime::TEXT_PLAIN,
GeneratedFileType::TextContent,
combined_text_content.into(),
),
];
Ok(ProcessingOutput {
encrypted: false,
additional_files: Default::default(),
index_metadata: Some(index_metadata),
upload_queue,
})
}
/// Checks whether the provided mime type is one used for PDF files.
///
/// Matches `mime::APPLICATION_PDF` as well as any mime with the
/// `application` type and the `x-pdf` subtype.
#[inline]
pub fn is_pdf_file(mime: &Mime) -> bool {
    *mime == mime::APPLICATION_PDF
        || (mime.type_() == mime::APPLICATION && mime.subtype().as_str() == "x-pdf")
}
/// Renders the cover of the PDF as an image.
///
/// Requests page number `1` (presumably the first page of the document)
/// with the JPEG output format and the default render arguments.
async fn render_pdf_cover(pdf_info: &PdfInfo, pdf: &[u8]) -> Result<DynamicImage, PdfRenderError> {
    let cover = render_single_page(
        pdf,
        pdf_info,
        OutputFormat::Jpeg,
        1,
        &RenderArgs::default(),
    )
    .await?;

    Ok(cover)
}
/// Renders the cover page of the PDF and then generates the JPEG image
/// variants (cover page, thumbnail, large thumbnail) from that render.
///
/// # Errors
///
/// Fails if the cover page cannot be rendered or if generating any of the
/// image variants fails.
async fn generate_pdf_images_async(
    pdf_info: &PdfInfo,
    pdf: &[u8],
) -> Result<GeneratedPdfImages, GeneratePdfImagesError> {
    tracing::debug!("rendering pdf cover");
    let cover_page = render_pdf_cover(pdf_info, pdf).await?;

    tracing::debug!("rendering pdf image variants");
    generate_pdf_images_variants_async(cover_page).await
}
/// Generates the image variants for the cover page on a blocking task so
/// the resizing and encoding work does not run on the async executor.
///
/// # Errors
///
/// Fails if the blocking task cannot be joined or if the image processing
/// itself fails.
async fn generate_pdf_images_variants_async(
    cover_page: DynamicImage,
) -> Result<GeneratedPdfImages, GeneratePdfImagesError> {
    let handle = tokio::task::spawn_blocking(move || generate_pdf_images_variants(cover_page));

    // First `?` handles the join failure, second handles the image failure
    Ok(handle.await??)
}
/// Encodes the rendered cover page as JPEG in three variants: the cover
/// page as-is, a `thumbnail(64, 64)` version and a `resize(512, 512, ..)`
/// version scaled using the triangle filter.
///
/// # Errors
///
/// Returns the first error produced while encoding any of the variants.
fn generate_pdf_images_variants(cover_page: DynamicImage) -> ImageResult<GeneratedPdfImages> {
    tracing::debug!("rendering pdf image variants");

    let encode_jpeg = |image: &DynamicImage| create_img_bytes(image, ImageFormat::Jpeg);

    let cover_page_jpeg = encode_jpeg(&cover_page)?;
    let thumbnail_jpeg = encode_jpeg(&cover_page.thumbnail(64, 64))?;
    let large_thumbnail_jpeg =
        encode_jpeg(&cover_page.resize(512, 512, image::imageops::FilterType::Triangle))?;

    Ok(GeneratedPdfImages {
        cover_page_jpeg,
        thumbnail_jpeg,
        large_thumbnail_jpeg,
    })
}
/// Splits combined PDF text content into an iterator over the text of each
/// page, using [`PAGE_END_CHARACTER`] as the delimiter between pages.
pub fn split_pdf_text_pages(text: &str) -> Split<'_, char> {
    str::split(text, PAGE_END_CHARACTER)
}