// docbox-processing 0.7.2
//
// Docbox file processing logic
// Documentation
use crate::{
    ProcessingError, ProcessingIndexMetadata, ProcessingOutput, QueuedUpload,
    image::create_img_bytes,
};
use docbox_database::models::generated_file::GeneratedFileType;
use docbox_search::models::DocumentPage;
use futures::TryFutureExt;
use image::{DynamicImage, ImageError, ImageFormat, ImageResult};
use mime::Mime;
use pdf_process::{
    OutputFormat, PdfInfo, PdfInfoArgs, PdfInfoError, PdfRenderError, PdfTextArgs, RenderArgs,
    pdf_info, render_single_page, text::PAGE_END_CHARACTER, text_all_pages_split,
};
use std::str::Split;
use thiserror::Error;
use tokio::task::JoinError;

/// JPEG encoded images generated from the first page of a PDF
#[derive(Debug, Clone)]
pub struct GeneratedPdfImages {
    /// Rendered full sized first page
    pub cover_page_jpeg: Vec<u8>,
    /// Small file thumbnail scaled to fit within 64x64
    pub thumbnail_jpeg: Vec<u8>,
    /// Smaller version of the first page scaled to fit within 512x512
    /// (Not actually 512x512, fits whatever the image aspect ratio is inside those dimensions)
    pub large_thumbnail_jpeg: Vec<u8>,
}

/// Errors that can occur while generating the cover page image
/// and thumbnail variants for a PDF
///
/// Every variant wraps its underlying error through `#[from]`, so the
/// `?` operator converts those errors into this type automatically
#[derive(Debug, Error)]
pub enum GeneratePdfImagesError {
    /// PDF rendering error
    #[error("failed to render pdf: {0}")]
    PdfRender(#[from] PdfRenderError),

    /// Image processing error
    #[error("error processing image: {0}")]
    ImageError(#[from] ImageError),

    /// Failed to join the image processing thread output
    ///
    /// The message does not include the join error itself, the
    /// [JoinError] is only carried as the wrapped value
    #[error("error waiting for image processing")]
    Threading(#[from] JoinError),
}

/// Processes a PDF file producing index data and generated files
///
/// Extracts the text of every page from the PDF and creates multiple preview
/// images of the first page at various sizes. The per-page text is returned as
/// index metadata and the images plus the combined text content are queued
/// for upload.
///
/// Encrypted PDF files are not processed, the returned output only has
/// `encrypted` set. PDF files without any pages produce a default (empty) output.
///
/// # Errors
///
/// - [ProcessingError::MalformedFile] when the file is not a PDF or the page
///   count could not be determined
/// - [ProcessingError::ReadPdfInfo] when loading the PDF information fails
///   for any other reason
/// - [ProcessingError::ExtractFileText] when the text extraction fails
/// - [ProcessingError::GeneratePdfThumbnail] when rendering the cover page or
///   creating its variants fails
pub async fn process_pdf(file_bytes: &[u8]) -> Result<ProcessingOutput, ProcessingError> {
    let pdf_info_args = PdfInfoArgs::default();

    // Load the pdf information
    let pdf_info = match pdf_info(file_bytes, &pdf_info_args).await {
        Ok(value) => value,
        // Skip processing encrypted pdf files
        Err(PdfInfoError::PdfEncrypted) => {
            return Ok(ProcessingOutput {
                encrypted: true,
                ..Default::default()
            });
        }
        // Handle invalid file
        Err(PdfInfoError::NotPdfFile) => {
            return Err(ProcessingError::MalformedFile(
                "file was not a pdf file".to_string(),
            ));
        }

        // Handle other errors
        Err(error) => {
            tracing::error!(?error, "failed to get pdf file info");
            return Err(ProcessingError::ReadPdfInfo(error));
        }
    };

    // The page count can be missing entirely or fail to parse as a number,
    // both cases are treated as a malformed file
    let page_count = pdf_info
        .pages()
        .ok_or_else(|| {
            ProcessingError::MalformedFile("failed to determine page count".to_string())
        })?
        .map_err(|err| {
            ProcessingError::MalformedFile(format!(
                "failed to convert pages number to integer: {err}"
            ))
        })?;

    // For processing the pdf file must have minimum 1 page
    if page_count < 1 {
        tracing::debug!("skipping processing on pdf with no pages");
        return Ok(ProcessingOutput::default());
    }

    tracing::debug!("generating file thumbnails & extracting text content");

    let text_args = PdfTextArgs::default();

    // Extract pdf text
    let pages_text_future = text_all_pages_split(file_bytes, &text_args)
        // Match outer result type with inner type
        .map_err(ProcessingError::ExtractFileText);

    // Generate pdf thumbnails
    let thumbnail_future = generate_pdf_images_async(&pdf_info, file_bytes)
        .map_err(ProcessingError::GeneratePdfThumbnail);

    // Run the text extraction and thumbnail generation concurrently,
    // failing as soon as either of them fails
    let (pages, generated) = tokio::try_join!(pages_text_future, thumbnail_future)?;

    // Create a combined text content using the PDF page end character.
    // The joined string is consumed directly to avoid copying the whole text again
    let page_end = PAGE_END_CHARACTER.to_string();
    let combined_text_content = pages.join(&page_end).into_bytes();

    let index_metadata = ProcessingIndexMetadata {
        pages: Some(
            pages
                .into_iter()
                .enumerate()
                // Page numbers stored in the index are zero based
                .map(|(page, content)| DocumentPage {
                    page: page as u64,
                    content,
                })
                .collect(),
        ),
    };

    let upload_queue = vec![
        QueuedUpload::new(
            mime::IMAGE_JPEG,
            GeneratedFileType::CoverPage,
            generated.cover_page_jpeg.into(),
        ),
        QueuedUpload::new(
            mime::IMAGE_JPEG,
            GeneratedFileType::LargeThumbnail,
            generated.large_thumbnail_jpeg.into(),
        ),
        QueuedUpload::new(
            mime::IMAGE_JPEG,
            GeneratedFileType::SmallThumbnail,
            generated.thumbnail_jpeg.into(),
        ),
        QueuedUpload::new(
            mime::TEXT_PLAIN,
            GeneratedFileType::TextContent,
            combined_text_content.into(),
        ),
    ];

    Ok(ProcessingOutput {
        encrypted: false,
        additional_files: Default::default(),
        index_metadata: Some(index_metadata),
        upload_queue,
    })
}

/// Check if the provided mime type is for a PDF
///
/// Matches both `application/pdf` and the legacy `application/x-pdf`.
/// Only the type and subtype are compared, so any parameters attached to the
/// mime type (such as `application/pdf; charset=binary`) do not affect the
/// result for either variant.
#[inline]
pub fn is_pdf_file(mime: &Mime) -> bool {
    // Some outdated clients use application/x-pdf for pdf
    mime.type_() == mime::APPLICATION && matches!(mime.subtype().as_str(), "pdf" | "x-pdf")
}

/// Renders the cover of a PDF file as an image
///
/// Page 1 of the PDF is used as the cover and it is rendered
/// using the default render arguments
async fn render_pdf_cover(pdf_info: &PdfInfo, pdf: &[u8]) -> Result<DynamicImage, PdfRenderError> {
    let render_args = RenderArgs::default();
    let rendering = render_single_page(pdf, pdf_info, OutputFormat::Jpeg, 1, &render_args);

    Ok(rendering.await?)
}

/// Renders the PDF cover page then produces the encoded image variants from it
///
/// Fails with [GeneratePdfImagesError] if either the cover page rendering
/// or the creation of the variants fails
async fn generate_pdf_images_async(
    pdf_info: &PdfInfo,
    pdf: &[u8],
) -> Result<GeneratedPdfImages, GeneratePdfImagesError> {
    tracing::debug!("rendering pdf cover");
    let cover_page = render_pdf_cover(pdf_info, pdf).await?;

    tracing::debug!("rendering pdf image variants");
    generate_pdf_images_variants_async(cover_page).await
}

/// Runs [generate_pdf_images_variants] on the blocking thread pool
///
/// The first `?` surfaces a failure to join the blocking task, the second
/// surfaces an error from the image processing itself
async fn generate_pdf_images_variants_async(
    cover_page: DynamicImage,
) -> Result<GeneratedPdfImages, GeneratePdfImagesError> {
    let task = tokio::task::spawn_blocking(move || generate_pdf_images_variants(cover_page));
    let variants = task.await??;

    Ok(variants)
}

/// Encodes the rendered cover page as JPEG at its full size along with
/// two scaled versions (bounded by 64x64 and 512x512)
///
/// Returns the first image error encountered while encoding
fn generate_pdf_images_variants(cover_page: DynamicImage) -> ImageResult<GeneratedPdfImages> {
    tracing::debug!("rendering pdf image variants");

    // Full sized cover page
    let cover_page_jpeg = create_img_bytes(&cover_page, ImageFormat::Jpeg)?;

    // Small thumbnail bounded by 64x64
    let thumbnail_jpeg = create_img_bytes(&cover_page.thumbnail(64, 64), ImageFormat::Jpeg)?;

    // Larger preview bounded by 512x512
    let large_thumbnail_jpeg = create_img_bytes(
        &cover_page.resize(512, 512, image::imageops::FilterType::Triangle),
        ImageFormat::Jpeg,
    )?;

    let images = GeneratedPdfImages {
        cover_page_jpeg,
        thumbnail_jpeg,
        large_thumbnail_jpeg,
    };

    Ok(images)
}

/// Splits the combined text content of a PDF back into its pages
///
/// Returns an iterator over the text between each occurrence
/// of the page end character
pub fn split_pdf_text_pages(text: &str) -> Split<'_, char> {
    str::split(text, PAGE_END_CHARACTER)
}