docbox_core/processing/
mod.rs

1use crate::{
2    files::{generated::QueuedUpload, upload_file::ProcessingConfig},
3    processing::{
4        email::{is_mail_mime, process_email},
5        image::process_image_async,
6        office::{PdfConvertError, process_office},
7        pdf::process_pdf,
8    },
9};
10use ::image::{ImageError, ImageFormat};
11use bytes::Bytes;
12use docbox_database::models::file::FileId;
13use docbox_search::models::DocumentPage;
14use mime::Mime;
15use office::OfficeProcessingLayer;
16use pdf::is_pdf_file;
17use pdf_process::{PdfInfoError, PdfTextError};
18use thiserror::Error;
19use tokio::task::JoinError;
20
21pub mod email;
22pub mod html_to_text;
23pub mod image;
24pub mod office;
25pub mod pdf;
26
/// Errors that can occur while processing a file
///
/// Only [`PdfConvertError`] and [`JoinError`] have `From` conversions
/// (via `#[from]`), the remaining variants must be constructed explicitly
#[derive(Debug, Error)]
pub enum ProcessingError {
    /// Uploaded file is malformed or unprocessable, the contained
    /// string is included in the error message
    #[error("file is invalid or malformed: {0}")]
    MalformedFile(String),

    /// Internal server error, carries no further detail
    #[error("internal server error")]
    InternalServerError,

    /// Failed to convert file to pdf
    #[error("failed to convert file: {0}")]
    ConvertFile(#[from] PdfConvertError),

    /// Failed to read info about pdf file
    #[error("failed to read pdf info: {0}")]
    ReadPdfInfo(PdfInfoError),

    /// Failed to extract text from pdf file
    #[error("failed to extract pdf file text: {0}")]
    ExtractFileText(PdfTextError),

    /// Failed to decode an image to generate thumbnails
    #[error("failed to decode image file: {0}")]
    DecodeImage(ImageError),

    /// Failed to generate thumbnail from pdf file
    #[error("failed to generate file thumbnail: {0}")]
    GenerateThumbnail(anyhow::Error),

    /// Failed to join the image processing thread output, the
    /// underlying [`JoinError`] is not part of the display message
    #[error("error waiting for image processing")]
    Threading(#[from] JoinError),
}
61
/// Represents a file that should be created and processed as the
/// output of processing another file
#[derive(Debug)]
pub struct AdditionalProcessingFile {
    /// Optional fixed ID to use for the processed file output
    pub fixed_id: Option<FileId>,
    /// Name of the file
    pub name: String,
    /// Mime type of the file to process
    pub mime: Mime,
    /// Byte contents of the file
    pub bytes: Bytes,
}
75
/// Output generated from processing a file
///
/// The [`Default`] value has empty queues, no index metadata and
/// is not marked as encrypted
#[derive(Debug, Default)]
pub struct ProcessingOutput {
    /// Files that are waiting to be uploaded to S3
    pub upload_queue: Vec<QueuedUpload>,

    /// Collection of additional files that also need to be
    /// processed
    pub additional_files: Vec<AdditionalProcessingFile>,

    /// Data that should be persisted to the search index
    pub index_metadata: Option<ProcessingIndexMetadata>,

    /// Whether the file has been detected as encrypted
    pub encrypted: bool,
}
91
/// Metadata extracted while processing a file that is intended
/// for the search index
#[derive(Debug, Default)]
pub struct ProcessingIndexMetadata {
    /// Optional page text metadata extracted from the file
    pub pages: Option<Vec<DocumentPage>>,
}
97
/// Collection of the layers required for processing files,
/// currently only the office layer
#[derive(Clone)]
pub struct ProcessingLayer {
    /// Layer for office files, provides the converter used to decide
    /// whether a file can be converted
    pub office: OfficeProcessingLayer,
}
102
103/// Processes a file returning the generated processing output
104///
105/// # Arguments
106/// * `config` - Optional config for processing
107/// * `converter` - Converter for office files
108/// * `file_bytes` - Actual byte contents of the file
109/// * `mime` - Mime type of the file being processed
110pub async fn process_file(
111    config: &Option<ProcessingConfig>,
112    layer: &ProcessingLayer,
113    bytes: Bytes,
114    mime: &Mime,
115) -> Result<Option<ProcessingOutput>, ProcessingError> {
116    // File is a PDF
117    if is_pdf_file(mime) {
118        tracing::debug!("processing pdf file");
119
120        let output = process_pdf(&bytes).await?;
121        Ok(Some(output))
122    }
123    // File can be converted to a PDF then processed
124    else if layer.office.converter.is_convertable(mime) {
125        tracing::debug!("processing office compatible file");
126
127        let output = process_office(&layer.office, bytes).await?;
128        Ok(Some(output))
129    }
130    // File is an email
131    else if is_mail_mime(mime) {
132        tracing::debug!("processing email file");
133
134        let output = process_email(config, &bytes)?;
135        Ok(Some(output))
136    }
137    // Process image files if the file type is known and can be processed
138    else if let Some(image_format) = ImageFormat::from_mime_type(mime) {
139        tracing::debug!("processing image file");
140
141        let output = process_image_async(bytes, image_format).await?;
142        Ok(Some(output))
143    }
144    // No processing for this file type
145    else {
146        return Ok(None);
147    }
148}