docbox_processing/
lib.rs

1#![forbid(unsafe_code)]
2
3use std::num::ParseIntError;
4
5use crate::{
6    email::{EmailProcessingError, is_mail_mime, process_email},
7    image::process_image_async,
8    office::{PdfConvertError, process_office},
9    pdf::{GeneratePdfImagesError, process_pdf},
10};
11use ::image::{ImageError, ImageFormat};
12use bytes::Bytes;
13use docbox_database::models::{file::FileId, generated_file::GeneratedFileType};
14use docbox_search::models::DocumentPage;
15use mime::Mime;
16use office::OfficeProcessingLayer;
17use pdf::is_pdf_file;
18use pdf_process::{PdfInfoError, PdfTextError};
19use serde::{Deserialize, Serialize};
20use thiserror::Error;
21use tokio::task::JoinError;
22use utoipa::ToSchema;
23
24pub mod email;
25pub mod html_to_text;
26pub mod image;
27pub mod office;
28pub mod pdf;
29
30#[derive(Debug, Error)]
31pub enum ProcessingError {
32    /// Uploaded file is malformed or unprocessable
33    #[error("file is invalid or malformed: {0}")]
34    MalformedFile(String),
35
36    /// Failed to convert file to pdf
37    #[error("failed to convert file")]
38    ConvertFile(#[from] PdfConvertError),
39
40    /// Failed to read info about pdf file
41    #[error("failed to read pdf info")]
42    ReadPdfInfo(PdfInfoError),
43
44    /// Failed to extract text from pdf file
45    #[error("failed to extract pdf file text: {0}")]
46    ExtractFileText(PdfTextError),
47
48    /// Failed to decode an image to generate thumbnails
49    #[error("failed to decode image file: {0}")]
50    DecodeImage(ImageError),
51
52    /// Failed to generate thumbnail from pdf file
53    #[error("failed to generate file thumbnail: {0}")]
54    GenerateThumbnail(ImageError),
55
56    /// Failed to generate thumbnail from pdf file
57    #[error("failed to generate pdf file thumbnail: {0}")]
58    GeneratePdfThumbnail(GeneratePdfImagesError),
59
60    /// Failed to process an email file
61    #[error("failed to process email file: {0}")]
62    Email(#[from] EmailProcessingError),
63
64    /// Failed to join the image processing thread output
65    #[error("error waiting for image processing")]
66    Threading(#[from] JoinError),
67}
68
69#[derive(Debug, Default, Clone, Deserialize, Serialize, ToSchema)]
70#[serde(default)]
71pub struct ProcessingConfig {
72    /// Email specific processing configuration
73    pub email: Option<EmailProcessingConfig>,
74
75    /// Maximum number of times to unpack a file. When unpacking
76    /// things like email attachments, these are recursively this
77    /// limits the number of nested unpacking that can occur.
78    ///
79    /// Default: 1 (Unpack Only the immediate children)
80    pub max_unpack_iterations: Option<usize>,
81}
82
83#[derive(Debug, Default, Clone, Deserialize, Serialize, ToSchema)]
84#[serde(default)]
85pub struct EmailProcessingConfig {
86    /// Whether to skip extracting attachments when processing an email
87    pub skip_attachments: Option<bool>,
88}
89
90#[derive(Debug)]
91pub struct QueuedUpload {
92    pub mime: Mime,
93    pub ty: GeneratedFileType,
94    pub bytes: Bytes,
95}
96
97impl QueuedUpload {
98    pub fn new(mime: Mime, ty: GeneratedFileType, bytes: Bytes) -> Self {
99        Self { mime, ty, bytes }
100    }
101}
102
103/// Represents a file that should be created and processed as the
104/// output of processing a file
105#[derive(Debug)]
106pub struct AdditionalProcessingFile {
107    /// Specify a fixed ID to use for the processed file output
108    pub fixed_id: Option<FileId>,
109    /// Name of the file
110    pub name: String,
111    /// Mime type of the file to process
112    pub mime: Mime,
113    /// Bytes of the file
114    pub bytes: Bytes,
115}
116
117#[derive(Debug, Default)]
118pub struct ProcessingOutput {
119    /// Files that are waiting to be uploaded to S3
120    pub upload_queue: Vec<QueuedUpload>,
121
122    /// Collection of additional files that also need to be
123    /// processed
124    pub additional_files: Vec<AdditionalProcessingFile>,
125
126    /// Data that should be persisted to the search index
127    pub index_metadata: Option<ProcessingIndexMetadata>,
128
129    /// Whether the file has be detected as encrypted
130    pub encrypted: bool,
131}
132
133#[derive(Debug, Default)]
134pub struct ProcessingIndexMetadata {
135    /// Optional page text metadata extracted from the file
136    pub pages: Option<Vec<DocumentPage>>,
137}
138
139#[derive(Clone)]
140pub struct ProcessingLayer {
141    pub office: OfficeProcessingLayer,
142    pub config: ProcessingLayerConfig,
143}
144
145#[derive(Debug, Default, Deserialize, Serialize, Clone)]
146pub struct ProcessingLayerConfig {
147    /// Maximum number of times to unpack a file. When unpacking
148    /// things like email attachments, these are recursively this
149    /// limits the number of nested unpacking that can occur.
150    ///
151    /// This is the maximum allowed iterations on the server level,
152    /// requests can specify a custom amount but this amount is
153    /// capped to this value
154    ///
155    /// Default: 1 (Unpack Only the immediate children)
156    pub max_unpack_iterations: Option<usize>,
157}
158
159#[derive(Debug, Error)]
160pub enum ProcessingLayerConfigError {
161    /// Value provided for max unpack iterations was invalid
162    #[error("invalid DOCBOX_MAX_FILE_UNPACK_ITERATIONS value must be a number")]
163    InvalidMaxIterations(ParseIntError),
164}
165
166impl ProcessingLayerConfig {
167    pub fn from_env() -> Result<ProcessingLayerConfig, ProcessingLayerConfigError> {
168        let max_unpack_iterations = std::env::var("DOCBOX_MAX_FILE_UNPACK_ITERATIONS")
169            .ok()
170            .map(|value| {
171                value
172                    .parse::<usize>()
173                    .map_err(ProcessingLayerConfigError::InvalidMaxIterations)
174            })
175            .transpose()?;
176
177        Ok(ProcessingLayerConfig {
178            max_unpack_iterations,
179        })
180    }
181}
182
183/// Processes a file returning the generated processing output
184///
185/// # Arguments
186/// * `config` - Optional config for processing
187/// * `converter` - Converter for office files
188/// * `file_bytes` - Actual byte contents of the file
189/// * `mime` - Mime type of the file being processed
190pub async fn process_file(
191    config: &Option<ProcessingConfig>,
192    layer: &ProcessingLayer,
193    bytes: Bytes,
194    mime: &Mime,
195) -> Result<Option<ProcessingOutput>, ProcessingError> {
196    // File is a PDF
197    if is_pdf_file(mime) {
198        tracing::debug!("processing pdf file");
199
200        let output = process_pdf(&bytes).await?;
201        Ok(Some(output))
202    }
203    // File can be converted to a PDF then processed
204    else if layer.office.converter.is_convertable(mime) {
205        tracing::debug!("processing office compatible file");
206
207        let output = process_office(&layer.office, bytes).await?;
208        Ok(Some(output))
209    }
210    // File is an email
211    else if is_mail_mime(mime) {
212        tracing::debug!("processing email file");
213
214        let output = process_email(config, &bytes)?;
215        Ok(Some(output))
216    }
217    // Process image files if the file type is known and can be processed
218    else if let Some(image_format) = ImageFormat::from_mime_type(mime) {
219        tracing::debug!("processing image file");
220
221        let output = process_image_async(bytes, image_format).await?;
222        Ok(Some(output))
223    }
224    // No processing for this file type
225    else {
226        Ok(None)
227    }
228}