docbox_processing/
lib.rs

1#![forbid(unsafe_code)]
2
3use std::{num::ParseIntError, str::FromStr, time::Duration};
4
5use crate::{
6    email::{EmailProcessingError, is_mail_mime, process_email},
7    image::process_image_async,
8    office::{PdfConvertError, process_office},
9    pdf::{GeneratePdfImagesError, process_pdf},
10};
11use ::image::{ImageError, ImageFormat};
12use bytes::Bytes;
13use docbox_database::models::{file::FileId, generated_file::GeneratedFileType};
14use docbox_search::models::DocumentPage;
15use mime::Mime;
16use office::OfficeProcessingLayer;
17use pdf::is_pdf_file;
18use pdf_process::{PdfInfoError, PdfTextError};
19use serde::{Deserialize, Serialize};
20use thiserror::Error;
21use tokio::task::JoinError;
22use utoipa::ToSchema;
23
24pub mod email;
25pub mod html_to_text;
26pub mod image;
27pub mod office;
28pub mod pdf;
29
30#[derive(Debug, Error)]
31pub enum ProcessingError {
32    /// Uploaded file is malformed or unprocessable
33    #[error("file is invalid or malformed: {0}")]
34    MalformedFile(String),
35
36    /// Failed to convert file to pdf
37    #[error("failed to convert file")]
38    ConvertFile(#[from] PdfConvertError),
39
40    /// Failed to read info about pdf file
41    #[error("failed to read pdf info")]
42    ReadPdfInfo(PdfInfoError),
43
44    /// Failed to extract text from pdf file
45    #[error("failed to extract pdf file text: {0}")]
46    ExtractFileText(PdfTextError),
47
48    /// Failed to decode an image to generate thumbnails
49    #[error("failed to decode image file: {0}")]
50    DecodeImage(ImageError),
51
52    /// Failed to generate thumbnail from pdf file
53    #[error("failed to generate file thumbnail: {0}")]
54    GenerateThumbnail(ImageError),
55
56    /// Failed to generate thumbnail from pdf file
57    #[error("failed to generate pdf file thumbnail: {0}")]
58    GeneratePdfThumbnail(GeneratePdfImagesError),
59
60    /// Failed to process an email file
61    #[error("failed to process email file: {0}")]
62    Email(#[from] EmailProcessingError),
63
64    /// Failed to join the image processing thread output
65    #[error("error waiting for image processing")]
66    Threading(#[from] JoinError),
67}
68
69#[derive(Debug, Default, Clone, Deserialize, Serialize, ToSchema)]
70#[serde(default)]
71pub struct ProcessingConfig {
72    /// Email specific processing configuration
73    pub email: Option<EmailProcessingConfig>,
74
75    /// Maximum number of times to unpack a file. When unpacking
76    /// things like email attachments, these are recursively this
77    /// limits the number of nested unpacking that can occur.
78    ///
79    /// Default: 1 (Unpack Only the immediate children)
80    pub max_unpack_iterations: Option<usize>,
81}
82
83#[derive(Debug, Default, Clone, Deserialize, Serialize, ToSchema)]
84#[serde(default)]
85pub struct EmailProcessingConfig {
86    /// Whether to skip extracting attachments when processing an email
87    pub skip_attachments: Option<bool>,
88}
89
90#[derive(Debug)]
91pub struct QueuedUpload {
92    pub mime: Mime,
93    pub ty: GeneratedFileType,
94    pub bytes: Bytes,
95}
96
97impl QueuedUpload {
98    pub fn new(mime: Mime, ty: GeneratedFileType, bytes: Bytes) -> Self {
99        Self { mime, ty, bytes }
100    }
101}
102
103/// Represents a file that should be created and processed as the
104/// output of processing a file
105#[derive(Debug)]
106pub struct AdditionalProcessingFile {
107    /// Specify a fixed ID to use for the processed file output
108    pub fixed_id: Option<FileId>,
109    /// Name of the file
110    pub name: String,
111    /// Mime type of the file to process
112    pub mime: Mime,
113    /// Bytes of the file
114    pub bytes: Bytes,
115}
116
117#[derive(Debug, Default)]
118pub struct ProcessingOutput {
119    /// Files that are waiting to be uploaded to S3
120    pub upload_queue: Vec<QueuedUpload>,
121
122    /// Collection of additional files that also need to be
123    /// processed
124    pub additional_files: Vec<AdditionalProcessingFile>,
125
126    /// Data that should be persisted to the search index
127    pub index_metadata: Option<ProcessingIndexMetadata>,
128
129    /// Whether the file has be detected as encrypted
130    pub encrypted: bool,
131}
132
133#[derive(Debug, Default)]
134pub struct ProcessingIndexMetadata {
135    /// Optional page text metadata extracted from the file
136    pub pages: Option<Vec<DocumentPage>>,
137}
138
139#[derive(Clone)]
140pub struct ProcessingLayer {
141    pub office: OfficeProcessingLayer,
142    pub config: ProcessingLayerConfig,
143}
144
145#[derive(Debug, Default, Deserialize, Serialize, Clone)]
146pub struct ProcessingLayerConfig {
147    /// Maximum number of times to unpack a file. When unpacking
148    /// things like email attachments, these are recursively this
149    /// limits the number of nested unpacking that can occur.
150    ///
151    /// This is the maximum allowed iterations on the server level,
152    /// requests can specify a custom amount but this amount is
153    /// capped to this value
154    ///
155    /// Default: 1 (Unpack Only the immediate children)
156    pub max_unpack_iterations: Option<usize>,
157
158    /// Maximum duration to allow the file processing to run for
159    /// the processing will be terminated if it takes longer than
160    /// this duration to run.
161    ///
162    /// Default: 300s
163    pub process_timeout: Option<Duration>,
164}
165
/// Default processing timeout (5 minutes, i.e. 300 seconds)
pub const DEFAULT_PROCESS_TIMEOUT: Duration = Duration::from_secs(5 * 60);
167
168#[derive(Debug, Error)]
169pub enum ProcessingLayerConfigError {
170    /// Value provided for max unpack iterations was invalid
171    #[error("invalid DOCBOX_MAX_FILE_UNPACK_ITERATIONS value must be a number")]
172    InvalidMaxIterations(ParseIntError),
173    /// Invalid process timeout seconds
174    #[error("DOCBOX_FILE_PROCESSING_TIMEOUT must be a number in seconds")]
175    InvalidProcessTimeout(<u64 as FromStr>::Err),
176}
177
178impl ProcessingLayerConfig {
179    pub fn from_env() -> Result<ProcessingLayerConfig, ProcessingLayerConfigError> {
180        let max_unpack_iterations = std::env::var("DOCBOX_MAX_FILE_UNPACK_ITERATIONS")
181            .ok()
182            .map(|value| {
183                value
184                    .parse::<usize>()
185                    .map_err(ProcessingLayerConfigError::InvalidMaxIterations)
186            })
187            .transpose()?;
188
189        let process_timeout = std::env::var("DOCBOX_FILE_PROCESSING_TIMEOUT")
190            .ok()
191            .map(|process_timeout| {
192                process_timeout
193                    .parse::<u64>()
194                    .map_err(ProcessingLayerConfigError::InvalidProcessTimeout)
195                    .map(Duration::from_secs)
196            })
197            .transpose()?;
198
199        Ok(ProcessingLayerConfig {
200            max_unpack_iterations,
201            process_timeout,
202        })
203    }
204}
205
206/// Processes a file returning the generated processing output
207///
208/// # Arguments
209/// * `config` - Optional config for processing
210/// * `converter` - Converter for office files
211/// * `file_bytes` - Actual byte contents of the file
212/// * `mime` - Mime type of the file being processed
213pub async fn process_file(
214    config: &Option<ProcessingConfig>,
215    layer: &ProcessingLayer,
216    bytes: Bytes,
217    mime: &Mime,
218) -> Result<Option<ProcessingOutput>, ProcessingError> {
219    // File is a PDF
220    if is_pdf_file(mime) {
221        tracing::debug!("processing pdf file");
222
223        let output = process_pdf(&bytes).await?;
224        Ok(Some(output))
225    }
226    // File can be converted to a PDF then processed
227    else if layer.office.converter.is_convertable(mime) {
228        tracing::debug!("processing office compatible file");
229
230        let output = process_office(&layer.office, bytes).await?;
231        Ok(Some(output))
232    }
233    // File is an email
234    else if is_mail_mime(mime) {
235        tracing::debug!("processing email file");
236
237        let output = process_email(config, &bytes)?;
238        Ok(Some(output))
239    }
240    // Process image files if the file type is known and can be processed
241    else if let Some(image_format) = ImageFormat::from_mime_type(mime) {
242        tracing::debug!("processing image file");
243
244        let output = process_image_async(bytes, image_format).await?;
245        Ok(Some(output))
246    }
247    // No processing for this file type
248    else {
249        Ok(None)
250    }
251}