1#![forbid(unsafe_code)]
2
3use std::{num::ParseIntError, str::FromStr, time::Duration};
4
5use crate::{
6 email::{EmailProcessingError, is_mail_mime, process_email},
7 image::process_image_async,
8 office::{PdfConvertError, process_office},
9 pdf::{GeneratePdfImagesError, process_pdf},
10};
11use ::image::{ImageError, ImageFormat};
12use bytes::Bytes;
13use docbox_database::models::{file::FileId, generated_file::GeneratedFileType};
14use docbox_search::models::DocumentPage;
15use mime::Mime;
16use office::OfficeProcessingLayer;
17use pdf::is_pdf_file;
18use pdf_process::{PdfInfoError, PdfTextError};
19use serde::{Deserialize, Serialize};
20use thiserror::Error;
21use tokio::task::JoinError;
22use utoipa::ToSchema;
23
24pub mod email;
25pub mod html_to_text;
26pub mod image;
27pub mod office;
28pub mod pdf;
29
30#[derive(Debug, Error)]
31pub enum ProcessingError {
32 #[error("file is invalid or malformed: {0}")]
34 MalformedFile(String),
35
36 #[error("failed to convert file")]
38 ConvertFile(#[from] PdfConvertError),
39
40 #[error("failed to read pdf info")]
42 ReadPdfInfo(PdfInfoError),
43
44 #[error("failed to extract pdf file text: {0}")]
46 ExtractFileText(PdfTextError),
47
48 #[error("failed to decode image file: {0}")]
50 DecodeImage(ImageError),
51
52 #[error("failed to generate file thumbnail: {0}")]
54 GenerateThumbnail(ImageError),
55
56 #[error("failed to generate pdf file thumbnail: {0}")]
58 GeneratePdfThumbnail(GeneratePdfImagesError),
59
60 #[error("failed to process email file: {0}")]
62 Email(#[from] EmailProcessingError),
63
64 #[error("error waiting for image processing")]
66 Threading(#[from] JoinError),
67}
68
69#[derive(Debug, Default, Clone, Deserialize, Serialize, ToSchema)]
70#[serde(default)]
71pub struct ProcessingConfig {
72 pub email: Option<EmailProcessingConfig>,
74
75 pub max_unpack_iterations: Option<usize>,
81}
82
83#[derive(Debug, Default, Clone, Deserialize, Serialize, ToSchema)]
84#[serde(default)]
85pub struct EmailProcessingConfig {
86 pub skip_attachments: Option<bool>,
88}
89
90#[derive(Debug)]
91pub struct QueuedUpload {
92 pub mime: Mime,
93 pub ty: GeneratedFileType,
94 pub bytes: Bytes,
95}
96
97impl QueuedUpload {
98 pub fn new(mime: Mime, ty: GeneratedFileType, bytes: Bytes) -> Self {
99 Self { mime, ty, bytes }
100 }
101}
102
103#[derive(Debug)]
106pub struct AdditionalProcessingFile {
107 pub fixed_id: Option<FileId>,
109 pub name: String,
111 pub mime: Mime,
113 pub bytes: Bytes,
115}
116
117#[derive(Debug, Default)]
118pub struct ProcessingOutput {
119 pub upload_queue: Vec<QueuedUpload>,
121
122 pub additional_files: Vec<AdditionalProcessingFile>,
125
126 pub index_metadata: Option<ProcessingIndexMetadata>,
128
129 pub encrypted: bool,
131}
132
133#[derive(Debug, Default)]
134pub struct ProcessingIndexMetadata {
135 pub pages: Option<Vec<DocumentPage>>,
137}
138
139#[derive(Clone)]
140pub struct ProcessingLayer {
141 pub office: OfficeProcessingLayer,
142 pub config: ProcessingLayerConfig,
143}
144
145#[derive(Debug, Default, Deserialize, Serialize, Clone)]
146pub struct ProcessingLayerConfig {
147 pub max_unpack_iterations: Option<usize>,
157
158 pub process_timeout: Option<Duration>,
164}
165
/// Default processing timeout: 5 minutes (300 seconds).
pub const DEFAULT_PROCESS_TIMEOUT: Duration = Duration::from_secs(5 * 60);
167
168#[derive(Debug, Error)]
169pub enum ProcessingLayerConfigError {
170 #[error("invalid DOCBOX_MAX_FILE_UNPACK_ITERATIONS value must be a number")]
172 InvalidMaxIterations(ParseIntError),
173 #[error("DOCBOX_FILE_PROCESSING_TIMEOUT must be a number in seconds")]
175 InvalidProcessTimeout(<u64 as FromStr>::Err),
176}
177
178impl ProcessingLayerConfig {
179 pub fn from_env() -> Result<ProcessingLayerConfig, ProcessingLayerConfigError> {
180 let max_unpack_iterations = std::env::var("DOCBOX_MAX_FILE_UNPACK_ITERATIONS")
181 .ok()
182 .map(|value| {
183 value
184 .parse::<usize>()
185 .map_err(ProcessingLayerConfigError::InvalidMaxIterations)
186 })
187 .transpose()?;
188
189 let process_timeout = std::env::var("DOCBOX_FILE_PROCESSING_TIMEOUT")
190 .ok()
191 .map(|process_timeout| {
192 process_timeout
193 .parse::<u64>()
194 .map_err(ProcessingLayerConfigError::InvalidProcessTimeout)
195 .map(Duration::from_secs)
196 })
197 .transpose()?;
198
199 Ok(ProcessingLayerConfig {
200 max_unpack_iterations,
201 process_timeout,
202 })
203 }
204}
205
206pub async fn process_file(
214 config: &Option<ProcessingConfig>,
215 layer: &ProcessingLayer,
216 bytes: Bytes,
217 mime: &Mime,
218) -> Result<Option<ProcessingOutput>, ProcessingError> {
219 if is_pdf_file(mime) {
221 tracing::debug!("processing pdf file");
222
223 let output = process_pdf(&bytes).await?;
224 Ok(Some(output))
225 }
226 else if layer.office.converter.is_convertable(mime) {
228 tracing::debug!("processing office compatible file");
229
230 let output = process_office(&layer.office, bytes).await?;
231 Ok(Some(output))
232 }
233 else if is_mail_mime(mime) {
235 tracing::debug!("processing email file");
236
237 let output = process_email(config, &bytes)?;
238 Ok(Some(output))
239 }
240 else if let Some(image_format) = ImageFormat::from_mime_type(mime) {
242 tracing::debug!("processing image file");
243
244 let output = process_image_async(bytes, image_format).await?;
245 Ok(Some(output))
246 }
247 else {
249 Ok(None)
250 }
251}