#![forbid(unsafe_code)]
use std::{num::ParseIntError, str::FromStr, time::Duration};
use crate::{
email::{EmailProcessingError, is_mail_mime, process_email},
image::process_image_async,
office::{PdfConvertError, process_office},
pdf::{GeneratePdfImagesError, process_pdf},
};
use ::image::{ImageError, ImageFormat};
use bytes::Bytes;
use docbox_database::models::{file::FileId, generated_file::GeneratedFileType};
use docbox_search::models::DocumentPage;
use mime::Mime;
use office::OfficeProcessingLayer;
use pdf::is_pdf_file;
use pdf_process::{PdfInfoError, PdfTextError};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tokio::task::JoinError;
use utoipa::ToSchema;
pub mod email;
pub mod html_to_text;
pub mod image;
pub mod office;
pub mod pdf;
#[derive(Debug, Error)]
pub enum ProcessingError {
#[error("file is invalid or malformed: {0}")]
MalformedFile(String),
#[error("failed to convert file")]
ConvertFile(#[from] PdfConvertError),
#[error("failed to read pdf info")]
ReadPdfInfo(PdfInfoError),
#[error("failed to extract pdf file text: {0}")]
ExtractFileText(PdfTextError),
#[error("failed to decode image file: {0}")]
DecodeImage(ImageError),
#[error("failed to generate file thumbnail: {0}")]
GenerateThumbnail(ImageError),
#[error("failed to generate pdf file thumbnail: {0}")]
GeneratePdfThumbnail(GeneratePdfImagesError),
#[error("failed to process email file: {0}")]
Email(#[from] EmailProcessingError),
#[error("error waiting for image processing")]
Threading(#[from] JoinError),
}
/// Optional configuration passed to [`process_file`] for a processing request.
///
/// `#[serde(default)]` means any field missing from the serialized input
/// takes its `Default` value, which is `None` for every field here.
#[derive(Debug, Default, Clone, Deserialize, Serialize, ToSchema)]
#[serde(default)]
pub struct ProcessingConfig {
/// Email specific processing settings, if any were provided.
pub email: Option<EmailProcessingConfig>,
/// Optional limit on unpack iterations.
///
/// NOTE(review): presumably bounds how deep nested files (e.g. attachments
/// inside attachments) are unpacked; the enforcing code is not in this file.
pub max_unpack_iterations: Option<usize>,
}
/// Email specific processing settings, carried in [`ProcessingConfig::email`].
///
/// `#[serde(default)]` makes a missing field deserialize to `None`.
#[derive(Debug, Default, Clone, Deserialize, Serialize, ToSchema)]
#[serde(default)]
pub struct EmailProcessingConfig {
/// Optional flag controlling attachment handling.
///
/// NOTE(review): presumably `Some(true)` skips extracting attachments;
/// confirm against the `email` module, which is where this would be read.
pub skip_attachments: Option<bool>,
}
/// An entry of [`ProcessingOutput::upload_queue`]: a blob of bytes together
/// with its mime type and the kind of generated file it represents.
#[derive(Debug)]
pub struct QueuedUpload {
/// Mime type of `bytes`.
pub mime: Mime,
/// Kind of generated file this upload represents.
pub ty: GeneratedFileType,
/// Raw contents to upload.
pub bytes: Bytes,
}
impl QueuedUpload {
pub fn new(mime: Mime, ty: GeneratedFileType, bytes: Bytes) -> Self {
Self { mime, ty, bytes }
}
}
/// An entry of [`ProcessingOutput::additional_files`]: a named file with its
/// mime type and contents.
///
/// NOTE(review): presumably these are files discovered while processing
/// another file (such as email attachments) that need processing themselves;
/// confirm against the producers in the submodules.
#[derive(Debug)]
pub struct AdditionalProcessingFile {
/// Optional predetermined file ID.
///
/// NOTE(review): presumably `None` lets the consumer assign an ID; verify
/// against the code that consumes this struct.
pub fixed_id: Option<FileId>,
/// Name of the file.
pub name: String,
/// Mime type of `bytes`.
pub mime: Mime,
/// Raw file contents.
pub bytes: Bytes,
}
/// Result of successfully processing a file, as returned by [`process_file`].
///
/// `Default` yields empty queues, no index metadata and `encrypted == false`.
#[derive(Debug, Default)]
pub struct ProcessingOutput {
/// Generated files queued for upload.
pub upload_queue: Vec<QueuedUpload>,
/// Additional files produced alongside the processed file.
pub additional_files: Vec<AdditionalProcessingFile>,
/// Metadata for the search index, when any was produced.
pub index_metadata: Option<ProcessingIndexMetadata>,
/// Encryption flag.
///
/// NOTE(review): presumably set when the source file was found to be
/// encrypted; the code that sets it lives in the submodules.
pub encrypted: bool,
}
/// Search index metadata carried in [`ProcessingOutput::index_metadata`].
#[derive(Debug, Default)]
pub struct ProcessingIndexMetadata {
/// Document pages for the search index, when available.
pub pages: Option<Vec<DocumentPage>>,
}
/// Shared processing state handed to [`process_file`].
#[derive(Clone)]
pub struct ProcessingLayer {
/// Office processing layer; [`process_file`] asks its `converter` whether a
/// mime type is convertible and passes the layer to `process_office`.
pub office: OfficeProcessingLayer,
/// Layer-wide configuration (see [`ProcessingLayerConfig`]).
pub config: ProcessingLayerConfig,
}
/// Configuration for the processing layer.
///
/// Can be loaded from environment variables with
/// [`ProcessingLayerConfig::from_env`]; both fields are `None` when unset.
#[derive(Debug, Default, Deserialize, Serialize, Clone)]
pub struct ProcessingLayerConfig {
/// Optional limit on unpack iterations; `from_env` reads it from
/// `DOCBOX_MAX_FILE_UNPACK_ITERATIONS`.
pub max_unpack_iterations: Option<usize>,
/// Optional processing timeout; `from_env` reads it from
/// `DOCBOX_FILE_PROCESSING_TIMEOUT` as a whole number of seconds.
pub process_timeout: Option<Duration>,
}
/// Default processing timeout: five minutes (300 seconds).
///
/// NOTE(review): presumably the fallback used when
/// `ProcessingLayerConfig::process_timeout` is `None`; the usage is not
/// visible in this file.
pub const DEFAULT_PROCESS_TIMEOUT: Duration = Duration::from_secs(5 * 60);
#[derive(Debug, Error)]
pub enum ProcessingLayerConfigError {
#[error("invalid DOCBOX_MAX_FILE_UNPACK_ITERATIONS value must be a number")]
InvalidMaxIterations(ParseIntError),
#[error("DOCBOX_FILE_PROCESSING_TIMEOUT must be a number in seconds")]
InvalidProcessTimeout(<u64 as FromStr>::Err),
}
impl ProcessingLayerConfig {
    /// Loads the configuration from environment variables.
    ///
    /// * `DOCBOX_MAX_FILE_UNPACK_ITERATIONS` is parsed as a `usize` into
    ///   `max_unpack_iterations`.
    /// * `DOCBOX_FILE_PROCESSING_TIMEOUT` is parsed as a `u64` number of
    ///   seconds into `process_timeout`.
    ///
    /// A variable that cannot be read (unset or not valid unicode) leaves the
    /// corresponding field as `None`.
    ///
    /// # Errors
    ///
    /// Returns [`ProcessingLayerConfigError::InvalidMaxIterations`] or
    /// [`ProcessingLayerConfigError::InvalidProcessTimeout`] when the
    /// respective variable is set but is not a valid number. The iteration
    /// limit is checked first.
    pub fn from_env() -> Result<ProcessingLayerConfig, ProcessingLayerConfigError> {
        let max_unpack_iterations = match std::env::var("DOCBOX_MAX_FILE_UNPACK_ITERATIONS") {
            Ok(raw) => {
                let iterations = raw
                    .parse::<usize>()
                    .map_err(ProcessingLayerConfigError::InvalidMaxIterations)?;
                Some(iterations)
            }
            // Any read failure is treated the same as the variable being unset
            Err(_) => None,
        };

        let process_timeout = match std::env::var("DOCBOX_FILE_PROCESSING_TIMEOUT") {
            Ok(raw) => {
                let seconds = raw
                    .parse::<u64>()
                    .map_err(ProcessingLayerConfigError::InvalidProcessTimeout)?;
                Some(Duration::from_secs(seconds))
            }
            // Any read failure is treated the same as the variable being unset
            Err(_) => None,
        };

        Ok(Self {
            max_unpack_iterations,
            process_timeout,
        })
    }
}
/// Processes a file according to its mime type.
///
/// The first matching handler wins, checked in this order:
///
/// 1. pdf files (`is_pdf_file`) go to `process_pdf`
/// 2. mime types the office converter reports as convertible go to
///    `process_office`
/// 3. email mime types (`is_mail_mime`) go to `process_email`, which is the
///    only handler that receives `config`
/// 4. mime types that map to an [`ImageFormat`] go to `process_image_async`
///
/// # Returns
///
/// `Ok(Some(output))` when a handler processed the file, `Ok(None)` when no
/// handler applies to `mime`.
///
/// # Errors
///
/// Propagates the error of whichever handler was selected as a
/// [`ProcessingError`].
pub async fn process_file(
    config: &Option<ProcessingConfig>,
    layer: &ProcessingLayer,
    bytes: Bytes,
    mime: &Mime,
) -> Result<Option<ProcessingOutput>, ProcessingError> {
    if is_pdf_file(mime) {
        tracing::debug!("processing pdf file");
        return Ok(Some(process_pdf(&bytes).await?));
    }

    if layer.office.converter.is_convertable(mime) {
        tracing::debug!("processing office compatible file");
        return Ok(Some(process_office(&layer.office, bytes).await?));
    }

    if is_mail_mime(mime) {
        tracing::debug!("processing email file");
        return Ok(Some(process_email(config, &bytes)?));
    }

    match ImageFormat::from_mime_type(mime) {
        Some(image_format) => {
            tracing::debug!("processing image file");
            Ok(Some(process_image_async(bytes, image_format).await?))
        }
        // No handler understands this mime type
        None => Ok(None),
    }
}