// docbox_core/processing/email.rs

1use crate::{
2    files::{generated::QueuedUpload, upload_file::ProcessingConfig},
3    processing::html_to_text::html_to_text,
4};
5use base64::{Engine, prelude::BASE64_STANDARD};
6use bytes::Bytes;
7use docbox_database::models::generated_file::GeneratedFileType;
8use docbox_search::models::DocumentPage;
9use mail_parser::{
10    Address, MessageParser, MimeHeaders, decoders::html::html_to_text as mail_html_to_text,
11};
12use mime::Mime;
13use serde::{Deserialize, Serialize};
14
15use super::{AdditionalProcessingFile, ProcessingError, ProcessingIndexMetadata, ProcessingOutput};
16
17/// Checks if the provided mime is for an email
18pub fn is_mail_mime(mime: &Mime) -> bool {
19    mime.essence_str() == "message/rfc822"
20}
21
22/// JSON document version of the email metadata, extracts
23#[derive(Debug, Serialize, Deserialize)]
24pub struct EmailMetadataDocument {
25    /// Source of the email
26    pub from: EmailEntity,
27    /// Destination of the email
28    pub to: Vec<EmailEntity>,
29    /// cc'ed emails
30    pub cc: Vec<EmailEntity>,
31    /// bcc'ed emails
32    pub bcc: Vec<EmailEntity>,
33    /// Email subject line
34    pub subject: Option<String>,
35    /// Send date of the email (rfc3339)
36    pub date: Option<String>,
37    /// Optional message ID
38    pub message_id: Option<String>,
39    /// Collection of headers
40    pub headers: Vec<EmailHeader>,
41    /// List of attachments
42    pub attachments: Vec<EmailAttachment>,
43}
44
45#[derive(Debug, Serialize, Deserialize)]
46pub struct EmailAttachment {
47    /// Name of the attachment
48    pub name: String,
49    pub length: usize,
50    pub mime: String,
51}
52
53#[derive(Debug, Clone, Serialize, Deserialize)]
54pub struct EmailHeader {
55    pub name: String,
56    pub value: String,
57}
58
59/// Optional address and name combination, usually at least one part
60/// of this exists, this is used for headers like To, From, ..etc
61#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
62pub struct EmailEntity {
63    pub name: Option<String>,
64    pub address: Option<String>,
65}
66
67/// Turns a [Address] into a collection of email entities
68fn map_email_address(address: Option<&Address<'_>>) -> Vec<EmailEntity> {
69    let address = match address {
70        Some(value) => value,
71        None => return Vec::new(),
72    };
73
74    match address {
75        Address::List(addresses) => addresses
76            .iter()
77            .map(|value| EmailEntity {
78                address: value.address().map(|value| value.to_string()),
79                name: value.name().map(|value| value.to_string()),
80            })
81            .collect(),
82        Address::Group(groups) => groups
83            .iter()
84            .flat_map(|group| group.addresses.iter())
85            .map(|value| EmailEntity {
86                address: value.address().map(|value| value.to_string()),
87                name: value.name().map(|value| value.to_string()),
88            })
89            .collect(),
90    }
91}
92
93pub fn process_email(
94    config: &Option<ProcessingConfig>,
95    file_bytes: &[u8],
96) -> Result<ProcessingOutput, ProcessingError> {
97    let is_allowed_attachments = config
98        .as_ref()
99        // Config is nothing or
100        .is_none_or(|config| {
101            // Email config is nothing or
102            config
103                .email
104                .as_ref()
105                // Skip attachments is specified and true
106                .is_none_or(|email| email.skip_attachments.is_none_or(|value| !value))
107        });
108
109    let parser = MessageParser::default();
110    let message = match parser.parse(file_bytes) {
111        Some(value) => value,
112        None => {
113            // Nothing could be extracted from the file
114            return Ok(ProcessingOutput::default());
115        }
116    };
117
118    let from = map_email_address(message.from());
119
120    let from = from
121        .first()
122        // Email must have at least one sender
123        .ok_or_else(|| {
124            ProcessingError::MalformedFile("email must have at least one sender".to_string())
125        })?
126        .clone();
127
128    let to = map_email_address(message.to());
129    let cc = map_email_address(message.cc());
130    let bcc = map_email_address(message.bcc());
131
132    let subject = message.subject().map(|value| value.to_string());
133    let date = message
134        .date()
135        // Turn the date into an ISO date
136        .map(|value| value.to_rfc3339());
137    let message_id = message.message_id().map(|value| value.to_string());
138
139    let headers: Vec<_> = message
140        .headers_raw()
141        .map(|(name, value)| EmailHeader {
142            name: name.to_string(),
143            value: value.to_string(),
144        })
145        .collect();
146
147    let mut attachments: Vec<EmailAttachment> = Vec::new();
148    let mut additional_files: Vec<AdditionalProcessingFile> = Vec::new();
149
150    let text_body = message
151        .text_bodies()
152        .next()
153        .and_then(|body| body.text_contents());
154
155    // Get the HTML body
156    let mut html_body = message
157        .html_bodies()
158        .next()
159        .filter(|body| body.is_text_html())
160        .and_then(|body| body.text_contents())
161        .map(|value| value.to_string());
162
163    let text_content = match (text_body.as_ref(), html_body.as_ref()) {
164        // Clean the text content removing any HTML
165        (Some(value), _) => {
166            Some(
167                html_to_text(value)
168                    // Fallback to the email html_to_text on failure (it is infallible)
169                    .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
170            )
171        }
172        // Attempt extracting text content from the HTMl
173        (_, Some(value)) => {
174            Some(
175                html_to_text(value)
176                    // Fallback to the email html_to_text on failure (it is infallible)
177                    .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
178            )
179        }
180        _ => None,
181    };
182
183    for attachment in message.attachments() {
184        let name = match attachment.attachment_name().map(|value| value.to_string()) {
185            Some(value) => value,
186            None => {
187                tracing::warn!("ignoring attachment without name");
188                continue;
189            }
190        };
191
192        let length = attachment.len();
193        let raw_mime = match attachment
194            .content_type()
195            .map(|value| match value.subtype() {
196                Some(subtype) => format!("{}/{}", value.c_type, subtype),
197                None => format!("{}", value.c_type),
198            }) {
199            Some(value) => value,
200            None => {
201                tracing::warn!(?name, ?length, "ignoring attachment without mime type");
202                continue;
203            }
204        };
205
206        let mime: Mime = match raw_mime.parse() {
207            Ok(value) => value,
208            Err(cause) => {
209                tracing::error!(?cause, ?raw_mime, "invalid email attachment file mime type");
210                continue;
211            }
212        };
213
214        let is_inline = attachment
215            .content_disposition()
216            .is_some_and(|value| value.is_inline());
217
218        // For inline attachments with a content_id we inline them as base64 strings
219        // directly into the email content
220        if let (true, Some(content_id), Some(html_body)) =
221            (is_inline, attachment.content_id(), html_body.as_mut())
222        {
223            // Create a data URL for the content
224            let data = attachment.contents();
225            let base64_data = BASE64_STANDARD.encode(data);
226            let data_uri = format!("data:{raw_mime};base64,{base64_data}");
227
228            let key = format!("cid:{content_id}");
229
230            // Replace usages of the CID with the inline variant
231            let new_body = html_body.replace(&key, &data_uri);
232            *html_body = new_body;
233            continue;
234        }
235
236        attachments.push(EmailAttachment {
237            name: name.clone(),
238            length,
239            mime: raw_mime,
240        });
241
242        // Capture attachments if allowed
243        if is_allowed_attachments {
244            let bytes = attachment.contents();
245            let bytes = Bytes::copy_from_slice(bytes);
246            additional_files.push(AdditionalProcessingFile {
247                fixed_id: None,
248                name,
249                mime,
250                bytes,
251            });
252        }
253    }
254
255    let document = EmailMetadataDocument {
256        from,
257        to,
258        cc,
259        bcc,
260        subject,
261        date,
262        message_id,
263        headers,
264        attachments,
265    };
266
267    let metadata_bytes = match serde_json::to_vec(&document) {
268        Ok(value) => value,
269        Err(cause) => {
270            tracing::error!(?cause, "failed to serialize email json metadata document");
271            return Err(ProcessingError::InternalServerError);
272        }
273    };
274
275    let pages = text_content.as_ref().map(|value| {
276        vec![DocumentPage {
277            content: value.to_string(),
278            page: 0,
279        }]
280    });
281
282    let index_metadata = ProcessingIndexMetadata { pages };
283    let mut upload_queue = vec![QueuedUpload::new(
284        mime::APPLICATION_JSON,
285        GeneratedFileType::Metadata,
286        metadata_bytes.into(),
287    )];
288
289    if let Some(html_body) = html_body {
290        upload_queue.push(QueuedUpload::new(
291            mime::TEXT_HTML,
292            GeneratedFileType::HtmlContent,
293            html_body.into_bytes().into(),
294        ));
295    }
296
297    if let Some(text_body) = text_content {
298        upload_queue.push(QueuedUpload::new(
299            mime::TEXT_PLAIN,
300            GeneratedFileType::TextContent,
301            text_body.into_bytes().into(),
302        ));
303    }
304
305    Ok(ProcessingOutput {
306        encrypted: false,
307        additional_files,
308        index_metadata: Some(index_metadata),
309        upload_queue,
310    })
311}