docbox_processing/
email.rs

1use crate::{ProcessingConfig, QueuedUpload, html_to_text::html_to_text};
2use base64::{Engine, prelude::BASE64_STANDARD};
3use bytes::Bytes;
4use docbox_database::models::generated_file::GeneratedFileType;
5use docbox_search::models::DocumentPage;
6use mail_parser::{
7    Address, MessageParser, MimeHeaders, decoders::html::html_to_text as mail_html_to_text,
8};
9use mime::Mime;
10use serde::{Deserialize, Serialize};
11use thiserror::Error;
12
13use super::{AdditionalProcessingFile, ProcessingError, ProcessingIndexMetadata, ProcessingOutput};
14
15#[derive(Debug, Error)]
16pub enum EmailProcessingError {
17    /// Email is missing at least one From: header
18    #[error("email must have at least one sender")]
19    MissingSender,
20
21    /// JSON serialization failure for metadata storage
22    #[error("failed to serialize email metadata")]
23    MetadataSerialize(serde_json::Error),
24}
25
26/// Checks if the provided mime is for an email
27pub fn is_mail_mime(mime: &Mime) -> bool {
28    mime.essence_str() == "message/rfc822"
29}
30
/// JSON document version of the email metadata, holds the details
/// extracted from a parsed email (sender, recipients, subject, date,
/// headers, and the list of attachments)
#[derive(Debug, Serialize, Deserialize)]
pub struct EmailMetadataDocument {
    /// Source of the email
    pub from: EmailEntity,
    /// Destination of the email
    pub to: Vec<EmailEntity>,
    /// cc'ed emails
    pub cc: Vec<EmailEntity>,
    /// bcc'ed emails
    pub bcc: Vec<EmailEntity>,
    /// Email subject line
    pub subject: Option<String>,
    /// Send date of the email (rfc3339)
    pub date: Option<String>,
    /// Optional message ID
    pub message_id: Option<String>,
    /// Collection of headers
    pub headers: Vec<EmailHeader>,
    /// List of attachments
    pub attachments: Vec<EmailAttachment>,
}
53
/// Summary of a single email attachment stored within the
/// [EmailMetadataDocument]
#[derive(Debug, Serialize, Deserialize)]
pub struct EmailAttachment {
    /// Name of the attachment
    pub name: String,
    /// Length of the attachment as reported by the mail parser
    /// (presumably the size of the contents in bytes, TODO confirm)
    pub length: usize,
    /// Mime type of the attachment in `type/subtype` form, or only
    /// `type` when the content type has no subtype
    pub mime: String,
}
61
/// Single header from an email as a name and value pair
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmailHeader {
    /// Name of the header
    pub name: String,
    /// Value of the header
    pub value: String,
}
67
/// Optional address and name combination, usually at least one part
/// of this exists, this is used for headers like To, From, ..etc
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EmailEntity {
    /// Display name portion of the entity, if one was present
    pub name: Option<String>,
    /// Email address portion of the entity, if one was present
    pub address: Option<String>,
}
75
76/// Turns a [Address] into a collection of email entities
77fn map_email_address(address: Option<&Address<'_>>) -> Vec<EmailEntity> {
78    let address = match address {
79        Some(value) => value,
80        None => return Vec::new(),
81    };
82
83    match address {
84        Address::List(addresses) => addresses
85            .iter()
86            .map(|value| EmailEntity {
87                address: value.address().map(|value| value.to_string()),
88                name: value.name().map(|value| value.to_string()),
89            })
90            .collect(),
91        Address::Group(groups) => groups
92            .iter()
93            .flat_map(|group| group.addresses.iter())
94            .map(|value| EmailEntity {
95                address: value.address().map(|value| value.to_string()),
96                name: value.name().map(|value| value.to_string()),
97            })
98            .collect(),
99    }
100}
101
102pub fn process_email(
103    config: &Option<ProcessingConfig>,
104    file_bytes: &[u8],
105) -> Result<ProcessingOutput, ProcessingError> {
106    let is_allowed_attachments = config
107        .as_ref()
108        // Config is nothing or
109        .is_none_or(|config| {
110            // Email config is nothing or
111            config
112                .email
113                .as_ref()
114                // Skip attachments is specified and true
115                .is_none_or(|email| email.skip_attachments.is_none_or(|value| !value))
116        });
117
118    let parser = MessageParser::default();
119    let message = match parser.parse(file_bytes) {
120        Some(value) => value,
121        None => {
122            // Nothing could be extracted from the file
123            return Ok(ProcessingOutput::default());
124        }
125    };
126
127    let from = map_email_address(message.from());
128
129    let from = from
130        .first()
131        // Email must have at least one sender
132        .ok_or(EmailProcessingError::MissingSender)?
133        .clone();
134
135    let to = map_email_address(message.to());
136    let cc = map_email_address(message.cc());
137    let bcc = map_email_address(message.bcc());
138
139    let subject = message.subject().map(|value| value.to_string());
140    let date = message
141        .date()
142        // Turn the date into an ISO date
143        .map(|value| value.to_rfc3339());
144    let message_id = message.message_id().map(|value| value.to_string());
145
146    let headers: Vec<_> = message
147        .headers_raw()
148        .map(|(name, value)| EmailHeader {
149            name: name.to_string(),
150            value: value.to_string(),
151        })
152        .collect();
153
154    let mut attachments: Vec<EmailAttachment> = Vec::new();
155    let mut additional_files: Vec<AdditionalProcessingFile> = Vec::new();
156
157    let text_body = message
158        .text_bodies()
159        .next()
160        .and_then(|body| body.text_contents());
161
162    // Get the HTML body
163    let mut html_body = message
164        .html_bodies()
165        .next()
166        .filter(|body| body.is_text_html())
167        .and_then(|body| body.text_contents())
168        .map(|value| value.to_string());
169
170    let text_content = match (text_body.as_ref(), html_body.as_ref()) {
171        // Clean the text content removing any HTML
172        (Some(value), _) => {
173            Some(
174                html_to_text(value)
175                    // Fallback to the email html_to_text on failure (it is infallible)
176                    .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
177            )
178        }
179        // Attempt extracting text content from the HTMl
180        (_, Some(value)) => {
181            Some(
182                html_to_text(value)
183                    // Fallback to the email html_to_text on failure (it is infallible)
184                    .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
185            )
186        }
187        _ => None,
188    };
189
190    for attachment in message.attachments() {
191        let name = match attachment.attachment_name().map(|value| value.to_string()) {
192            Some(value) => value,
193            None => {
194                tracing::warn!("ignoring attachment without name");
195                continue;
196            }
197        };
198
199        let length = attachment.len();
200        let raw_mime = match attachment
201            .content_type()
202            .map(|value| match value.subtype() {
203                Some(subtype) => format!("{}/{}", value.c_type, subtype),
204                None => format!("{}", value.c_type),
205            }) {
206            Some(value) => value,
207            None => {
208                tracing::warn!(?name, ?length, "ignoring attachment without mime type");
209                continue;
210            }
211        };
212
213        let mime: Mime = match raw_mime.parse() {
214            Ok(value) => value,
215            Err(cause) => {
216                tracing::error!(?cause, ?raw_mime, "invalid email attachment file mime type");
217                continue;
218            }
219        };
220
221        let is_inline = attachment
222            .content_disposition()
223            .is_some_and(|value| value.is_inline());
224
225        // For inline attachments with a content_id we inline them as base64 strings
226        // directly into the email content
227        if let (true, Some(content_id), Some(html_body)) =
228            (is_inline, attachment.content_id(), html_body.as_mut())
229        {
230            // Create a data URL for the content
231            let data = attachment.contents();
232            let base64_data = BASE64_STANDARD.encode(data);
233            let data_uri = format!("data:{raw_mime};base64,{base64_data}");
234
235            let key = format!("cid:{content_id}");
236
237            // Replace usages of the CID with the inline variant
238            let new_body = html_body.replace(&key, &data_uri);
239            *html_body = new_body;
240            continue;
241        }
242
243        attachments.push(EmailAttachment {
244            name: name.clone(),
245            length,
246            mime: raw_mime,
247        });
248
249        // Capture attachments if allowed
250        if is_allowed_attachments {
251            let bytes = attachment.contents();
252            let bytes = Bytes::copy_from_slice(bytes);
253            additional_files.push(AdditionalProcessingFile {
254                fixed_id: None,
255                name,
256                mime,
257                bytes,
258            });
259        }
260    }
261
262    let document = EmailMetadataDocument {
263        from,
264        to,
265        cc,
266        bcc,
267        subject,
268        date,
269        message_id,
270        headers,
271        attachments,
272    };
273
274    let metadata_bytes = match serde_json::to_vec(&document) {
275        Ok(value) => value,
276        Err(cause) => {
277            tracing::error!(?cause, "failed to serialize email json metadata document");
278            return Err(ProcessingError::Email(
279                EmailProcessingError::MetadataSerialize(cause),
280            ));
281        }
282    };
283
284    let pages = text_content.as_ref().map(|value| {
285        vec![DocumentPage {
286            content: value.to_string(),
287            page: 0,
288        }]
289    });
290
291    let index_metadata = ProcessingIndexMetadata { pages };
292    let mut upload_queue = vec![QueuedUpload::new(
293        mime::APPLICATION_JSON,
294        GeneratedFileType::Metadata,
295        metadata_bytes.into(),
296    )];
297
298    if let Some(html_body) = html_body {
299        upload_queue.push(QueuedUpload::new(
300            mime::TEXT_HTML,
301            GeneratedFileType::HtmlContent,
302            html_body.into_bytes().into(),
303        ));
304    }
305
306    if let Some(text_body) = text_content {
307        upload_queue.push(QueuedUpload::new(
308            mime::TEXT_PLAIN,
309            GeneratedFileType::TextContent,
310            text_body.into_bytes().into(),
311        ));
312    }
313
314    Ok(ProcessingOutput {
315        encrypted: false,
316        additional_files,
317        index_metadata: Some(index_metadata),
318        upload_queue,
319    })
320}