docbox_core/processing/
email.rs

1use crate::{
2    files::{generated::QueuedUpload, upload_file::ProcessingConfig},
3    processing::html_to_text::html_to_text,
4};
5use base64::{Engine, prelude::BASE64_STANDARD};
6use bytes::Bytes;
7use docbox_database::models::generated_file::GeneratedFileType;
8use docbox_search::models::DocumentPage;
9use mail_parser::{
10    Address, MessageParser, MimeHeaders, decoders::html::html_to_text as mail_html_to_text,
11};
12use mime::Mime;
13use serde::{Deserialize, Serialize};
14
15use super::{AdditionalProcessingFile, ProcessingError, ProcessingIndexMetadata, ProcessingOutput};
16
17/// Checks if the provided mime is for an email
18pub fn is_mail_mime(mime: &Mime) -> bool {
19    mime.essence_str() == "message/rfc822"
20}
21
22/// JSON document version of the email metadata, extracts
23#[derive(Debug, Serialize, Deserialize)]
24pub struct EmailMetadataDocument {
25    /// Source of the email
26    pub from: EmailEntity,
27    /// Destination of the email
28    pub to: Vec<EmailEntity>,
29    /// cc'ed emails
30    pub cc: Vec<EmailEntity>,
31    /// bcc'ed emails
32    pub bcc: Vec<EmailEntity>,
33    /// Email subject line
34    pub subject: Option<String>,
35    /// Send date of the email (rfc3339)
36    pub date: Option<String>,
37    /// Optional message ID
38    pub message_id: Option<String>,
39    /// Collection of headers
40    pub headers: Vec<EmailHeader>,
41    /// List of attachments
42    pub attachments: Vec<EmailAttachment>,
43}
44
45#[derive(Debug, Serialize, Deserialize)]
46pub struct EmailAttachment {
47    /// Name of the attachment
48    pub name: String,
49    pub length: usize,
50    pub mime: String,
51}
52
53#[derive(Debug, Clone, Serialize, Deserialize)]
54pub struct EmailHeader {
55    pub name: String,
56    pub value: String,
57}
58
59/// Optional address and name combination, usually at least one part
60/// of this exists, this is used for headers like To, From, ..etc
61#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
62pub struct EmailEntity {
63    pub name: Option<String>,
64    pub address: Option<String>,
65}
66
67/// Turns a [Address] into a collection of email entities
68fn map_email_address(address: Option<&Address<'_>>) -> Vec<EmailEntity> {
69    let address = match address {
70        Some(value) => value,
71        None => return Vec::new(),
72    };
73
74    match address {
75        Address::List(addresses) => addresses
76            .iter()
77            .map(|value| EmailEntity {
78                address: value.address().map(|value| value.to_string()),
79                name: value.name().map(|value| value.to_string()),
80            })
81            .collect(),
82        Address::Group(groups) => groups
83            .iter()
84            .flat_map(|group| group.addresses.iter())
85            .map(|value| EmailEntity {
86                address: value.address().map(|value| value.to_string()),
87                name: value.name().map(|value| value.to_string()),
88            })
89            .collect(),
90    }
91}
92
93pub fn process_email(
94    config: &Option<ProcessingConfig>,
95    file_bytes: &[u8],
96) -> Result<ProcessingOutput, ProcessingError> {
97    let is_allowed_attachments = config
98        .as_ref()
99        // Config is nothing or
100        .is_none_or(|config| {
101            // Email config is nothing or
102            config
103                .email
104                .as_ref()
105                // Skip attachments is specified and true
106                .is_none_or(|email| email.skip_attachments.is_none_or(|value| !value))
107        });
108
109    let parser = MessageParser::default();
110    let message = match parser.parse(file_bytes) {
111        Some(value) => value,
112        None => {
113            // Nothing could be extracted from the file
114            return Ok(ProcessingOutput::default());
115        }
116    };
117
118    let from = map_email_address(message.from());
119
120    let from = from
121        .first()
122        // Email must have at least one sender
123        .ok_or_else(|| {
124            ProcessingError::MalformedFile("email must have at least one sender".to_string())
125        })?
126        .clone();
127
128    let to = map_email_address(message.to());
129    let cc = map_email_address(message.cc());
130    let bcc = map_email_address(message.bcc());
131
132    let subject = message.subject().map(|value| value.to_string());
133    let date = message
134        .date()
135        // Turn the date into an ISO date
136        .map(|value| value.to_rfc3339());
137    let message_id = message.message_id().map(|value| value.to_string());
138
139    let headers: Vec<_> = message
140        .headers_raw()
141        .map(|(name, value)| EmailHeader {
142            name: name.to_string(),
143            value: value.to_string(),
144        })
145        .collect();
146
147    let mut attachments: Vec<EmailAttachment> = Vec::new();
148    let mut additional_files: Vec<AdditionalProcessingFile> = Vec::new();
149
150    let text_body = message
151        .text_bodies()
152        .next()
153        .and_then(|body| body.text_contents());
154
155    // Get the HTML body
156    let mut html_body = message
157        .html_bodies()
158        .next()
159        .and_then(|body| body.text_contents())
160        .map(|value| value.to_string());
161
162    let text_content = match (text_body.as_ref(), html_body.as_ref()) {
163        // Clean the text content removing any HTML
164        (Some(value), _) => {
165            Some(
166                html_to_text(value)
167                    // Fallback to the email html_to_text on failure (it is infallible)
168                    .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
169            )
170        }
171        // Attempt extracting text content from the HTMl
172        (_, Some(value)) => {
173            Some(
174                html_to_text(value)
175                    // Fallback to the email html_to_text on failure (it is infallible)
176                    .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
177            )
178        }
179        _ => None,
180    };
181
182    for attachment in message.attachments() {
183        let name = match attachment.attachment_name().map(|value| value.to_string()) {
184            Some(value) => value,
185            None => {
186                tracing::warn!("ignoring attachment without name");
187                continue;
188            }
189        };
190
191        let length = attachment.len();
192        let raw_mime = match attachment
193            .content_type()
194            .map(|value| match value.subtype() {
195                Some(subtype) => format!("{}/{}", value.c_type, subtype),
196                None => format!("{}", value.c_type),
197            }) {
198            Some(value) => value,
199            None => {
200                tracing::warn!(?name, ?length, "ignoring attachment without mime type");
201                continue;
202            }
203        };
204
205        let mime: Mime = match raw_mime.parse() {
206            Ok(value) => value,
207            Err(cause) => {
208                tracing::error!(?cause, ?raw_mime, "invalid email attachment file mime type");
209                continue;
210            }
211        };
212
213        let is_inline = attachment
214            .content_disposition()
215            .is_some_and(|value| value.is_inline());
216
217        // For inline attachments with a content_id we inline them as base64 strings
218        // directly into the email content
219        if let (true, Some(content_id), Some(html_body)) =
220            (is_inline, attachment.content_id(), html_body.as_mut())
221        {
222            // Create a data URL for the content
223            let data = attachment.contents();
224            let base64_data = BASE64_STANDARD.encode(data);
225            let data_uri = format!("data:{};base64,{}", raw_mime, base64_data);
226
227            let key = format!("cid:{content_id}");
228
229            // Replace usages of the CID with the inline variant
230            let new_body = html_body.replace(&key, &data_uri);
231            *html_body = new_body;
232            continue;
233        }
234
235        attachments.push(EmailAttachment {
236            name: name.clone(),
237            length,
238            mime: raw_mime,
239        });
240
241        // Capture attachments if allowed
242        if is_allowed_attachments {
243            let bytes = attachment.contents();
244            let bytes = Bytes::copy_from_slice(bytes);
245            additional_files.push(AdditionalProcessingFile {
246                fixed_id: None,
247                name,
248                mime,
249                bytes,
250            });
251        }
252    }
253
254    let document = EmailMetadataDocument {
255        from,
256        to,
257        cc,
258        bcc,
259        subject,
260        date,
261        message_id,
262        headers,
263        attachments,
264    };
265
266    let metadata_bytes = match serde_json::to_vec(&document) {
267        Ok(value) => value,
268        Err(cause) => {
269            tracing::error!(?cause, "failed to serialize email json metadata document");
270            return Err(ProcessingError::InternalServerError);
271        }
272    };
273
274    let pages = text_content.as_ref().map(|value| {
275        vec![DocumentPage {
276            content: value.to_string(),
277            page: 0,
278        }]
279    });
280
281    let index_metadata = ProcessingIndexMetadata { pages };
282    let mut upload_queue = vec![QueuedUpload::new(
283        mime::APPLICATION_JSON,
284        GeneratedFileType::Metadata,
285        metadata_bytes.into(),
286    )];
287
288    if let Some(html_body) = html_body {
289        upload_queue.push(QueuedUpload::new(
290            mime::TEXT_HTML,
291            GeneratedFileType::HtmlContent,
292            html_body.into_bytes().into(),
293        ));
294    }
295
296    if let Some(text_body) = text_content {
297        upload_queue.push(QueuedUpload::new(
298            mime::TEXT_PLAIN,
299            GeneratedFileType::TextContent,
300            text_body.into_bytes().into(),
301        ));
302    }
303
304    Ok(ProcessingOutput {
305        encrypted: false,
306        additional_files,
307        index_metadata: Some(index_metadata),
308        upload_queue,
309    })
310}