1use crate::{ProcessingConfig, QueuedUpload, html_to_text::html_to_text};
2use base64::{Engine, prelude::BASE64_STANDARD};
3use bytes::Bytes;
4use docbox_database::models::generated_file::GeneratedFileType;
5use docbox_search::models::DocumentPage;
6use mail_parser::{
7 Address, MessageParser, MimeHeaders, decoders::html::html_to_text as mail_html_to_text,
8};
9use mime::Mime;
10use serde::{Deserialize, Serialize};
11use thiserror::Error;
12
13use super::{AdditionalProcessingFile, ProcessingError, ProcessingIndexMetadata, ProcessingOutput};
14
15#[derive(Debug, Error)]
16pub enum EmailProcessingError {
17 #[error("email must have at least one sender")]
19 MissingSender,
20
21 #[error("failed to serialize email metadata")]
23 MetadataSerialize(serde_json::Error),
24}
25
26pub fn is_mail_mime(mime: &Mime) -> bool {
28 mime.essence_str() == "message/rfc822"
29}
30
31#[derive(Debug, Serialize, Deserialize)]
33pub struct EmailMetadataDocument {
34 pub from: EmailEntity,
36 pub to: Vec<EmailEntity>,
38 pub cc: Vec<EmailEntity>,
40 pub bcc: Vec<EmailEntity>,
42 pub subject: Option<String>,
44 pub date: Option<String>,
46 pub message_id: Option<String>,
48 pub headers: Vec<EmailHeader>,
50 pub attachments: Vec<EmailAttachment>,
52}
53
54#[derive(Debug, Serialize, Deserialize)]
55pub struct EmailAttachment {
56 pub name: String,
58 pub length: usize,
59 pub mime: String,
60}
61
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct EmailHeader {
64 pub name: String,
65 pub value: String,
66}
67
68#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
71pub struct EmailEntity {
72 pub name: Option<String>,
73 pub address: Option<String>,
74}
75
76fn map_email_address(address: Option<&Address<'_>>) -> Vec<EmailEntity> {
78 let address = match address {
79 Some(value) => value,
80 None => return Vec::new(),
81 };
82
83 match address {
84 Address::List(addresses) => addresses
85 .iter()
86 .map(|value| EmailEntity {
87 address: value.address().map(|value| value.to_string()),
88 name: value.name().map(|value| value.to_string()),
89 })
90 .collect(),
91 Address::Group(groups) => groups
92 .iter()
93 .flat_map(|group| group.addresses.iter())
94 .map(|value| EmailEntity {
95 address: value.address().map(|value| value.to_string()),
96 name: value.name().map(|value| value.to_string()),
97 })
98 .collect(),
99 }
100}
101
102pub fn process_email(
103 config: &Option<ProcessingConfig>,
104 file_bytes: &[u8],
105) -> Result<ProcessingOutput, ProcessingError> {
106 let is_allowed_attachments = config
107 .as_ref()
108 .is_none_or(|config| {
110 config
112 .email
113 .as_ref()
114 .is_none_or(|email| email.skip_attachments.is_none_or(|value| !value))
116 });
117
118 let parser = MessageParser::default();
119 let message = match parser.parse(file_bytes) {
120 Some(value) => value,
121 None => {
122 return Ok(ProcessingOutput::default());
124 }
125 };
126
127 let from = map_email_address(message.from());
128
129 let from = from
130 .first()
131 .ok_or(EmailProcessingError::MissingSender)?
133 .clone();
134
135 let to = map_email_address(message.to());
136 let cc = map_email_address(message.cc());
137 let bcc = map_email_address(message.bcc());
138
139 let subject = message.subject().map(|value| value.to_string());
140 let date = message
141 .date()
142 .map(|value| value.to_rfc3339());
144 let message_id = message.message_id().map(|value| value.to_string());
145
146 let headers: Vec<_> = message
147 .headers_raw()
148 .map(|(name, value)| EmailHeader {
149 name: name.to_string(),
150 value: value.to_string(),
151 })
152 .collect();
153
154 let mut attachments: Vec<EmailAttachment> = Vec::new();
155 let mut additional_files: Vec<AdditionalProcessingFile> = Vec::new();
156
157 let text_body = message
158 .text_bodies()
159 .next()
160 .and_then(|body| body.text_contents());
161
162 let mut html_body = message
164 .html_bodies()
165 .next()
166 .filter(|body| body.is_text_html())
167 .and_then(|body| body.text_contents())
168 .map(|value| value.to_string());
169
170 let text_content = match (text_body.as_ref(), html_body.as_ref()) {
171 (Some(value), _) => {
173 Some(
174 html_to_text(value)
175 .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
177 )
178 }
179 (_, Some(value)) => {
181 Some(
182 html_to_text(value)
183 .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
185 )
186 }
187 _ => None,
188 };
189
190 for attachment in message.attachments() {
191 let name = match attachment.attachment_name().map(|value| value.to_string()) {
192 Some(value) => value,
193 None => {
194 tracing::warn!("ignoring attachment without name");
195 continue;
196 }
197 };
198
199 let length = attachment.len();
200 let raw_mime = match attachment
201 .content_type()
202 .map(|value| match value.subtype() {
203 Some(subtype) => format!("{}/{}", value.c_type, subtype),
204 None => format!("{}", value.c_type),
205 }) {
206 Some(value) => value,
207 None => {
208 tracing::warn!(?name, ?length, "ignoring attachment without mime type");
209 continue;
210 }
211 };
212
213 let mime: Mime = match raw_mime.parse() {
214 Ok(value) => value,
215 Err(cause) => {
216 tracing::error!(?cause, ?raw_mime, "invalid email attachment file mime type");
217 continue;
218 }
219 };
220
221 let is_inline = attachment
222 .content_disposition()
223 .is_some_and(|value| value.is_inline());
224
225 if let (true, Some(content_id), Some(html_body)) =
228 (is_inline, attachment.content_id(), html_body.as_mut())
229 {
230 let data = attachment.contents();
232 let base64_data = BASE64_STANDARD.encode(data);
233 let data_uri = format!("data:{raw_mime};base64,{base64_data}");
234
235 let key = format!("cid:{content_id}");
236
237 let new_body = html_body.replace(&key, &data_uri);
239 *html_body = new_body;
240 continue;
241 }
242
243 attachments.push(EmailAttachment {
244 name: name.clone(),
245 length,
246 mime: raw_mime,
247 });
248
249 if is_allowed_attachments {
251 let bytes = attachment.contents();
252 let bytes = Bytes::copy_from_slice(bytes);
253 additional_files.push(AdditionalProcessingFile {
254 fixed_id: None,
255 name,
256 mime,
257 bytes,
258 });
259 }
260 }
261
262 let document = EmailMetadataDocument {
263 from,
264 to,
265 cc,
266 bcc,
267 subject,
268 date,
269 message_id,
270 headers,
271 attachments,
272 };
273
274 let metadata_bytes = match serde_json::to_vec(&document) {
275 Ok(value) => value,
276 Err(cause) => {
277 tracing::error!(?cause, "failed to serialize email json metadata document");
278 return Err(ProcessingError::Email(
279 EmailProcessingError::MetadataSerialize(cause),
280 ));
281 }
282 };
283
284 let pages = text_content.as_ref().map(|value| {
285 vec![DocumentPage {
286 content: value.to_string(),
287 page: 0,
288 }]
289 });
290
291 let index_metadata = ProcessingIndexMetadata { pages };
292 let mut upload_queue = vec![QueuedUpload::new(
293 mime::APPLICATION_JSON,
294 GeneratedFileType::Metadata,
295 metadata_bytes.into(),
296 )];
297
298 if let Some(html_body) = html_body {
299 upload_queue.push(QueuedUpload::new(
300 mime::TEXT_HTML,
301 GeneratedFileType::HtmlContent,
302 html_body.into_bytes().into(),
303 ));
304 }
305
306 if let Some(text_body) = text_content {
307 upload_queue.push(QueuedUpload::new(
308 mime::TEXT_PLAIN,
309 GeneratedFileType::TextContent,
310 text_body.into_bytes().into(),
311 ));
312 }
313
314 Ok(ProcessingOutput {
315 encrypted: false,
316 additional_files,
317 index_metadata: Some(index_metadata),
318 upload_queue,
319 })
320}