1use crate::{
2 files::{generated::QueuedUpload, upload_file::ProcessingConfig},
3 processing::html_to_text::html_to_text,
4};
5use base64::{Engine, prelude::BASE64_STANDARD};
6use bytes::Bytes;
7use docbox_database::models::generated_file::GeneratedFileType;
8use docbox_search::models::DocumentPage;
9use mail_parser::{
10 Address, MessageParser, MimeHeaders, decoders::html::html_to_text as mail_html_to_text,
11};
12use mime::Mime;
13use serde::{Deserialize, Serialize};
14
15use super::{AdditionalProcessingFile, ProcessingError, ProcessingIndexMetadata, ProcessingOutput};
16
17pub fn is_mail_mime(mime: &Mime) -> bool {
19 mime.essence_str() == "message/rfc822"
20}
21
/// Metadata extracted from a parsed email.
///
/// `process_email` serializes this document to JSON and queues it as the
/// generated metadata file for the email.
#[derive(Debug, Serialize, Deserialize)]
pub struct EmailMetadataDocument {
    /// Sender of the email (the first entry of the "from" addresses)
    pub from: EmailEntity,
    /// Recipients taken from the "to" addresses
    pub to: Vec<EmailEntity>,
    /// Recipients taken from the "cc" addresses
    pub cc: Vec<EmailEntity>,
    /// Recipients taken from the "bcc" addresses
    pub bcc: Vec<EmailEntity>,
    /// Subject of the email, if one was present
    pub subject: Option<String>,
    /// Date of the email formatted as an RFC 3339 string, if one was present
    pub date: Option<String>,
    /// Message ID of the email, if one was present
    pub message_id: Option<String>,
    /// Raw headers of the email as name/value pairs
    pub headers: Vec<EmailHeader>,
    /// Named attachments of the email that were not embedded into the HTML body
    pub attachments: Vec<EmailAttachment>,
}
44
/// Summary of a single email attachment stored in [`EmailMetadataDocument`].
#[derive(Debug, Serialize, Deserialize)]
pub struct EmailAttachment {
    /// Name of the attachment as reported by the mail parser
    pub name: String,
    /// Length of the attachment part as reported by the mail parser
    /// (presumably in bytes, confirm against the `mail_parser` docs)
    pub length: usize,
    /// Mime type of the attachment in its raw `type/subtype` string form
    pub mime: String,
}
52
/// A single raw email header stored in [`EmailMetadataDocument`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmailHeader {
    /// Name of the header
    pub name: String,
    /// Raw value of the header
    pub value: String,
}
58
/// A sender or recipient taken from one of the email address headers.
///
/// Both fields are optional because the parsed address may be missing
/// either part.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EmailEntity {
    /// Name associated with the address, when the parser provides one
    pub name: Option<String>,
    /// Email address, when the parser provides one
    pub address: Option<String>,
}
66
67fn map_email_address(address: Option<&Address<'_>>) -> Vec<EmailEntity> {
69 let address = match address {
70 Some(value) => value,
71 None => return Vec::new(),
72 };
73
74 match address {
75 Address::List(addresses) => addresses
76 .iter()
77 .map(|value| EmailEntity {
78 address: value.address().map(|value| value.to_string()),
79 name: value.name().map(|value| value.to_string()),
80 })
81 .collect(),
82 Address::Group(groups) => groups
83 .iter()
84 .flat_map(|group| group.addresses.iter())
85 .map(|value| EmailEntity {
86 address: value.address().map(|value| value.to_string()),
87 name: value.name().map(|value| value.to_string()),
88 })
89 .collect(),
90 }
91}
92
93pub fn process_email(
94 config: &Option<ProcessingConfig>,
95 file_bytes: &[u8],
96) -> Result<ProcessingOutput, ProcessingError> {
97 let is_allowed_attachments = config
98 .as_ref()
99 .is_none_or(|config| {
101 config
103 .email
104 .as_ref()
105 .is_none_or(|email| email.skip_attachments.is_none_or(|value| !value))
107 });
108
109 let parser = MessageParser::default();
110 let message = match parser.parse(file_bytes) {
111 Some(value) => value,
112 None => {
113 return Ok(ProcessingOutput::default());
115 }
116 };
117
118 let from = map_email_address(message.from());
119
120 let from = from
121 .first()
122 .ok_or_else(|| {
124 ProcessingError::MalformedFile("email must have at least one sender".to_string())
125 })?
126 .clone();
127
128 let to = map_email_address(message.to());
129 let cc = map_email_address(message.cc());
130 let bcc = map_email_address(message.bcc());
131
132 let subject = message.subject().map(|value| value.to_string());
133 let date = message
134 .date()
135 .map(|value| value.to_rfc3339());
137 let message_id = message.message_id().map(|value| value.to_string());
138
139 let headers: Vec<_> = message
140 .headers_raw()
141 .map(|(name, value)| EmailHeader {
142 name: name.to_string(),
143 value: value.to_string(),
144 })
145 .collect();
146
147 let mut attachments: Vec<EmailAttachment> = Vec::new();
148 let mut additional_files: Vec<AdditionalProcessingFile> = Vec::new();
149
150 let text_body = message
151 .text_bodies()
152 .next()
153 .and_then(|body| body.text_contents());
154
155 let mut html_body = message
157 .html_bodies()
158 .next()
159 .and_then(|body| body.text_contents())
160 .map(|value| value.to_string());
161
162 let text_content = match (text_body.as_ref(), html_body.as_ref()) {
163 (Some(value), _) => {
165 Some(
166 html_to_text(value)
167 .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
169 )
170 }
171 (_, Some(value)) => {
173 Some(
174 html_to_text(value)
175 .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
177 )
178 }
179 _ => None,
180 };
181
182 for attachment in message.attachments() {
183 let name = match attachment.attachment_name().map(|value| value.to_string()) {
184 Some(value) => value,
185 None => {
186 tracing::warn!("ignoring attachment without name");
187 continue;
188 }
189 };
190
191 let length = attachment.len();
192 let raw_mime = match attachment
193 .content_type()
194 .map(|value| match value.subtype() {
195 Some(subtype) => format!("{}/{}", value.c_type, subtype),
196 None => format!("{}", value.c_type),
197 }) {
198 Some(value) => value,
199 None => {
200 tracing::warn!(?name, ?length, "ignoring attachment without mime type");
201 continue;
202 }
203 };
204
205 let mime: Mime = match raw_mime.parse() {
206 Ok(value) => value,
207 Err(cause) => {
208 tracing::error!(?cause, ?raw_mime, "invalid email attachment file mime type");
209 continue;
210 }
211 };
212
213 let is_inline = attachment
214 .content_disposition()
215 .is_some_and(|value| value.is_inline());
216
217 if let (true, Some(content_id), Some(html_body)) =
220 (is_inline, attachment.content_id(), html_body.as_mut())
221 {
222 let data = attachment.contents();
224 let base64_data = BASE64_STANDARD.encode(data);
225 let data_uri = format!("data:{};base64,{}", raw_mime, base64_data);
226
227 let key = format!("cid:{content_id}");
228
229 let new_body = html_body.replace(&key, &data_uri);
231 *html_body = new_body;
232 continue;
233 }
234
235 attachments.push(EmailAttachment {
236 name: name.clone(),
237 length,
238 mime: raw_mime,
239 });
240
241 if is_allowed_attachments {
243 let bytes = attachment.contents();
244 let bytes = Bytes::copy_from_slice(bytes);
245 additional_files.push(AdditionalProcessingFile {
246 fixed_id: None,
247 name,
248 mime,
249 bytes,
250 });
251 }
252 }
253
254 let document = EmailMetadataDocument {
255 from,
256 to,
257 cc,
258 bcc,
259 subject,
260 date,
261 message_id,
262 headers,
263 attachments,
264 };
265
266 let metadata_bytes = match serde_json::to_vec(&document) {
267 Ok(value) => value,
268 Err(cause) => {
269 tracing::error!(?cause, "failed to serialize email json metadata document");
270 return Err(ProcessingError::InternalServerError);
271 }
272 };
273
274 let pages = text_content.as_ref().map(|value| {
275 vec![DocumentPage {
276 content: value.to_string(),
277 page: 0,
278 }]
279 });
280
281 let index_metadata = ProcessingIndexMetadata { pages };
282 let mut upload_queue = vec![QueuedUpload::new(
283 mime::APPLICATION_JSON,
284 GeneratedFileType::Metadata,
285 metadata_bytes.into(),
286 )];
287
288 if let Some(html_body) = html_body {
289 upload_queue.push(QueuedUpload::new(
290 mime::TEXT_HTML,
291 GeneratedFileType::HtmlContent,
292 html_body.into_bytes().into(),
293 ));
294 }
295
296 if let Some(text_body) = text_content {
297 upload_queue.push(QueuedUpload::new(
298 mime::TEXT_PLAIN,
299 GeneratedFileType::TextContent,
300 text_body.into_bytes().into(),
301 ));
302 }
303
304 Ok(ProcessingOutput {
305 encrypted: false,
306 additional_files,
307 index_metadata: Some(index_metadata),
308 upload_queue,
309 })
310}