1use crate::{
2 files::{generated::QueuedUpload, upload_file::ProcessingConfig},
3 processing::html_to_text::html_to_text,
4};
5use base64::{Engine, prelude::BASE64_STANDARD};
6use bytes::Bytes;
7use docbox_database::models::generated_file::GeneratedFileType;
8use docbox_search::models::DocumentPage;
9use mail_parser::{
10 Address, MessageParser, MimeHeaders, decoders::html::html_to_text as mail_html_to_text,
11};
12use mime::Mime;
13use serde::{Deserialize, Serialize};
14
15use super::{AdditionalProcessingFile, ProcessingError, ProcessingIndexMetadata, ProcessingOutput};
16
17pub fn is_mail_mime(mime: &Mime) -> bool {
19 mime.essence_str() == "message/rfc822"
20}
21
22#[derive(Debug, Serialize, Deserialize)]
24pub struct EmailMetadataDocument {
25 pub from: EmailEntity,
27 pub to: Vec<EmailEntity>,
29 pub cc: Vec<EmailEntity>,
31 pub bcc: Vec<EmailEntity>,
33 pub subject: Option<String>,
35 pub date: Option<String>,
37 pub message_id: Option<String>,
39 pub headers: Vec<EmailHeader>,
41 pub attachments: Vec<EmailAttachment>,
43}
44
45#[derive(Debug, Serialize, Deserialize)]
46pub struct EmailAttachment {
47 pub name: String,
49 pub length: usize,
50 pub mime: String,
51}
52
53#[derive(Debug, Clone, Serialize, Deserialize)]
54pub struct EmailHeader {
55 pub name: String,
56 pub value: String,
57}
58
59#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
62pub struct EmailEntity {
63 pub name: Option<String>,
64 pub address: Option<String>,
65}
66
67fn map_email_address(address: Option<&Address<'_>>) -> Vec<EmailEntity> {
69 let address = match address {
70 Some(value) => value,
71 None => return Vec::new(),
72 };
73
74 match address {
75 Address::List(addresses) => addresses
76 .iter()
77 .map(|value| EmailEntity {
78 address: value.address().map(|value| value.to_string()),
79 name: value.name().map(|value| value.to_string()),
80 })
81 .collect(),
82 Address::Group(groups) => groups
83 .iter()
84 .flat_map(|group| group.addresses.iter())
85 .map(|value| EmailEntity {
86 address: value.address().map(|value| value.to_string()),
87 name: value.name().map(|value| value.to_string()),
88 })
89 .collect(),
90 }
91}
92
93pub fn process_email(
94 config: &Option<ProcessingConfig>,
95 file_bytes: &[u8],
96) -> Result<ProcessingOutput, ProcessingError> {
97 let is_allowed_attachments = config
98 .as_ref()
99 .is_none_or(|config| {
101 config
103 .email
104 .as_ref()
105 .is_none_or(|email| email.skip_attachments.is_none_or(|value| !value))
107 });
108
109 let parser = MessageParser::default();
110 let message = match parser.parse(file_bytes) {
111 Some(value) => value,
112 None => {
113 return Ok(ProcessingOutput::default());
115 }
116 };
117
118 let from = map_email_address(message.from());
119
120 let from = from
121 .first()
122 .ok_or_else(|| {
124 ProcessingError::MalformedFile("email must have at least one sender".to_string())
125 })?
126 .clone();
127
128 let to = map_email_address(message.to());
129 let cc = map_email_address(message.cc());
130 let bcc = map_email_address(message.bcc());
131
132 let subject = message.subject().map(|value| value.to_string());
133 let date = message
134 .date()
135 .map(|value| value.to_rfc3339());
137 let message_id = message.message_id().map(|value| value.to_string());
138
139 let headers: Vec<_> = message
140 .headers_raw()
141 .map(|(name, value)| EmailHeader {
142 name: name.to_string(),
143 value: value.to_string(),
144 })
145 .collect();
146
147 let mut attachments: Vec<EmailAttachment> = Vec::new();
148 let mut additional_files: Vec<AdditionalProcessingFile> = Vec::new();
149
150 let text_body = message
151 .text_bodies()
152 .next()
153 .and_then(|body| body.text_contents());
154
155 let mut html_body = message
157 .html_bodies()
158 .next()
159 .filter(|body| body.is_text_html())
160 .and_then(|body| body.text_contents())
161 .map(|value| value.to_string());
162
163 let text_content = match (text_body.as_ref(), html_body.as_ref()) {
164 (Some(value), _) => {
166 Some(
167 html_to_text(value)
168 .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
170 )
171 }
172 (_, Some(value)) => {
174 Some(
175 html_to_text(value)
176 .unwrap_or_else(|_| mail_html_to_text(value).to_string()),
178 )
179 }
180 _ => None,
181 };
182
183 for attachment in message.attachments() {
184 let name = match attachment.attachment_name().map(|value| value.to_string()) {
185 Some(value) => value,
186 None => {
187 tracing::warn!("ignoring attachment without name");
188 continue;
189 }
190 };
191
192 let length = attachment.len();
193 let raw_mime = match attachment
194 .content_type()
195 .map(|value| match value.subtype() {
196 Some(subtype) => format!("{}/{}", value.c_type, subtype),
197 None => format!("{}", value.c_type),
198 }) {
199 Some(value) => value,
200 None => {
201 tracing::warn!(?name, ?length, "ignoring attachment without mime type");
202 continue;
203 }
204 };
205
206 let mime: Mime = match raw_mime.parse() {
207 Ok(value) => value,
208 Err(cause) => {
209 tracing::error!(?cause, ?raw_mime, "invalid email attachment file mime type");
210 continue;
211 }
212 };
213
214 let is_inline = attachment
215 .content_disposition()
216 .is_some_and(|value| value.is_inline());
217
218 if let (true, Some(content_id), Some(html_body)) =
221 (is_inline, attachment.content_id(), html_body.as_mut())
222 {
223 let data = attachment.contents();
225 let base64_data = BASE64_STANDARD.encode(data);
226 let data_uri = format!("data:{raw_mime};base64,{base64_data}");
227
228 let key = format!("cid:{content_id}");
229
230 let new_body = html_body.replace(&key, &data_uri);
232 *html_body = new_body;
233 continue;
234 }
235
236 attachments.push(EmailAttachment {
237 name: name.clone(),
238 length,
239 mime: raw_mime,
240 });
241
242 if is_allowed_attachments {
244 let bytes = attachment.contents();
245 let bytes = Bytes::copy_from_slice(bytes);
246 additional_files.push(AdditionalProcessingFile {
247 fixed_id: None,
248 name,
249 mime,
250 bytes,
251 });
252 }
253 }
254
255 let document = EmailMetadataDocument {
256 from,
257 to,
258 cc,
259 bcc,
260 subject,
261 date,
262 message_id,
263 headers,
264 attachments,
265 };
266
267 let metadata_bytes = match serde_json::to_vec(&document) {
268 Ok(value) => value,
269 Err(cause) => {
270 tracing::error!(?cause, "failed to serialize email json metadata document");
271 return Err(ProcessingError::InternalServerError);
272 }
273 };
274
275 let pages = text_content.as_ref().map(|value| {
276 vec![DocumentPage {
277 content: value.to_string(),
278 page: 0,
279 }]
280 });
281
282 let index_metadata = ProcessingIndexMetadata { pages };
283 let mut upload_queue = vec![QueuedUpload::new(
284 mime::APPLICATION_JSON,
285 GeneratedFileType::Metadata,
286 metadata_bytes.into(),
287 )];
288
289 if let Some(html_body) = html_body {
290 upload_queue.push(QueuedUpload::new(
291 mime::TEXT_HTML,
292 GeneratedFileType::HtmlContent,
293 html_body.into_bytes().into(),
294 ));
295 }
296
297 if let Some(text_body) = text_content {
298 upload_queue.push(QueuedUpload::new(
299 mime::TEXT_PLAIN,
300 GeneratedFileType::TextContent,
301 text_body.into_bytes().into(),
302 ));
303 }
304
305 Ok(ProcessingOutput {
306 encrypted: false,
307 additional_files,
308 index_metadata: Some(index_metadata),
309 upload_queue,
310 })
311}