Skip to main content

br_email/
analyze.rs

1use br_crypto::encoding::code_to_utf8;
2use chrono::{DateTime, Local, TimeZone};
3use json::{object, JsonValue};
4use regex::Regex;
5use std::collections::HashMap;
6use std::ffi::OsStr;
7use std::io::{Error, ErrorKind, Write};
8use std::{env, fs, io};
9
/// Parsed representation of a single e-mail message.
#[derive(Debug)]
pub struct AnalyzeEmails {
    /// When set, messages that fail to parse are dumped into the current working directory.
    pub debug: bool,
    /// Header fields that have no dedicated member, keyed by lower-cased field name.
    pub header: HashMap<String, String>,
    /// Value of the `MIME-Version` header.
    pub mime_version: String,
    /// Boundary string taken from the top-level multipart `Content-Type` header.
    boundary: String,
    /// Hex digest of the raw input as returned by `br_crypto::md5::encrypt_hex`.
    pub md5: String,
    /// Length of the raw input in bytes.
    pub size: usize,
    /// Timestamp (seconds) parsed from the `Date` header.
    pub timestamp: i64,
    /// The `Date` header converted to local time, formatted as `%Y-%m-%d %H:%M:%S`.
    pub datetime: String,
    /// Decoded subject.
    pub subject: String,
    /// Sender (`From`), stored as address -> display name.
    pub from: HashMap<String, String>,
    /// Recipients (`To`), stored as address -> display name.
    pub to: HashMap<String, String>,
    /// Carbon-copy recipients (`Cc`), stored as address -> display name.
    pub cc: HashMap<String, String>,
    /// Addresses recipients should use when replying (`Reply-To`), stored as address -> display name.
    pub replyto: HashMap<String, String>,
    /// Content type (lower-cased media type of the top-level `Content-Type` header).
    pub content_type: String,
    /// Transfer encoding announced by the top-level `Content-Transfer-Encoding` header.
    pub content_transfer_encoding: ContentTransferEncoding,
    /// Actual sender (raw value of the `Sender` header).
    pub sender: String,
    /// Decoded `text/plain` body.
    pub body_text: String,
    /// Decoded `text/html` body.
    pub body_html: String,
    /// Attachments written to disk, keyed by the md5 of their content.
    pub files: JsonValue,
    /// Value of the `charset` parameter of the top-level `Content-Type` header.
    pub charset: String,
}
44
45impl AnalyzeEmails {
46    pub fn new(mut data: Vec<u8>, debug: bool) -> io::Result<AnalyzeEmails> {
47        let md5 = br_crypto::md5::encrypt_hex(&data.clone())
48            .leak()
49            .to_string();
50        let size = data.len();
51        let data_string = unsafe { String::from_utf8_unchecked(data.clone()) };
52        if data_string.contains("\n\n") {
53            let updated_string = data_string.replace("\n", "\r\n");
54            data = updated_string.as_bytes().to_vec();
55        }
56
57        let subsequence = "\r\n\r\n".as_bytes();
58
59        let (header, body) = match data
60            .windows(subsequence.len())
61            .position(|window| window == subsequence)
62        {
63            None => {
64                if debug {
65                    fs::write(
66                        format!(
67                            "{}/xygs-{}.eml",
68                            env::current_dir().unwrap().to_str().unwrap(),
69                            md5
70                        ),
71                        data.clone(),
72                    )?;
73                }
74                return Err(Error::other(format!("协议格式错误: {md5}")));
75            }
76            Some(e) => (data[..e].to_vec(), data[e + 4..].to_vec()),
77        };
78        let mut that = Self {
79            debug,
80            header: Default::default(),
81            mime_version: "".to_string(),
82            boundary: "".to_string(),
83            md5,
84            size,
85            timestamp: 0,
86            subject: "".to_string(),
87            from: Default::default(),
88            to: Default::default(),
89            cc: Default::default(),
90            replyto: Default::default(),
91            datetime: "".to_string(),
92            content_type: "".to_string(),
93            content_transfer_encoding: ContentTransferEncoding::Bit7,
94            sender: "".to_string(),
95            body_text: "".to_string(),
96            body_html: "".to_string(),
97            files: object! {},
98            charset: "utf-8".to_string(),
99        };
100        that.header(header)?;
101        that.body(body, data_string)?;
102        Ok(that)
103    }
104
105    fn header(&mut self, data: Vec<u8>) -> io::Result<()> {
106        let data = unsafe { String::from_utf8_unchecked(data) };
107        let data = data.replace("\r\n\t", "").replace("\r\n ", " ").leak();
108        for item in data.lines() {
109            let (key, value) = match item.find(": ") {
110                Some(e) => (&item[..e], &item[e + 2..]),
111                None => match item.find(":") {
112                    Some(e) => (&item[..e], &item[e + 1..]),
113                    None => continue,
114                },
115            };
116            let name = key.to_lowercase().leak();
117            if value.is_empty() {
118                continue;
119            }
120            match key.to_lowercase().as_str() {
121                "mime-version" => self.mime_version = value.to_string(),
122                "from" => {
123                    self.from = self.from(value);
124                }
125                "sender" => {
126                    self.sender = value.to_string();
127                }
128                "to" => {
129                    self.to = self.email_encoded(value);
130                }
131                "cc" => {
132                    self.cc = self.email_encoded(value);
133                }
134                "reply-to" => {
135                    self.replyto = self.email_encoded(value);
136                }
137                "subject" => {
138                    self.subject = self.subject(value.to_string());
139                }
140                "content-type" => {
141                    let types = value.split(";").collect::<Vec<&str>>();
142                    self.content_type = types[0].trim().to_lowercase().to_string();
143                    match self.content_type.as_str() {
144                        "multipart/mixed"
145                        | "multipart/alternative"
146                        | "multipart/related"
147                        | "multipart/report" => match types[1].find("boundary=") {
148                            None => {}
149                            Some(e) => {
150                                let boundary = &types[1][e..];
151                                self.boundary = boundary
152                                    .trim()
153                                    .trim_start_matches("boundary=")
154                                    .trim_start_matches("\"")
155                                    .trim_end_matches("\"")
156                                    .to_string();
157                            }
158                        },
159                        _ => {}
160                    }
161                    if types.len() > 1 {
162                        for item in types.iter() {
163                            if item.contains("charset=") {
164                                self.charset = item
165                                    .trim_start_matches("charset=")
166                                    .trim_start_matches("\"")
167                                    .trim_end_matches("\"")
168                                    .to_string();
169                            }
170                        }
171                    }
172                }
173                "content-transfer-encoding" => {
174                    self.content_transfer_encoding = ContentTransferEncoding::from(value);
175                }
176                "date" => self.datetime(value)?,
177                _ => {
178                    self.header
179                        .insert(name.trim().to_string(), value.to_string());
180                }
181            }
182        }
183        Ok(())
184    }
185    fn body(&mut self, data: Vec<u8>, old_data: String) -> io::Result<()> {
186        match self.content_type.to_lowercase().as_str() {
187            "text/html" => {
188                let data = self.content_transfer_encoding.decode(data)?;
189                let res = code_to_utf8(self.charset.as_str(), data.clone());
190                self.body_html = res;
191            }
192            "text/plain" => {
193                let data = self.content_transfer_encoding.decode(data)?;
194                let res = code_to_utf8(self.charset.as_str(), data.clone());
195                self.body_text = res;
196            }
197            "multipart/mixed"
198            | "multipart/alternative"
199            | "multipart/related"
200            | "multipart/report" => {
201                let data = self.content_transfer_encoding.decode(data.clone())?;
202                let mut parts = code_to_utf8(self.charset.as_str(), data.clone());
203                let mut parts_list = vec![];
204                let mut text = String::new();
205
206                parts = match parts.find(self.boundary.as_str()) {
207                    None => parts,
208                    Some(e) => parts[e..].to_string(),
209                };
210                for item in parts.lines() {
211                    if item.contains(self.boundary.as_str()) && text.is_empty() {
212                        continue;
213                    }
214                    if item.contains(self.boundary.as_str()) && text.clone() != "" {
215                        parts_list.push(text.clone());
216                        text = String::new();
217                        continue;
218                    }
219                    text = format!("{text}{item}\r\n");
220                }
221                for part in parts_list {
222                    if part.trim().is_empty() {
223                        continue;
224                    }
225                    self.parts(part.to_string(), old_data.clone())?;
226                }
227            }
228            _ => {
229                return Err(Error::new(
230                    ErrorKind::NotFound,
231                    format!("未知body类型: {}", self.content_type),
232                ));
233            }
234        }
235        Ok(())
236    }
237    /// 部分内容处理
238    fn parts(&mut self, data: String, old_data: String) -> io::Result<()> {
239        let (header, body) = match data.find("\r\n\r\n") {
240            None => {
241                if self.debug {
242                    fs::write(
243                        format!(
244                            "{}/head-{}.eml",
245                            env::current_dir().unwrap().to_str().unwrap(),
246                            self.md5
247                        ),
248                        old_data.clone(),
249                    )?;
250                }
251                return Err(Error::other("解析附件头失败"));
252            }
253            Some(e) => (
254                &data[..e]
255                    .replace("\r\n\t", " ")
256                    .replace("\r\n ", " ")
257                    .leak()
258                    .lines(),
259                &data[e + 4..],
260            ),
261        };
262
263        let mut filename = "".to_string();
264        let mut content_type = "";
265        let mut boundary = "";
266        let mut content_transfer_encoding = ContentTransferEncoding::None;
267        for item in header.clone() {
268            let (key, value) = match item.find(": ") {
269                Some(e) => (&item[..e], &item[e + 2..]),
270                None => match item.find(":") {
271                    Some(e) => (&item[..e], &item[e + 1..]),
272                    None => continue,
273                },
274            };
275
276            let name = key.to_lowercase();
277
278            match name.trim() {
279                "content-transfer-encoding" => {
280                    content_transfer_encoding = ContentTransferEncoding::from(value)
281                }
282                "content-type" => {
283                    let types = value.trim().split(";").collect::<Vec<&str>>();
284                    content_type = types[0].trim();
285                    let name = types
286                        .iter()
287                        .filter(|&x| x.trim().starts_with("name="))
288                        .map(|&x| x.trim().to_string())
289                        .collect::<Vec<String>>();
290                    if !name.is_empty() {
291                        let name = name[0].trim_start_matches("name=");
292                        filename = self.encoded(name);
293                    }
294                    match value.find("boundary=") {
295                        None => {}
296                        Some(i) => {
297                            boundary = &value[i + 9..];
298                            boundary = match boundary.find(";") {
299                                None => boundary,
300                                Some(i) => &boundary[..i],
301                            };
302                            boundary = boundary.trim_start_matches("\"").trim_end_matches("\"");
303                        }
304                    }
305                }
306                "content-id"
307                | "content-length"
308                | "mime-version"
309                | "content-description"
310                | "date"
311                | "x-attachment-id" => {}
312                "content-disposition" => {
313                    if filename.is_empty() && value.contains("filename=") {
314                        filename = value.split("filename=").collect::<Vec<&str>>()[1]
315                            .trim_start_matches("\"")
316                            .trim_end_matches("\"")
317                            .to_string();
318                    }
319                    if filename.is_empty() && value.contains("filename*=utf-8''") {
320                        filename = value.split("filename*=utf-8''").collect::<Vec<&str>>()[1]
321                            .trim_start_matches("\"")
322                            .trim_end_matches("\"")
323                            .to_string();
324                        filename = br_crypto::encoding::urlencoding_decode(filename.as_str());
325                    }
326                }
327                _ => {
328                    return Err(Error::new(
329                        ErrorKind::NotFound,
330                        format!("parts 未知 header 类型: {name} [{item}]"),
331                    ));
332                }
333            }
334        }
335
336        match content_type {
337            "text/plain" => {
338                if filename.is_empty() {
339                    let res = content_transfer_encoding.decode(body.as_bytes().to_vec())?;
340                    let text = code_to_utf8(self.charset.as_str(), res.clone());
341                    self.body_text = text;
342                } else {
343                    self.set_files(
344                        content_transfer_encoding,
345                        body,
346                        filename.as_str(),
347                        "".to_string(),
348                    )?;
349                }
350            }
351            "text/html" | "text/x-amp-html" => {
352                if filename.is_empty() {
353                    let res = content_transfer_encoding.decode(body.as_bytes().to_vec())?;
354                    self.body_html = code_to_utf8(self.charset.as_str(), res.clone());
355                } else {
356                    self.set_files(
357                        content_transfer_encoding,
358                        body,
359                        filename.as_str(),
360                        "".to_string(),
361                    )?;
362                }
363            }
364            "multipart/mixed" | "multipart/alternative" | "multipart/related" => {
365                let data = self
366                    .content_transfer_encoding
367                    .decode(body.as_bytes().to_vec())?;
368                let mut parts = code_to_utf8(self.charset.as_str(), data.clone());
369
370                parts = match parts.find(self.boundary.as_str()) {
371                    None => parts,
372                    Some(e) => parts[e..].to_string(),
373                };
374
375                let mut parts_list = vec![];
376                let mut text = String::new();
377                for item in parts.lines() {
378                    if item.contains(boundary) && text.is_empty() {
379                        continue;
380                    }
381                    if item.contains(boundary) && !text.is_empty() {
382                        parts_list.push(text);
383                        text = String::new();
384                        continue;
385                    }
386                    text = format!("{text}{item}\r\n");
387                }
388                for part in parts_list {
389                    if part.trim().is_empty() {
390                        continue;
391                    }
392                    self.parts(part.to_string(), old_data.clone())?;
393                }
394            }
395            "text/calendar" => {}
396            "application/octet-stream"
397            | "application/zip"
398            | "application/pdf"
399            | "image/jpeg"
400            | "application/ics"
401            | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
402                if !filename.is_empty() {
403                    self.set_files(
404                        content_transfer_encoding,
405                        body,
406                        filename.as_str(),
407                        content_type.to_string(),
408                    )?;
409                }
410            }
411            _ => {
412                if self.debug {
413                    fs::write(
414                        format!(
415                            "{}/content_type-{}.eml",
416                            env::current_dir().unwrap().to_str().unwrap(),
417                            self.md5
418                        ),
419                        old_data.clone(),
420                    )?;
421                }
422                return Err(Error::new(
423                    ErrorKind::NotFound,
424                    format!("未知 parts content_type 类型: {content_type}"),
425                ));
426            }
427        }
428        Ok(())
429    }
430    pub fn from(&mut self, value: &str) -> HashMap<String, String> {
431        let mut r = value
432            .split("<")
433            .filter(|x| !x.trim().is_empty())
434            .map(|x| x.trim())
435            .collect::<Vec<&str>>();
436        if r[0].starts_with("\"") && r[0].ends_with("\"") {
437            r[0] = r[0].trim_start_matches("\"").trim_end_matches("\"").trim();
438        }
439        let mut emails = HashMap::new();
440        if r.len() == 1 {
441            let name = r[0].trim_end_matches(">").to_string();
442            emails.insert(name.clone(), name);
443        } else {
444            let name = self.encoded(r[0].trim());
445            let email = r[1].trim_end_matches(">").to_string();
446            emails.insert(email, name);
447        }
448        emails
449    }
450    fn subject(&mut self, value: String) -> String {
451        let value = value.replace("?==?", "?=\r\n\t=?");
452        if !value.contains("=?") && !value.contains("?=") {
453            return value.to_string();
454        }
455        let list = value.split("\r\n\t").collect::<Vec<&str>>();
456        let mut txt = vec![];
457        for item in list {
458            txt.push(self.encoded(item));
459        }
460        txt.join("")
461    }
462
463    fn encoded(&mut self, value: &str) -> String {
464        let t = value.trim_start_matches("\"").trim_end_matches("\"");
465        if t.contains("=?") && t.contains("?=") {
466            let l = t.split(" ").collect::<Vec<&str>>();
467            let mut txt = vec![];
468            for item in l {
469                txt.push(self.encoded_line(item));
470            }
471            txt.join("")
472        } else {
473            t.to_string()
474        }
475    }
476    /// 段落解码
477    fn encoded_line(&mut self, value: &str) -> String {
478        let line = value.split("?").collect::<Vec<&str>>();
479        if line.len() == 1 {
480            return value.to_string();
481        }
482        let charset = line[1].to_lowercase().to_string().leak();
483        let code = line[2].to_uppercase();
484        let data = line[3];
485
486        let strs = match code.as_str() {
487            "B" => br_crypto::base64::decode_u8(data),
488            "Q" => br_crypto::qp::decode(data).unwrap_or(vec![]),
489            _ => data.as_bytes().to_vec(),
490        };
491        let text = code_to_utf8(charset, strs.clone());
492        text.chars().filter(|&x| x != '\u{200b}').collect()
493    }
494
495    /// 时间处理
496    fn datetime(&mut self, value: &str) -> io::Result<()> {
497        let re = Regex::new(r"\s*\(.*\)$").unwrap();
498        let datetime = re.replace(value, "").to_string();
499        let datetime = datetime.replace("GMT", "+0000").to_string();
500        let datetime = match datetime.find(",") {
501            None => datetime,
502            Some(i) => datetime[i + 1..].parse().unwrap(),
503        };
504        let datetime = match DateTime::parse_from_str(datetime.as_str(), "%d %b %Y %H:%M:%S %z") {
505            Ok(e) => e,
506            Err(e) => return Err(Error::other(format!("时间解析失败: {e} [{datetime:?}]"))),
507        };
508        self.timestamp = datetime.timestamp();
509        self.datetime = Local
510            .timestamp_opt(self.timestamp, 0)
511            .unwrap()
512            .with_timezone(&Local)
513            .format("%Y-%m-%d %H:%M:%S")
514            .to_string();
515        Ok(())
516    }
517    pub fn email_encoded(&mut self, value: &str) -> HashMap<String, String> {
518        let list = value.split(",").map(|x| x.trim()).collect::<Vec<&str>>();
519        let mut emails = HashMap::new();
520        for item in list {
521            let mut r = item.split(" <").collect::<Vec<&str>>();
522            if r[0].starts_with("\"") && r[0].ends_with("\"") {
523                r[0] = r[0].trim_start_matches("\"").trim_end_matches("\"");
524            }
525            if r.len() == 1 {
526                let name = r[0]
527                    .trim_start_matches("<")
528                    .trim_end_matches(">")
529                    .to_string();
530                emails.insert(name.clone(), name);
531            } else {
532                let name = self.encoded(r[0].trim());
533                let email = r[1].trim_end_matches(">").to_string();
534                emails.insert(email, name);
535            }
536        }
537        emails
538    }
539    fn set_files(
540        &mut self,
541        mut content_transfer_encoding: ContentTransferEncoding,
542        body: &str,
543        filename: &str,
544        mut content_type: String,
545    ) -> io::Result<()> {
546        let mut data = "";
547        if let ContentTransferEncoding::Base64 = content_transfer_encoding {
548            let mut text = "".to_string();
549            for line in body.lines() {
550                text += line;
551            }
552            data = text.leak();
553        }
554
555        let body = content_transfer_encoding.decode(data.as_bytes().to_vec())?;
556        let md5 = br_crypto::md5::encrypt_hex(&body.clone());
557        let size = body.len();
558        let mut temp_dir = env::temp_dir();
559        temp_dir.push(filename);
560        let path_temp_dir = temp_dir.clone();
561
562        let mut temp_file = match fs::File::create(temp_dir.clone()) {
563            Ok(e) => e,
564            Err(e) => {
565                return Err(Error::other(format!(
566                    "打开(创建)临时文件: {e} [{filename}]"
567                )))
568            }
569        };
570
571        if temp_file.write(body.as_slice()).is_ok() {
572            if content_type.is_empty() {
573                content_type = path_temp_dir
574                    .extension()
575                    .unwrap_or(OsStr::new("unknown"))
576                    .to_str()
577                    .unwrap_or("unknown")
578                    .to_string();
579            }
580
581            self.files[md5.as_str()] = object! {
582                name:filename,
583                md5:md5.clone(),
584                size:size,
585                "content-type":content_type.clone(),
586                file:temp_dir.to_str()
587            };
588        };
589        Ok(())
590    }
591}
592
593impl Default for AnalyzeEmails {
594    fn default() -> Self {
595        Self {
596            debug: false,
597            header: Default::default(),
598            mime_version: "".to_string(),
599            boundary: "".to_string(),
600            md5: "".to_string(),
601            size: 0,
602            timestamp: 0,
603            datetime: "".to_string(),
604            subject: "".to_string(),
605            from: Default::default(),
606            to: Default::default(),
607            cc: Default::default(),
608            replyto: Default::default(),
609            content_type: "".to_string(),
610            content_transfer_encoding: ContentTransferEncoding::None,
611            sender: "".to_string(),
612            body_text: "".to_string(),
613            body_html: "".to_string(),
614            files: JsonValue::Null,
615            charset: "".to_string(),
616        }
617    }
618}
619
/// Transfer encodings.
/// Guidelines for choosing a Content-Transfer-Encoding:
///
/// Plain text: content that is plain text and contains only ASCII characters usually uses 7bit.
/// Non-ASCII text: content with non-ASCII characters can use quoted-printable or 8bit, depending on the content and compatibility requirements.
/// Binary data: images, video, audio and other binary data usually use base64.
#[derive(Debug)]
pub enum ContentTransferEncoding {
    /// Mainly used for text. Most of the text stays readable, but non-ASCII and special characters (such as =, ?, &) are encoded for compatibility.
    /// Suited to mail content with many special characters or non-ASCII text.
    QuotedPrintable,
    /// Encodes binary data as an ASCII string using a 64-character alphabet (A-Z, a-z, 0-9, +, /). Every 3 bytes of binary data become 4 characters, which makes them easy to transport in mail.
    /// Commonly used for attachments, images, audio, video and other binary data.
    Base64,
    /// The content is binary data that must not be escaped or encoded and has to be transported in its original binary form. Typically used for images, audio and other binary files.
    /// Requires mail transfer agents to handle every possible byte value with almost no conversion, so not all systems support it.
    Binary,
    /// The content contains 8-bit characters, so it may include non-ASCII characters (such as accented letters). Such mail can carry more characters, but not every mail transfer agent supports 8bit transport.
    /// Suited to non-ASCII text, provided the whole transport path supports 8bit data.
    Bit8,
    /// The content is ASCII text made only of 7-bit characters (the standard ASCII set), i.e. the top bit of every character is 0. This is the most common encoding because it suits the vast majority of mail systems.
    /// Suited to plain-text mail without special characters or binary data.
    Bit7,
    None,
}
645
646impl ContentTransferEncoding {
647    fn from(value: &str) -> Self {
648        match value.to_lowercase().as_str() {
649            "7bit" => Self::Bit7,
650            "8bit" => Self::Bit8,
651            "binary" => Self::Binary,
652            "base64" => Self::Base64,
653            "quoted-printable" => Self::QuotedPrintable,
654            _ => Self::None,
655        }
656    }
657    fn decode(&mut self, mut data: Vec<u8>) -> io::Result<Vec<u8>> {
658        let res = match self {
659            ContentTransferEncoding::QuotedPrintable => br_crypto::qp::decode(data)?,
660            ContentTransferEncoding::Base64 => {
661                let str = unsafe { String::from_utf8_unchecked(data) };
662                let mut text = "".to_string();
663                for line in str.lines() {
664                    text += line;
665                }
666                data = text.leak().as_bytes().to_vec();
667                br_crypto::base64::decode_u8(data)
668            }
669            ContentTransferEncoding::Binary => data,
670            ContentTransferEncoding::Bit8 => data,
671            ContentTransferEncoding::Bit7 => data,
672            ContentTransferEncoding::None => data,
673        };
674        Ok(res)
675    }
676}