br_email/
analyze.rs

1use br_crypto::encoding::code_to_utf8;
2use br_crypto::hash::u8_to_md5;
3use chrono::{DateTime, Local, TimeZone};
4use json::{object, JsonValue};
5use regex::Regex;
6use std::collections::HashMap;
7use std::ffi::OsStr;
8use std::io::{Error, ErrorKind, Write};
9use std::{env, fs, io};
10
/// Result of parsing a raw email message: selected headers, decoded bodies and attachments.
#[derive(Debug)]
pub struct AnalyzeEmails {
    /// When `true`, input that cannot be parsed is dumped to an `.eml` file in the
    /// current working directory before the error is returned.
    pub debug: bool,
    /// Header fields that have no dedicated field below, keyed by lower-cased name.
    pub header: HashMap<String, String>,
    /// Value of the `MIME-Version` header.
    pub mime_version: String,
    /// Multipart boundary taken from the top-level `Content-Type` header.
    boundary: String,
    /// Digest of the raw message bytes as returned by `u8_to_md5`.
    pub md5: String,
    /// Length in bytes of the raw message as passed to `new`.
    pub size: usize,
    /// Timestamp of the `Date` header (seconds, from `DateTime::timestamp`).
    pub timestamp: i64,
    /// Local time of the `Date` header, formatted as `%Y-%m-%d %H:%M:%S`.
    pub datetime: String,
    /// Subject
    pub subject: String,
    /// Sender (`From`), stored as address -> display name.
    pub from: HashMap<String, String>,
    /// Recipients (`To`), stored as address -> display name.
    pub to: HashMap<String, String>,
    /// Carbon-copy recipients (`Cc`), stored as address -> display name.
    pub cc: HashMap<String, String>,
    /// Address(es) the recipient should use when replying (`Reply-To`).
    pub replyto: HashMap<String, String>,
    /// Content type (lower-cased media type of the top-level `Content-Type`).
    pub content_type: String,
    /// Transfer encoding declared by the top-level `Content-Transfer-Encoding`.
    pub content_transfer_encoding: ContentTransferEncoding,
    /// Actual sender (raw value of the `Sender` header).
    pub sender: String,
    /// Decoded `text/plain` body.
    pub body_text: String,
    /// Decoded `text/html` body.
    pub body_html: String,
    /// Attachments as a JSON object keyed by the attachment's md5.
    pub files: JsonValue,
    /// Charset from the top-level `Content-Type`; `new` starts it at `utf-8`.
    pub charset: String,
}
45
46impl AnalyzeEmails {
47    pub fn new(mut data: Vec<u8>, debug: bool) -> io::Result<AnalyzeEmails> {
48        let md5 = u8_to_md5(data.clone()).leak().to_string();
49        let size = data.len();
50        let data_string = unsafe { String::from_utf8_unchecked(data.clone()) };
51        if data_string.contains("\n\n") {
52            let updated_string = data_string.replace("\n", "\r\n");
53            data = updated_string.as_bytes().to_vec();
54        }
55
56        let subsequence = "\r\n\r\n".as_bytes();
57
58        let (header, body) = match data
59            .windows(subsequence.len())
60            .position(|window| window == subsequence)
61        {
62            None => {
63                if debug {
64                    fs::write(
65                        format!(
66                            "{}/xygs-{}.eml",
67                            env::current_dir().unwrap().to_str().unwrap(),
68                            md5
69                        ),
70                        data.clone(),
71                    )?;
72                }
73                return Err(Error::new(ErrorKind::Other, format!("协议格式错误: {md5}")));
74            }
75            Some(e) => (data[..e].to_vec(), data[e + 4..].to_vec()),
76        };
77        let mut that = Self {
78            debug,
79            header: Default::default(),
80            mime_version: "".to_string(),
81            boundary: "".to_string(),
82            md5,
83            size,
84            timestamp: 0,
85            subject: "".to_string(),
86            from: Default::default(),
87            to: Default::default(),
88            cc: Default::default(),
89            replyto: Default::default(),
90            datetime: "".to_string(),
91            content_type: "".to_string(),
92            content_transfer_encoding: ContentTransferEncoding::Bit7,
93            sender: "".to_string(),
94            body_text: "".to_string(),
95            body_html: "".to_string(),
96            files: object! {},
97            charset: "utf-8".to_string(),
98        };
99        that.header(header)?;
100        that.body(body, data_string)?;
101        Ok(that)
102    }
103
104    fn header(&mut self, data: Vec<u8>) -> io::Result<()> {
105        let data = unsafe { String::from_utf8_unchecked(data) };
106        let data = data.replace("\r\n\t", "").replace("\r\n ", " ").leak();
107        for item in data.lines() {
108            let (key, value) = match item.find(": ") {
109                Some(e) => (&item[..e], &item[e + 2..]),
110                None => match item.find(":") {
111                    Some(e) => (&item[..e], &item[e + 1..]),
112                    None => continue,
113                },
114            };
115            let name = key.to_lowercase().leak();
116            if value.is_empty() {
117                continue;
118            }
119            match key.to_lowercase().as_str() {
120                "mime-version" => self.mime_version = value.to_string(),
121                "from" => {
122                    self.from = self.from(value);
123                }
124                "sender" => {
125                    self.sender = value.to_string();
126                }
127                "to" => {
128                    self.to = self.email_encoded(value);
129                }
130                "cc" => {
131                    self.cc = self.email_encoded(value);
132                }
133                "reply-to" => {
134                    self.replyto = self.email_encoded(value);
135                }
136                "subject" => {
137                    self.subject = self.subject(value.to_string());
138                }
139                "content-type" => {
140                    let types = value.split(";").collect::<Vec<&str>>();
141                    self.content_type = types[0].trim().to_lowercase().to_string();
142                    match self.content_type.as_str() {
143                        "multipart/mixed"
144                        | "multipart/alternative"
145                        | "multipart/related"
146                        | "multipart/report" => match types[1].find("boundary=") {
147                            None => {}
148                            Some(e) => {
149                                let boundary = &types[1][e..];
150                                self.boundary = boundary
151                                    .trim()
152                                    .trim_start_matches("boundary=")
153                                    .trim_start_matches("\"")
154                                    .trim_end_matches("\"")
155                                    .to_string();
156                            }
157                        },
158                        _ => {}
159                    }
160                    if types.len() > 1 {
161                        for item in types.iter() {
162                            if item.contains("charset=") {
163                                self.charset = item
164                                    .trim_start_matches("charset=")
165                                    .trim_start_matches("\"")
166                                    .trim_end_matches("\"")
167                                    .to_string();
168                            }
169                        }
170                    }
171                }
172                "content-transfer-encoding" => {
173                    self.content_transfer_encoding = ContentTransferEncoding::from(value);
174                }
175                "date" => self.datetime(value)?,
176                _ => {
177                    self.header
178                        .insert(name.trim().to_string(), value.to_string());
179                }
180            }
181        }
182        Ok(())
183    }
184    fn body(&mut self, data: Vec<u8>, old_data: String) -> io::Result<()> {
185        match self.content_type.to_lowercase().as_str() {
186            "text/html" => {
187                let data = self.content_transfer_encoding.decode(data)?;
188                let res = code_to_utf8(self.charset.as_str(), data.clone());
189                self.body_html = res;
190            }
191            "text/plain" => {
192                let data = self.content_transfer_encoding.decode(data)?;
193                let res = code_to_utf8(self.charset.as_str(), data.clone());
194                self.body_text = res;
195            }
196            "multipart/mixed"
197            | "multipart/alternative"
198            | "multipart/related"
199            | "multipart/report" => {
200                let data = self.content_transfer_encoding.decode(data.clone())?;
201                let mut parts = code_to_utf8(self.charset.as_str(), data.clone());
202                let mut parts_list = vec![];
203                let mut text = String::new();
204
205                parts = match parts.find(self.boundary.as_str()) {
206                    None => parts,
207                    Some(e) => parts[e..].to_string(),
208                };
209                for item in parts.lines() {
210                    if item.contains(self.boundary.as_str()) && text.is_empty() {
211                        continue;
212                    }
213                    if item.contains(self.boundary.as_str()) && text.clone() != "" {
214                        parts_list.push(text.clone());
215                        text = String::new();
216                        continue;
217                    }
218                    text = format!("{}{}\r\n", text, item);
219                }
220                for part in parts_list {
221                    if part.trim().is_empty() {
222                        continue;
223                    }
224                    self.parts(part.to_string(), old_data.clone())?;
225                }
226            }
227            _ => {
228                return Err(Error::new(
229                    ErrorKind::NotFound,
230                    format!("未知body类型: {}", self.content_type),
231                ));
232            }
233        }
234        Ok(())
235    }
236    /// 部分内容处理
237    fn parts(&mut self, data: String, old_data: String) -> io::Result<()> {
238        let (header, body) = match data.find("\r\n\r\n") {
239            None => {
240                if self.debug {
241                    fs::write(
242                        format!(
243                            "{}/head-{}.eml",
244                            env::current_dir().unwrap().to_str().unwrap(),
245                            self.md5
246                        ),
247                        old_data.clone(),
248                    )?;
249                }
250                return Err(Error::new(ErrorKind::Other, "解析附件头失败"));
251            }
252            Some(e) => (
253                &data[..e]
254                    .replace("\r\n\t", " ")
255                    .replace("\r\n ", " ")
256                    .leak()
257                    .lines(),
258                &data[e + 4..],
259            ),
260        };
261
262        let mut filename = "".to_string();
263        let mut content_type = "";
264        let mut boundary = "";
265        let mut content_transfer_encoding = ContentTransferEncoding::None;
266        for item in header.clone() {
267            let (key, value) = match item.find(": ") {
268                Some(e) => (&item[..e], &item[e + 2..]),
269                None => match item.find(":") {
270                    Some(e) => (&item[..e], &item[e + 1..]),
271                    None => continue,
272                },
273            };
274
275            let name = key.to_lowercase();
276
277            match name.trim() {
278                "content-transfer-encoding" => {
279                    content_transfer_encoding = ContentTransferEncoding::from(value)
280                }
281                "content-type" => {
282                    let types = value.trim().split(";").collect::<Vec<&str>>();
283                    content_type = types[0].trim();
284                    let name = types
285                        .iter()
286                        .filter(|&x| x.trim().starts_with("name="))
287                        .map(|&x| x.trim().to_string())
288                        .collect::<Vec<String>>();
289                    if !name.is_empty() {
290                        let name = name[0].trim_start_matches("name=");
291                        filename = self.encoded(name);
292                    }
293                    match value.find("boundary=") {
294                        None => {}
295                        Some(i) => {
296                            boundary = &value[i + 9..];
297                            boundary = match boundary.find(";") {
298                                None => boundary,
299                                Some(i) => &boundary[..i],
300                            };
301                            boundary = boundary.trim_start_matches("\"").trim_end_matches("\"");
302                        }
303                    }
304                }
305                "content-id"
306                | "content-length"
307                | "mime-version"
308                | "content-description"
309                | "date"
310                | "x-attachment-id" => {}
311                "content-disposition" => {
312                    if filename.is_empty() && value.contains("filename=") {
313                        filename = value.split("filename=").collect::<Vec<&str>>()[1]
314                            .trim_start_matches("\"")
315                            .trim_end_matches("\"")
316                            .to_string();
317                    }
318                    if filename.is_empty() && value.contains("filename*=utf-8''") {
319                        filename = value.split("filename*=utf-8''").collect::<Vec<&str>>()[1]
320                            .trim_start_matches("\"")
321                            .trim_end_matches("\"")
322                            .to_string();
323                        filename = br_crypto::encoding::urlencoding_decode(filename.as_str());
324                    }
325                }
326                _ => {
327                    return Err(Error::new(
328                        ErrorKind::NotFound,
329                        format!("parts 未知 header 类型: {} [{}]", name, item),
330                    ));
331                }
332            }
333        }
334
335        match content_type {
336            "text/plain" => {
337                if filename.is_empty() {
338                    let res = content_transfer_encoding.decode(body.as_bytes().to_vec())?;
339                    let text = code_to_utf8(self.charset.as_str(), res.clone());
340                    self.body_text = text;
341                } else {
342                    self.set_files(content_transfer_encoding, body, filename.as_str(), "".to_string())?;
343                }
344            }
345            "text/html" | "text/x-amp-html" => {
346                if filename.is_empty() {
347                    let res = content_transfer_encoding.decode(body.as_bytes().to_vec())?;
348                    self.body_html = code_to_utf8(self.charset.as_str(), res.clone());
349                } else {
350                    self.set_files(content_transfer_encoding, body, filename.as_str(), "".to_string())?;
351                }
352            }
353            "multipart/mixed" | "multipart/alternative" | "multipart/related" => {
354                let data = self
355                    .content_transfer_encoding
356                    .decode(body.as_bytes().to_vec())?;
357                let mut parts = code_to_utf8(self.charset.as_str(), data.clone());
358
359                parts = match parts.find(self.boundary.as_str()) {
360                    None => parts,
361                    Some(e) => parts[e..].to_string(),
362                };
363
364                let mut parts_list = vec![];
365                let mut text = String::new();
366                for item in parts.lines() {
367                    if item.contains(boundary) && text.is_empty() {
368                        continue;
369                    }
370                    if item.contains(boundary) && !text.is_empty() {
371                        parts_list.push(text);
372                        text = String::new();
373                        continue;
374                    }
375                    text = format!("{}{}\r\n", text, item);
376                }
377                for part in parts_list {
378                    if part.trim().is_empty() {
379                        continue;
380                    }
381                    self.parts(part.to_string(), old_data.clone())?;
382                }
383            }
384            "text/calendar" => {}
385            "application/octet-stream"
386            | "application/zip"
387            | "application/pdf"
388            | "image/jpeg"
389            | "application/ics"
390            | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
391                if !filename.is_empty() {
392                    self.set_files(
393                        content_transfer_encoding,
394                        body,
395                        filename.as_str(),
396                        content_type.to_string(),
397                    )?;
398                }
399            }
400            _ => {
401                if self.debug {
402                    fs::write(
403                        format!(
404                            "{}/content_type-{}.eml",
405                            env::current_dir().unwrap().to_str().unwrap(),
406                            self.md5
407                        ),
408                        old_data.clone(),
409                    )?;
410                }
411                return Err(Error::new(
412                    ErrorKind::NotFound,
413                    format!("未知 parts content_type 类型: {}", content_type),
414                ));
415            }
416        }
417        Ok(())
418    }
419    pub fn from(&mut self, value: &str) -> HashMap<String, String> {
420        let mut r = value
421            .split("<")
422            .filter(|x| !x.trim().is_empty())
423            .map(|x| x.trim())
424            .collect::<Vec<&str>>();
425        if r[0].starts_with("\"") && r[0].ends_with("\"") {
426            r[0] = r[0].trim_start_matches("\"").trim_end_matches("\"").trim();
427        }
428        let mut emails = HashMap::new();
429        if r.len() == 1 {
430            let name = r[0].trim_end_matches(">").to_string();
431            emails.insert(name.clone(), name);
432        } else {
433            let name = self.encoded(r[0].trim());
434            let email = r[1].trim_end_matches(">").to_string();
435            emails.insert(email, name);
436        }
437        emails
438    }
439    fn subject(&mut self, value: String) -> String {
440        let value = value.replace("?==?", "?=\r\n\t=?");
441        if !value.contains("=?") && !value.contains("?=") {
442            return value.to_string();
443        }
444        let list = value.split("\r\n\t").collect::<Vec<&str>>();
445        let mut txt = vec![];
446        for item in list {
447            txt.push(self.encoded(item));
448        }
449        txt.join("")
450    }
451
452    fn encoded(&mut self, value: &str) -> String {
453        let t = value.trim_start_matches("\"").trim_end_matches("\"");
454        if t.contains("=?") && t.contains("?=") {
455            let l = t.split(" ").collect::<Vec<&str>>();
456            let mut txt = vec![];
457            for item in l {
458                txt.push(self.encoded_line(item));
459            }
460            txt.join("")
461        } else {
462            t.to_string()
463        }
464    }
465    /// 段落解码
466    fn encoded_line(&mut self, value: &str) -> String {
467        let line = value.split("?").collect::<Vec<&str>>();
468        if line.len() == 1 {
469            return value.to_string();
470        }
471        let charset = line[1].to_lowercase().to_string().leak();
472        let code = line[2].to_uppercase();
473        let data = line[3];
474
475        let strs = match code.as_str() {
476            "B" => br_crypto::base64::decode_u8(data),
477            "Q" => br_crypto::qp::decode(data).unwrap_or(vec![]),
478            _ => data.as_bytes().to_vec(),
479        };
480        let text = code_to_utf8(charset, strs.clone());
481        text.chars().filter(|&x| x != '\u{200b}').collect()
482    }
483
484    /// 时间处理
485    fn datetime(&mut self, value: &str) -> io::Result<()> {
486        let re = Regex::new(r"\s*\(.*\)$").unwrap();
487        let datetime = re.replace(value, "").to_string();
488        let datetime = datetime.replace("GMT", "+0000").to_string();
489        let datetime = match datetime.find(",") {
490            None => datetime,
491            Some(i) => datetime[i + 1..].parse().unwrap(),
492        };
493        let datetime = match DateTime::parse_from_str(datetime.as_str(), "%d %b %Y %H:%M:%S %z") {
494            Ok(e) => e,
495            Err(e) => {
496                return Err(Error::new(
497                    ErrorKind::Other,
498                    format!("时间解析失败: {e} [{datetime:?}]"),
499                ))
500            }
501        };
502        self.timestamp = datetime.timestamp();
503        self.datetime = Local
504            .timestamp_opt(self.timestamp, 0)
505            .unwrap()
506            .with_timezone(&Local)
507            .format("%Y-%m-%d %H:%M:%S")
508            .to_string();
509        Ok(())
510    }
511    pub fn email_encoded(&mut self, value: &str) -> HashMap<String, String> {
512        let list = value.split(",").map(|x| x.trim()).collect::<Vec<&str>>();
513        let mut emails = HashMap::new();
514        for item in list {
515            let mut r = item.split(" <").collect::<Vec<&str>>();
516            if r[0].starts_with("\"") && r[0].ends_with("\"") {
517                r[0] = r[0].trim_start_matches("\"").trim_end_matches("\"");
518            }
519            if r.len() == 1 {
520                let name = r[0]
521                    .trim_start_matches("<")
522                    .trim_end_matches(">")
523                    .to_string();
524                emails.insert(name.clone(), name);
525            } else {
526                let name = self.encoded(r[0].trim());
527                let email = r[1].trim_end_matches(">").to_string();
528                emails.insert(email, name);
529            }
530        }
531        emails
532    }
533    fn set_files(
534        &mut self,
535        mut content_transfer_encoding: ContentTransferEncoding,
536        body: &str,
537        filename: &str,
538        mut content_type: String,
539    ) -> io::Result<()> {
540        let mut data = "";
541        if let ContentTransferEncoding::Base64 = content_transfer_encoding {
542            let mut text = "".to_string();
543            for line in body.lines() {
544                text += line;
545            }
546            data = text.leak();
547        }
548
549        let body = content_transfer_encoding.decode(data.as_bytes().to_vec())?;
550        let md5 = u8_to_md5(body.clone());
551        let size = body.len();
552        let mut temp_dir = env::temp_dir();
553        temp_dir.push(filename);
554        let path_temp_dir=temp_dir.clone();
555
556        let mut temp_file = match fs::File::create(temp_dir.clone()) {
557            Ok(e) => e,
558            Err(e) => {
559                return Err(Error::new(
560                    ErrorKind::Other,
561                    format!("打开(创建)临时文件: {} [{}]", e, filename),
562                ))
563            }
564        };
565
566        if temp_file.write(body.as_slice()).is_ok() {
567            if content_type.is_empty() {
568                content_type = path_temp_dir.extension()
569                    .unwrap_or(OsStr::new("unknown"))
570                    .to_str()
571                    .unwrap_or("unknown").to_string();
572            }
573
574            self.files[md5.as_str()] = object! {
575                name:filename,
576                md5:md5.clone(),
577                size:size,
578                "content-type":content_type.clone(),
579                file:temp_dir.to_str()
580            };
581        };
582        Ok(())
583    }
584}
585
586impl Default for AnalyzeEmails {
587    fn default() -> Self {
588        Self {
589            debug: false,
590            header: Default::default(),
591            mime_version: "".to_string(),
592            boundary: "".to_string(),
593            md5: "".to_string(),
594            size: 0,
595            timestamp: 0,
596            datetime: "".to_string(),
597            subject: "".to_string(),
598            from: Default::default(),
599            to: Default::default(),
600            cc: Default::default(),
601            replyto: Default::default(),
602            content_type: "".to_string(),
603            content_transfer_encoding: ContentTransferEncoding::None,
604            sender: "".to_string(),
605            body_text: "".to_string(),
606            body_html: "".to_string(),
607            files: JsonValue::Null,
608            charset: "".to_string(),
609        }
610    }
611}
612
/// Transfer encoding.
/// Guidelines for choosing a Content-Transfer-Encoding:
///
/// Plain text: if the content is plain text containing only ASCII characters, 7bit is normally used.
/// Non-ASCII text: if the content contains non-ASCII characters, quoted-printable or 8bit can be used, depending on the content and on compatibility requirements.
/// Binary data: for images, video, audio and other binary data, base64 is normally used.
#[derive(Debug)]
pub enum ContentTransferEncoding {
    /// Mainly used for text data. It keeps most of the text readable but encodes non-ASCII characters and special characters (such as =, ?, &) to ensure compatibility.
    /// Suited to mail content with many special characters or non-ASCII text.
    QuotedPrintable,
    /// Encodes binary data as an ASCII string using a 64-character alphabet (A-Z, a-z, 0-9, +, /). Every 3 bytes of binary data become 4 characters, which makes the data easy to transport in mail.
    /// Commonly used for attachments, images, audio, video and other binary data.
    Base64,
    /// The content is binary data that must not be escaped or encoded and has to be transported in its original binary form. Typically used for images, audio and other binary files.
    /// It requires mail transfer agents to handle every possible byte value with almost no conversion, so not every system supports it.
    Binary,
    /// The content contains 8-bit characters, meaning it may include non-ASCII characters (such as accented letters). Mail encoded this way can carry more characters, but not all mail transfer agents support 8bit transport.
    /// Suited to non-ASCII text, provided the transport path supports 8bit data.
    Bit8,
    /// The content is ASCII text containing only 7-bit characters (the standard ASCII set), with the top bit of every character 0. This is the most common encoding because it suits the vast majority of mail transport systems.
    /// Suited to plain-text mail without special characters or binary data.
    Bit7,
    /// No recognised transfer encoding; `decode` returns the data unchanged.
    None,
}
638
639impl ContentTransferEncoding {
640    fn from(value: &str) -> Self {
641        match value.to_lowercase().as_str() {
642            "7bit" => Self::Bit7,
643            "8bit" => Self::Bit8,
644            "binary" => Self::Binary,
645            "base64" => Self::Base64,
646            "quoted-printable" => Self::QuotedPrintable,
647            _ => Self::None,
648        }
649    }
650    fn decode(&mut self, mut data: Vec<u8>) -> io::Result<Vec<u8>> {
651        let res = match self {
652            ContentTransferEncoding::QuotedPrintable => br_crypto::qp::decode(data)?,
653            ContentTransferEncoding::Base64 => {
654                let str = unsafe { String::from_utf8_unchecked(data) };
655                let mut text = "".to_string();
656                for line in str.lines() {
657                    text += line;
658                }
659                data = text.leak().as_bytes().to_vec();
660                br_crypto::base64::decode_u8(data)
661            }
662            ContentTransferEncoding::Binary => data,
663            ContentTransferEncoding::Bit8 => data,
664            ContentTransferEncoding::Bit7 => data,
665            ContentTransferEncoding::None => data,
666        };
667        Ok(res)
668    }
669}