br_email/
analyze.rs

1use br_crypto::encoding::code_to_utf8;
2use chrono::{DateTime, Local, TimeZone};
3use json::{object, JsonValue};
4use regex::Regex;
5use std::collections::HashMap;
6use std::ffi::OsStr;
7use std::io::{Error, ErrorKind, Write};
8use std::{env, fs, io};
9
/// Parsed representation of a single e-mail message.
#[derive(Debug)]
pub struct AnalyzeEmails {
    /// When `true`, messages that fail to parse are dumped to `.eml` files
    /// in the current working directory.
    pub debug: bool,
    /// Headers without a dedicated field, keyed by lower-cased header name.
    pub header: HashMap<String, String>,
    /// Value of the `MIME-Version` header.
    pub mime_version: String,
    /// Multipart boundary taken from the top-level `Content-Type` header.
    boundary: String,
    /// Hex MD5 digest computed over the raw message bytes.
    pub md5: String,
    /// Length of the raw message in bytes.
    pub size: usize,
    /// Timestamp parsed from the `Date` header.
    pub timestamp: i64,
    /// Local time, formatted as `%Y-%m-%d %H:%M:%S`.
    pub datetime: String,
    /// Subject.
    pub subject: String,
    /// Sender (`From`), stored as address -> display name.
    pub from: HashMap<String, String>,
    /// Recipients (`To`), stored as address -> display name.
    pub to: HashMap<String, String>,
    /// Carbon-copy recipients (`Cc`), stored as address -> display name.
    pub cc: HashMap<String, String>,
    /// Address the recipient should use when replying (`Reply-To`).
    pub replyto: HashMap<String, String>,
    /// Content type.
    pub content_type: String,
    /// Transfer encoding.
    pub content_transfer_encoding: ContentTransferEncoding,
    /// Actual sender (raw value of the `Sender` header).
    pub sender: String,
    /// Decoded `text/plain` body.
    pub body_text: String,
    /// Decoded `text/html` body.
    pub body_html: String,
    /// Attachments, keyed by the MD5 of their decoded content.
    pub files: JsonValue,
    /// Charset from the top-level `Content-Type`; `new` starts it at `utf-8`.
    pub charset: String,
}
44
45impl AnalyzeEmails {
46    pub fn new(mut data: Vec<u8>, debug: bool) -> io::Result<AnalyzeEmails> {
47        let md5 = br_crypto::md5::encrypt_hex(&data.clone()).leak().to_string();
48        let size = data.len();
49        let data_string = unsafe { String::from_utf8_unchecked(data.clone()) };
50        if data_string.contains("\n\n") {
51            let updated_string = data_string.replace("\n", "\r\n");
52            data = updated_string.as_bytes().to_vec();
53        }
54
55        let subsequence = "\r\n\r\n".as_bytes();
56
57        let (header, body) = match data
58            .windows(subsequence.len())
59            .position(|window| window == subsequence)
60        {
61            None => {
62                if debug {
63                    fs::write(
64                        format!(
65                            "{}/xygs-{}.eml",
66                            env::current_dir().unwrap().to_str().unwrap(),
67                            md5
68                        ),
69                        data.clone(),
70                    )?;
71                }
72                return Err(Error::other(format!("协议格式错误: {md5}")));
73            }
74            Some(e) => (data[..e].to_vec(), data[e + 4..].to_vec()),
75        };
76        let mut that = Self {
77            debug,
78            header: Default::default(),
79            mime_version: "".to_string(),
80            boundary: "".to_string(),
81            md5,
82            size,
83            timestamp: 0,
84            subject: "".to_string(),
85            from: Default::default(),
86            to: Default::default(),
87            cc: Default::default(),
88            replyto: Default::default(),
89            datetime: "".to_string(),
90            content_type: "".to_string(),
91            content_transfer_encoding: ContentTransferEncoding::Bit7,
92            sender: "".to_string(),
93            body_text: "".to_string(),
94            body_html: "".to_string(),
95            files: object! {},
96            charset: "utf-8".to_string(),
97        };
98        that.header(header)?;
99        that.body(body, data_string)?;
100        Ok(that)
101    }
102
103    fn header(&mut self, data: Vec<u8>) -> io::Result<()> {
104        let data = unsafe { String::from_utf8_unchecked(data) };
105        let data = data.replace("\r\n\t", "").replace("\r\n ", " ").leak();
106        for item in data.lines() {
107            let (key, value) = match item.find(": ") {
108                Some(e) => (&item[..e], &item[e + 2..]),
109                None => match item.find(":") {
110                    Some(e) => (&item[..e], &item[e + 1..]),
111                    None => continue,
112                },
113            };
114            let name = key.to_lowercase().leak();
115            if value.is_empty() {
116                continue;
117            }
118            match key.to_lowercase().as_str() {
119                "mime-version" => self.mime_version = value.to_string(),
120                "from" => {
121                    self.from = self.from(value);
122                }
123                "sender" => {
124                    self.sender = value.to_string();
125                }
126                "to" => {
127                    self.to = self.email_encoded(value);
128                }
129                "cc" => {
130                    self.cc = self.email_encoded(value);
131                }
132                "reply-to" => {
133                    self.replyto = self.email_encoded(value);
134                }
135                "subject" => {
136                    self.subject = self.subject(value.to_string());
137                }
138                "content-type" => {
139                    let types = value.split(";").collect::<Vec<&str>>();
140                    self.content_type = types[0].trim().to_lowercase().to_string();
141                    match self.content_type.as_str() {
142                        "multipart/mixed"
143                        | "multipart/alternative"
144                        | "multipart/related"
145                        | "multipart/report" => match types[1].find("boundary=") {
146                            None => {}
147                            Some(e) => {
148                                let boundary = &types[1][e..];
149                                self.boundary = boundary
150                                    .trim()
151                                    .trim_start_matches("boundary=")
152                                    .trim_start_matches("\"")
153                                    .trim_end_matches("\"")
154                                    .to_string();
155                            }
156                        },
157                        _ => {}
158                    }
159                    if types.len() > 1 {
160                        for item in types.iter() {
161                            if item.contains("charset=") {
162                                self.charset = item
163                                    .trim_start_matches("charset=")
164                                    .trim_start_matches("\"")
165                                    .trim_end_matches("\"")
166                                    .to_string();
167                            }
168                        }
169                    }
170                }
171                "content-transfer-encoding" => {
172                    self.content_transfer_encoding = ContentTransferEncoding::from(value);
173                }
174                "date" => self.datetime(value)?,
175                _ => {
176                    self.header
177                        .insert(name.trim().to_string(), value.to_string());
178                }
179            }
180        }
181        Ok(())
182    }
183    fn body(&mut self, data: Vec<u8>, old_data: String) -> io::Result<()> {
184        match self.content_type.to_lowercase().as_str() {
185            "text/html" => {
186                let data = self.content_transfer_encoding.decode(data)?;
187                let res = code_to_utf8(self.charset.as_str(), data.clone());
188                self.body_html = res;
189            }
190            "text/plain" => {
191                let data = self.content_transfer_encoding.decode(data)?;
192                let res = code_to_utf8(self.charset.as_str(), data.clone());
193                self.body_text = res;
194            }
195            "multipart/mixed"
196            | "multipart/alternative"
197            | "multipart/related"
198            | "multipart/report" => {
199                let data = self.content_transfer_encoding.decode(data.clone())?;
200                let mut parts = code_to_utf8(self.charset.as_str(), data.clone());
201                let mut parts_list = vec![];
202                let mut text = String::new();
203
204                parts = match parts.find(self.boundary.as_str()) {
205                    None => parts,
206                    Some(e) => parts[e..].to_string(),
207                };
208                for item in parts.lines() {
209                    if item.contains(self.boundary.as_str()) && text.is_empty() {
210                        continue;
211                    }
212                    if item.contains(self.boundary.as_str()) && text.clone() != "" {
213                        parts_list.push(text.clone());
214                        text = String::new();
215                        continue;
216                    }
217                    text = format!("{text}{item}\r\n");
218                }
219                for part in parts_list {
220                    if part.trim().is_empty() {
221                        continue;
222                    }
223                    self.parts(part.to_string(), old_data.clone())?;
224                }
225            }
226            _ => {
227                return Err(Error::new(
228                    ErrorKind::NotFound,
229                    format!("未知body类型: {}", self.content_type),
230                ));
231            }
232        }
233        Ok(())
234    }
235    /// 部分内容处理
236    fn parts(&mut self, data: String, old_data: String) -> io::Result<()> {
237        let (header, body) = match data.find("\r\n\r\n") {
238            None => {
239                if self.debug {
240                    fs::write(
241                        format!(
242                            "{}/head-{}.eml",
243                            env::current_dir().unwrap().to_str().unwrap(),
244                            self.md5
245                        ),
246                        old_data.clone(),
247                    )?;
248                }
249                return Err(Error::other("解析附件头失败"));
250            }
251            Some(e) => (
252                &data[..e]
253                    .replace("\r\n\t", " ")
254                    .replace("\r\n ", " ")
255                    .leak()
256                    .lines(),
257                &data[e + 4..],
258            ),
259        };
260
261        let mut filename = "".to_string();
262        let mut content_type = "";
263        let mut boundary = "";
264        let mut content_transfer_encoding = ContentTransferEncoding::None;
265        for item in header.clone() {
266            let (key, value) = match item.find(": ") {
267                Some(e) => (&item[..e], &item[e + 2..]),
268                None => match item.find(":") {
269                    Some(e) => (&item[..e], &item[e + 1..]),
270                    None => continue,
271                },
272            };
273
274            let name = key.to_lowercase();
275
276            match name.trim() {
277                "content-transfer-encoding" => {
278                    content_transfer_encoding = ContentTransferEncoding::from(value)
279                }
280                "content-type" => {
281                    let types = value.trim().split(";").collect::<Vec<&str>>();
282                    content_type = types[0].trim();
283                    let name = types
284                        .iter()
285                        .filter(|&x| x.trim().starts_with("name="))
286                        .map(|&x| x.trim().to_string())
287                        .collect::<Vec<String>>();
288                    if !name.is_empty() {
289                        let name = name[0].trim_start_matches("name=");
290                        filename = self.encoded(name);
291                    }
292                    match value.find("boundary=") {
293                        None => {}
294                        Some(i) => {
295                            boundary = &value[i + 9..];
296                            boundary = match boundary.find(";") {
297                                None => boundary,
298                                Some(i) => &boundary[..i],
299                            };
300                            boundary = boundary.trim_start_matches("\"").trim_end_matches("\"");
301                        }
302                    }
303                }
304                "content-id"
305                | "content-length"
306                | "mime-version"
307                | "content-description"
308                | "date"
309                | "x-attachment-id" => {}
310                "content-disposition" => {
311                    if filename.is_empty() && value.contains("filename=") {
312                        filename = value.split("filename=").collect::<Vec<&str>>()[1]
313                            .trim_start_matches("\"")
314                            .trim_end_matches("\"")
315                            .to_string();
316                    }
317                    if filename.is_empty() && value.contains("filename*=utf-8''") {
318                        filename = value.split("filename*=utf-8''").collect::<Vec<&str>>()[1]
319                            .trim_start_matches("\"")
320                            .trim_end_matches("\"")
321                            .to_string();
322                        filename = br_crypto::encoding::urlencoding_decode(filename.as_str());
323                    }
324                }
325                _ => {
326                    return Err(Error::new(
327                        ErrorKind::NotFound,
328                        format!("parts 未知 header 类型: {name} [{item}]"),
329                    ));
330                }
331            }
332        }
333
334        match content_type {
335            "text/plain" => {
336                if filename.is_empty() {
337                    let res = content_transfer_encoding.decode(body.as_bytes().to_vec())?;
338                    let text = code_to_utf8(self.charset.as_str(), res.clone());
339                    self.body_text = text;
340                } else {
341                    self.set_files(content_transfer_encoding, body, filename.as_str(), "".to_string())?;
342                }
343            }
344            "text/html" | "text/x-amp-html" => {
345                if filename.is_empty() {
346                    let res = content_transfer_encoding.decode(body.as_bytes().to_vec())?;
347                    self.body_html = code_to_utf8(self.charset.as_str(), res.clone());
348                } else {
349                    self.set_files(content_transfer_encoding, body, filename.as_str(), "".to_string())?;
350                }
351            }
352            "multipart/mixed" | "multipart/alternative" | "multipart/related" => {
353                let data = self
354                    .content_transfer_encoding
355                    .decode(body.as_bytes().to_vec())?;
356                let mut parts = code_to_utf8(self.charset.as_str(), data.clone());
357
358                parts = match parts.find(self.boundary.as_str()) {
359                    None => parts,
360                    Some(e) => parts[e..].to_string(),
361                };
362
363                let mut parts_list = vec![];
364                let mut text = String::new();
365                for item in parts.lines() {
366                    if item.contains(boundary) && text.is_empty() {
367                        continue;
368                    }
369                    if item.contains(boundary) && !text.is_empty() {
370                        parts_list.push(text);
371                        text = String::new();
372                        continue;
373                    }
374                    text = format!("{text}{item}\r\n");
375                }
376                for part in parts_list {
377                    if part.trim().is_empty() {
378                        continue;
379                    }
380                    self.parts(part.to_string(), old_data.clone())?;
381                }
382            }
383            "text/calendar" => {}
384            "application/octet-stream"
385            | "application/zip"
386            | "application/pdf"
387            | "image/jpeg"
388            | "application/ics"
389            | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
390                if !filename.is_empty() {
391                    self.set_files(
392                        content_transfer_encoding,
393                        body,
394                        filename.as_str(),
395                        content_type.to_string(),
396                    )?;
397                }
398            }
399            _ => {
400                if self.debug {
401                    fs::write(
402                        format!(
403                            "{}/content_type-{}.eml",
404                            env::current_dir().unwrap().to_str().unwrap(),
405                            self.md5
406                        ),
407                        old_data.clone(),
408                    )?;
409                }
410                return Err(Error::new(
411                    ErrorKind::NotFound,
412                    format!("未知 parts content_type 类型: {content_type}"),
413                ));
414            }
415        }
416        Ok(())
417    }
418    pub fn from(&mut self, value: &str) -> HashMap<String, String> {
419        let mut r = value
420            .split("<")
421            .filter(|x| !x.trim().is_empty())
422            .map(|x| x.trim())
423            .collect::<Vec<&str>>();
424        if r[0].starts_with("\"") && r[0].ends_with("\"") {
425            r[0] = r[0].trim_start_matches("\"").trim_end_matches("\"").trim();
426        }
427        let mut emails = HashMap::new();
428        if r.len() == 1 {
429            let name = r[0].trim_end_matches(">").to_string();
430            emails.insert(name.clone(), name);
431        } else {
432            let name = self.encoded(r[0].trim());
433            let email = r[1].trim_end_matches(">").to_string();
434            emails.insert(email, name);
435        }
436        emails
437    }
438    fn subject(&mut self, value: String) -> String {
439        let value = value.replace("?==?", "?=\r\n\t=?");
440        if !value.contains("=?") && !value.contains("?=") {
441            return value.to_string();
442        }
443        let list = value.split("\r\n\t").collect::<Vec<&str>>();
444        let mut txt = vec![];
445        for item in list {
446            txt.push(self.encoded(item));
447        }
448        txt.join("")
449    }
450
451    fn encoded(&mut self, value: &str) -> String {
452        let t = value.trim_start_matches("\"").trim_end_matches("\"");
453        if t.contains("=?") && t.contains("?=") {
454            let l = t.split(" ").collect::<Vec<&str>>();
455            let mut txt = vec![];
456            for item in l {
457                txt.push(self.encoded_line(item));
458            }
459            txt.join("")
460        } else {
461            t.to_string()
462        }
463    }
464    /// 段落解码
465    fn encoded_line(&mut self, value: &str) -> String {
466        let line = value.split("?").collect::<Vec<&str>>();
467        if line.len() == 1 {
468            return value.to_string();
469        }
470        let charset = line[1].to_lowercase().to_string().leak();
471        let code = line[2].to_uppercase();
472        let data = line[3];
473
474        let strs = match code.as_str() {
475            "B" => br_crypto::base64::decode_u8(data),
476            "Q" => br_crypto::qp::decode(data).unwrap_or(vec![]),
477            _ => data.as_bytes().to_vec(),
478        };
479        let text = code_to_utf8(charset, strs.clone());
480        text.chars().filter(|&x| x != '\u{200b}').collect()
481    }
482
483    /// 时间处理
484    fn datetime(&mut self, value: &str) -> io::Result<()> {
485        let re = Regex::new(r"\s*\(.*\)$").unwrap();
486        let datetime = re.replace(value, "").to_string();
487        let datetime = datetime.replace("GMT", "+0000").to_string();
488        let datetime = match datetime.find(",") {
489            None => datetime,
490            Some(i) => datetime[i + 1..].parse().unwrap(),
491        };
492        let datetime = match DateTime::parse_from_str(datetime.as_str(), "%d %b %Y %H:%M:%S %z") {
493            Ok(e) => e,
494            Err(e) => {
495                return Err(Error::other(
496                    format!("时间解析失败: {e} [{datetime:?}]"),
497                ))
498            }
499        };
500        self.timestamp = datetime.timestamp();
501        self.datetime = Local
502            .timestamp_opt(self.timestamp, 0)
503            .unwrap()
504            .with_timezone(&Local)
505            .format("%Y-%m-%d %H:%M:%S")
506            .to_string();
507        Ok(())
508    }
509    pub fn email_encoded(&mut self, value: &str) -> HashMap<String, String> {
510        let list = value.split(",").map(|x| x.trim()).collect::<Vec<&str>>();
511        let mut emails = HashMap::new();
512        for item in list {
513            let mut r = item.split(" <").collect::<Vec<&str>>();
514            if r[0].starts_with("\"") && r[0].ends_with("\"") {
515                r[0] = r[0].trim_start_matches("\"").trim_end_matches("\"");
516            }
517            if r.len() == 1 {
518                let name = r[0]
519                    .trim_start_matches("<")
520                    .trim_end_matches(">")
521                    .to_string();
522                emails.insert(name.clone(), name);
523            } else {
524                let name = self.encoded(r[0].trim());
525                let email = r[1].trim_end_matches(">").to_string();
526                emails.insert(email, name);
527            }
528        }
529        emails
530    }
531    fn set_files(
532        &mut self,
533        mut content_transfer_encoding: ContentTransferEncoding,
534        body: &str,
535        filename: &str,
536        mut content_type: String,
537    ) -> io::Result<()> {
538        let mut data = "";
539        if let ContentTransferEncoding::Base64 = content_transfer_encoding {
540            let mut text = "".to_string();
541            for line in body.lines() {
542                text += line;
543            }
544            data = text.leak();
545        }
546
547        let body = content_transfer_encoding.decode(data.as_bytes().to_vec())?;
548        let md5 = br_crypto::md5::encrypt_hex(&body.clone());
549        let size = body.len();
550        let mut temp_dir = env::temp_dir();
551        temp_dir.push(filename);
552        let path_temp_dir=temp_dir.clone();
553
554        let mut temp_file = match fs::File::create(temp_dir.clone()) {
555            Ok(e) => e,
556            Err(e) => {
557                return Err(Error::other(
558                    format!("打开(创建)临时文件: {e} [{filename}]"),
559                ))
560            }
561        };
562
563        if temp_file.write(body.as_slice()).is_ok() {
564            if content_type.is_empty() {
565                content_type = path_temp_dir.extension()
566                    .unwrap_or(OsStr::new("unknown"))
567                    .to_str()
568                    .unwrap_or("unknown").to_string();
569            }
570
571            self.files[md5.as_str()] = object! {
572                name:filename,
573                md5:md5.clone(),
574                size:size,
575                "content-type":content_type.clone(),
576                file:temp_dir.to_str()
577            };
578        };
579        Ok(())
580    }
581}
582
583impl Default for AnalyzeEmails {
584    fn default() -> Self {
585        Self {
586            debug: false,
587            header: Default::default(),
588            mime_version: "".to_string(),
589            boundary: "".to_string(),
590            md5: "".to_string(),
591            size: 0,
592            timestamp: 0,
593            datetime: "".to_string(),
594            subject: "".to_string(),
595            from: Default::default(),
596            to: Default::default(),
597            cc: Default::default(),
598            replyto: Default::default(),
599            content_type: "".to_string(),
600            content_transfer_encoding: ContentTransferEncoding::None,
601            sender: "".to_string(),
602            body_text: "".to_string(),
603            body_html: "".to_string(),
604            files: JsonValue::Null,
605            charset: "".to_string(),
606        }
607    }
608}
609
/// Transfer encodings (`Content-Transfer-Encoding`).
///
/// Rules of thumb for choosing one:
///
/// * Plain text containing only ASCII characters usually uses `7bit`.
/// * Text with non-ASCII characters can use `quoted-printable` or `8bit`,
///   depending on the content and compatibility requirements.
/// * Binary data such as images, video or audio usually uses `base64`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ContentTransferEncoding {
    /// Mainly used for text. Most of the text stays readable, while non-ASCII
    /// and special characters are escaped to stay transport-safe. Suitable
    /// for content with many special or non-ASCII characters.
    QuotedPrintable,
    /// Encodes binary data as ASCII using a 64-character alphabet
    /// (A-Z, a-z, 0-9, +, /); every 3 bytes become 4 characters. Commonly used
    /// for attachments, images, audio and video.
    Base64,
    /// The content is raw binary data that is transferred without escaping or
    /// encoding. Transfer agents must cope with every byte value, so not all
    /// systems support it.
    Binary,
    /// The content contains 8-bit characters, i.e. possibly non-ASCII text.
    /// Not every transfer agent supports 8-bit transport.
    Bit8,
    /// The content is ASCII text made of 7-bit characters only. This is the
    /// most widely supported encoding and fits plain-text mail.
    Bit7,
    /// No (or an unrecognised) transfer encoding was declared.
    None,
}
635
636impl ContentTransferEncoding {
637    fn from(value: &str) -> Self {
638        match value.to_lowercase().as_str() {
639            "7bit" => Self::Bit7,
640            "8bit" => Self::Bit8,
641            "binary" => Self::Binary,
642            "base64" => Self::Base64,
643            "quoted-printable" => Self::QuotedPrintable,
644            _ => Self::None,
645        }
646    }
647    fn decode(&mut self, mut data: Vec<u8>) -> io::Result<Vec<u8>> {
648        let res = match self {
649            ContentTransferEncoding::QuotedPrintable => br_crypto::qp::decode(data)?,
650            ContentTransferEncoding::Base64 => {
651                let str = unsafe { String::from_utf8_unchecked(data) };
652                let mut text = "".to_string();
653                for line in str.lines() {
654                    text += line;
655                }
656                data = text.leak().as_bytes().to_vec();
657                br_crypto::base64::decode_u8(data)
658            }
659            ContentTransferEncoding::Binary => data,
660            ContentTransferEncoding::Bit8 => data,
661            ContentTransferEncoding::Bit7 => data,
662            ContentTransferEncoding::None => data,
663        };
664        Ok(res)
665    }
666}