Skip to main content

az_gmail_code/
parser.rs

1use crate::{GmailCodeError, GmailCodeResult, GmailMessage, GmailMessagePart};
2use base64::Engine;
3use base64::engine::general_purpose::URL_SAFE;
4use regex::Regex;
5use std::borrow::Cow;
6use std::sync::LazyLock;
7
8static DEFAULT_CODE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
9    [
10        r#"class=["'](?:code|otp|verification-code)["'][^>]*>\s*([0-9][0-9\s-]{2,14}[0-9])\s*<"#,
11        r"(?i)(?:verification|security|login|one[-\s]?time|auth(?:entication)?|确认|验证|验证码|校验码)\s*(?:code|码)?[::\s]*(\d[\d\s-]{2,14}\d)",
12        r"(?i)(?:code|otp|pin)\s*(?:is|为|是)?[::\s]*(\d[\d\s-]{2,14}\d)",
13        r"(?:^|[^#&[:alnum:]])(\d[\d\s-]{2,14}\d)(?:[^[:alnum:]]|$)",
14    ]
15    .into_iter()
16    .map(|pattern| Regex::new(pattern).expect("verification-code regex should compile"))
17    .collect()
18});
19
/// Digit-count bounds applied when extracting a numeric verification code.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ExtractCodeOptions {
    /// Smallest accepted digit count, measured after separators are removed.
    pub min_digits: usize,
    /// Largest accepted digit count, measured after separators are removed.
    pub max_digits: usize,
}

impl ExtractCodeOptions {
    /// Builds options from the requested bounds, normalizing them first.
    ///
    /// `min_digits` is raised to at least 3, and `max_digits` is raised to the
    /// (normalized) minimum when it is smaller, so the result always satisfies
    /// `3 <= min_digits <= max_digits`.
    #[must_use]
    pub const fn new(min_digits: usize, max_digits: usize) -> Self {
        // Lower limit enforced on `min_digits`.
        const FLOOR: usize = 3;

        let lower = if min_digits >= FLOOR { min_digits } else { FLOOR };
        let upper = if max_digits >= lower { max_digits } else { lower };
        Self {
            min_digits: lower,
            max_digits: upper,
        }
    }
}

impl Default for ExtractCodeOptions {
    /// Default bounds: codes of 4 to 8 digits.
    fn default() -> Self {
        Self::new(4, 8)
    }
}
54
/// Decoded message body candidate inspected for verification codes.
///
/// Produced by `collect_message_body_candidates`: one entry per decoded text body part,
/// or a single snippet-based entry when no body part yields text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MessageBodyCandidate {
    /// MIME type of the Gmail body part (`"text/plain"` or `"text/html"`), or the
    /// literal `"snippet"` when the text comes from the message snippet fallback.
    pub mime_type: String,
    /// Decoded text content.
    pub text: String,
}
63
64/// Extracts a 4-8 digit verification code from plain text or HTML.
65#[must_use]
66pub fn extract_verification_code(content: impl AsRef<str>) -> Option<String> {
67    extract_verification_code_with_options(content, ExtractCodeOptions::default())
68}
69
70/// Extracts a numeric verification code using custom digit-length options.
71#[must_use]
72pub fn extract_verification_code_with_options(
73    content: impl AsRef<str>,
74    options: ExtractCodeOptions,
75) -> Option<String> {
76    let content = content.as_ref();
77    if content.trim().is_empty() {
78        return None;
79    }
80
81    let normalized = html_to_searchable_text(content);
82    for pattern in DEFAULT_CODE_PATTERNS.iter() {
83        for capture in pattern.captures_iter(&normalized) {
84            let Some(raw) = capture.get(1).map(|code| code.as_str()) else {
85                continue;
86            };
87            let code = normalize_code(raw);
88            if code.len() >= options.min_digits
89                && code.len() <= options.max_digits
90                && !looks_like_false_positive(&code)
91            {
92                return Some(code);
93            }
94        }
95    }
96    None
97}
98
99/// Collects decoded text/plain and text/html body candidates from a Gmail message.
100pub fn collect_message_body_candidates(
101    message: &GmailMessage,
102) -> GmailCodeResult<Vec<MessageBodyCandidate>> {
103    let mut plain = Vec::new();
104    let mut html = Vec::new();
105
106    if let Some(payload) = &message.payload {
107        collect_part_candidates(payload, "root", &mut plain, &mut html)?;
108    }
109
110    if plain.is_empty() {
111        plain = html;
112    }
113
114    if plain.is_empty()
115        && let Some(snippet) = message
116            .snippet
117            .as_ref()
118            .filter(|value| !value.trim().is_empty())
119    {
120        plain.push(MessageBodyCandidate {
121            mime_type: "snippet".to_owned(),
122            text: snippet.clone(),
123        });
124    }
125
126    Ok(plain)
127}
128
129fn collect_part_candidates(
130    part: &GmailMessagePart,
131    fallback_id: &str,
132    plain: &mut Vec<MessageBodyCandidate>,
133    html: &mut Vec<MessageBodyCandidate>,
134) -> GmailCodeResult<()> {
135    let part_id = part.part_id.as_deref().unwrap_or(fallback_id);
136    if let Some(data) = part.body.as_ref().and_then(|body| body.data.as_deref()) {
137        let decoded = decode_gmail_body(data, part_id)?;
138        let candidate = MessageBodyCandidate {
139            mime_type: part.mime_type.clone(),
140            text: decoded,
141        };
142        match part.mime_type.as_str() {
143            "text/plain" => plain.push(candidate),
144            "text/html" => html.push(candidate),
145            _ => {}
146        }
147    }
148
149    for (index, child) in part.parts.iter().enumerate() {
150        let child_id = child
151            .part_id
152            .as_deref()
153            .map(Cow::Borrowed)
154            .unwrap_or_else(|| Cow::Owned(format!("{part_id}.{index}")));
155        collect_part_candidates(child, &child_id, plain, html)?;
156    }
157
158    Ok(())
159}
160
161fn decode_gmail_body(data: &str, part_id: &str) -> GmailCodeResult<String> {
162    let mut normalized = data.trim().to_owned();
163    while !normalized.len().is_multiple_of(4) {
164        normalized.push('=');
165    }
166
167    let bytes =
168        URL_SAFE
169            .decode(normalized.as_bytes())
170            .map_err(|source| GmailCodeError::BodyDecode {
171                part_id: part_id.to_owned(),
172                source,
173            })?;
174    String::from_utf8(bytes).map_err(|source| GmailCodeError::BodyUtf8 {
175        part_id: part_id.to_owned(),
176        source,
177    })
178}
179
180fn html_to_searchable_text(content: &str) -> String {
181    let without_tags = strip_html_tags(content);
182    decode_basic_entities(&without_tags)
183}
184
/// Removes everything between `<` and `>` and replaces each of those two delimiters
/// with a single space, so text on either side of a tag stays separated.
///
/// This is a character-level scan, not an HTML parser: a `>` outside any tag is also
/// turned into a space, and an unmatched `<` swallows the rest of the input.
fn strip_html_tags(content: &str) -> String {
    let mut text = String::with_capacity(content.len());
    let mut in_tag = false;

    for ch in content.chars() {
        if ch == '<' || ch == '>' {
            // `<` opens a tag, `>` closes it; either way emit a separator.
            in_tag = ch == '<';
            text.push(' ');
        } else if !in_tag {
            text.push(ch);
        }
    }

    text
}
206
/// Decodes a small fixed set of HTML entities: `&nbsp;`, `&lt;`, `&gt;`, `&#x2F;`,
/// `&#47;` and `&amp;`.
///
/// Entity names are matched case-sensitively and any other entity is left untouched.
fn decode_basic_entities(content: &str) -> String {
    content
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&#x2F;", "/")
        .replace("&#47;", "/")
        // `&amp;` must be decoded last: doing it earlier would turn an escaped entity
        // such as `&amp;lt;` into `&lt;` and then wrongly decode it a second time.
        .replace("&amp;", "&")
}
216
/// Keeps only the ASCII digits of a raw match, dropping separators such as spaces and
/// hyphens (and any other non-ASCII-digit character).
fn normalize_code(raw: &str) -> String {
    let mut digits = raw.to_owned();
    digits.retain(|ch| ch.is_ascii_digit());
    digits
}
220
/// Reports whether a normalized digit string should be rejected as a code.
///
/// Rejected values are the literal `177010` and any string made up solely of `0`
/// characters (which includes the empty string).
///
/// NOTE(review): the reason `177010` is singled out is not visible in this file.
fn looks_like_false_positive(code: &str) -> bool {
    let is_blocked_literal = code == "177010";
    let has_non_zero = code.chars().any(|ch| ch != '0');
    is_blocked_literal || !has_non_zero
}
224
// Unit tests for code extraction and body-candidate collection.
#[cfg(test)]
mod tests {
    use super::{
        ExtractCodeOptions, collect_message_body_candidates, extract_verification_code,
        extract_verification_code_with_options,
    };
    use crate::{GmailMessage, GmailMessagePart};
    use base64::Engine;
    use base64::engine::general_purpose::URL_SAFE_NO_PAD;

    // An English keyword followed by a colon and digits yields those digits.
    #[test]
    fn extracts_explicit_english_code() {
        assert_eq!(
            extract_verification_code("Your verification code: 123456").as_deref(),
            Some("123456")
        );
    }

    // A Chinese keyword is recognized and the space inside the code is removed.
    #[test]
    fn extracts_chinese_verification_code() {
        assert_eq!(
            extract_verification_code("验证码: 876 543").as_deref(),
            Some("876543")
        );
    }

    // Digits that follow `#` (a hex color here) are not returned; the labelled code is.
    #[test]
    fn ignores_hex_color_before_real_code() {
        let content = "style=\"color:#123456\" code is 778899";

        assert_eq!(
            extract_verification_code(content).as_deref(),
            Some("778899")
        );
    }

    // Custom options allow a 10-digit code, which the default 4-8 bounds would reject.
    #[test]
    fn supports_custom_digit_length() {
        let options = ExtractCodeOptions::new(9, 10);

        assert_eq!(
            extract_verification_code_with_options("OTP: 1234567890", options).as_deref(),
            Some("1234567890")
        );
    }

    // With both an HTML and a plain-text alternative present (HTML listed first), only
    // the plain-text part is returned.
    #[test]
    fn body_candidates_prefer_plain_text_over_html() {
        let message = GmailMessage {
            id: "m1".to_owned(),
            thread_id: None,
            snippet: None,
            payload: Some(GmailMessagePart {
                part_id: Some("root".to_owned()),
                mime_type: "multipart/alternative".to_owned(),
                headers: Vec::new(),
                body: None,
                parts: vec![
                    GmailMessagePart {
                        part_id: Some("1".to_owned()),
                        mime_type: "text/html".to_owned(),
                        headers: Vec::new(),
                        body: Some(crate::model::GmailMessagePartBody {
                            data: Some(encode("<b>999999</b>")),
                            size: None,
                            attachment_id: None,
                        }),
                        parts: Vec::new(),
                    },
                    GmailMessagePart {
                        part_id: Some("2".to_owned()),
                        mime_type: "text/plain".to_owned(),
                        headers: Vec::new(),
                        body: Some(crate::model::GmailMessagePartBody {
                            data: Some(encode("Your code is 123456")),
                            size: None,
                            attachment_id: None,
                        }),
                        parts: Vec::new(),
                    },
                ],
            }),
            extra: Default::default(),
        };

        let candidates = collect_message_body_candidates(&message).expect("candidates");

        assert_eq!(candidates.len(), 1);
        assert_eq!(candidates[0].mime_type, "text/plain");
        assert!(candidates[0].text.contains("123456"));
    }

    // Encodes a fixture body as unpadded URL-safe base64, so decoding must cope with
    // payloads that lack `=` padding.
    fn encode(value: &str) -> String {
        URL_SAFE_NO_PAD.encode(value.as_bytes())
    }
}