1use crate::{GmailCodeError, GmailCodeResult, GmailMessage, GmailMessagePart};
2use base64::Engine;
3use base64::engine::general_purpose::URL_SAFE;
4use regex::Regex;
5use std::borrow::Cow;
6use std::sync::LazyLock;
7
8static DEFAULT_CODE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
9 [
10 r#"class=["'](?:code|otp|verification-code)["'][^>]*>\s*([0-9][0-9\s-]{2,14}[0-9])\s*<"#,
11 r"(?i)(?:verification|security|login|one[-\s]?time|auth(?:entication)?|确认|验证|验证码|校验码)\s*(?:code|码)?[::\s]*(\d[\d\s-]{2,14}\d)",
12 r"(?i)(?:code|otp|pin)\s*(?:is|为|是)?[::\s]*(\d[\d\s-]{2,14}\d)",
13 r"(?:^|[^#&[:alnum:]])(\d[\d\s-]{2,14}\d)(?:[^[:alnum:]]|$)",
14 ]
15 .into_iter()
16 .map(|pattern| Regex::new(pattern).expect("verification-code regex should compile"))
17 .collect()
18});
19
/// Inclusive bounds on how many digits an extracted code may contain.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ExtractCodeOptions {
    /// Smallest accepted digit count.
    pub min_digits: usize,
    /// Largest accepted digit count.
    pub max_digits: usize,
}

impl ExtractCodeOptions {
    /// Lowest value `new` will accept for `min_digits`.
    const MIN_DIGITS_FLOOR: usize = 3;

    /// Builds a sanitised set of bounds.
    ///
    /// `min_digits` is raised to 3 when smaller, and `max_digits` is raised to
    /// the (adjusted) `min_digits` when smaller, so the resulting range is
    /// never empty.
    #[must_use]
    pub const fn new(min_digits: usize, max_digits: usize) -> Self {
        let lower = if min_digits >= Self::MIN_DIGITS_FLOOR {
            min_digits
        } else {
            Self::MIN_DIGITS_FLOOR
        };
        let upper = if max_digits >= lower { max_digits } else { lower };

        Self {
            min_digits: lower,
            max_digits: upper,
        }
    }
}

impl Default for ExtractCodeOptions {
    /// Accepts codes of four to eight digits.
    fn default() -> Self {
        Self::new(4, 8)
    }
}
54
/// A decoded piece of message text that can be searched for a code.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MessageBodyCandidate {
    /// Where the text came from: the MIME type of the originating part, or
    /// `"snippet"` when it was taken from the message snippet.
    pub mime_type: String,
    /// The decoded text itself.
    pub text: String,
}
63
64#[must_use]
66pub fn extract_verification_code(content: impl AsRef<str>) -> Option<String> {
67 extract_verification_code_with_options(content, ExtractCodeOptions::default())
68}
69
70#[must_use]
72pub fn extract_verification_code_with_options(
73 content: impl AsRef<str>,
74 options: ExtractCodeOptions,
75) -> Option<String> {
76 let content = content.as_ref();
77 if content.trim().is_empty() {
78 return None;
79 }
80
81 let normalized = html_to_searchable_text(content);
82 for pattern in DEFAULT_CODE_PATTERNS.iter() {
83 for capture in pattern.captures_iter(&normalized) {
84 let Some(raw) = capture.get(1).map(|code| code.as_str()) else {
85 continue;
86 };
87 let code = normalize_code(raw);
88 if code.len() >= options.min_digits
89 && code.len() <= options.max_digits
90 && !looks_like_false_positive(&code)
91 {
92 return Some(code);
93 }
94 }
95 }
96 None
97}
98
99pub fn collect_message_body_candidates(
101 message: &GmailMessage,
102) -> GmailCodeResult<Vec<MessageBodyCandidate>> {
103 let mut plain = Vec::new();
104 let mut html = Vec::new();
105
106 if let Some(payload) = &message.payload {
107 collect_part_candidates(payload, "root", &mut plain, &mut html)?;
108 }
109
110 if plain.is_empty() {
111 plain = html;
112 }
113
114 if plain.is_empty()
115 && let Some(snippet) = message
116 .snippet
117 .as_ref()
118 .filter(|value| !value.trim().is_empty())
119 {
120 plain.push(MessageBodyCandidate {
121 mime_type: "snippet".to_owned(),
122 text: snippet.clone(),
123 });
124 }
125
126 Ok(plain)
127}
128
129fn collect_part_candidates(
130 part: &GmailMessagePart,
131 fallback_id: &str,
132 plain: &mut Vec<MessageBodyCandidate>,
133 html: &mut Vec<MessageBodyCandidate>,
134) -> GmailCodeResult<()> {
135 let part_id = part.part_id.as_deref().unwrap_or(fallback_id);
136 if let Some(data) = part.body.as_ref().and_then(|body| body.data.as_deref()) {
137 let decoded = decode_gmail_body(data, part_id)?;
138 let candidate = MessageBodyCandidate {
139 mime_type: part.mime_type.clone(),
140 text: decoded,
141 };
142 match part.mime_type.as_str() {
143 "text/plain" => plain.push(candidate),
144 "text/html" => html.push(candidate),
145 _ => {}
146 }
147 }
148
149 for (index, child) in part.parts.iter().enumerate() {
150 let child_id = child
151 .part_id
152 .as_deref()
153 .map(Cow::Borrowed)
154 .unwrap_or_else(|| Cow::Owned(format!("{part_id}.{index}")));
155 collect_part_candidates(child, &child_id, plain, html)?;
156 }
157
158 Ok(())
159}
160
161fn decode_gmail_body(data: &str, part_id: &str) -> GmailCodeResult<String> {
162 let mut normalized = data.trim().to_owned();
163 while !normalized.len().is_multiple_of(4) {
164 normalized.push('=');
165 }
166
167 let bytes =
168 URL_SAFE
169 .decode(normalized.as_bytes())
170 .map_err(|source| GmailCodeError::BodyDecode {
171 part_id: part_id.to_owned(),
172 source,
173 })?;
174 String::from_utf8(bytes).map_err(|source| GmailCodeError::BodyUtf8 {
175 part_id: part_id.to_owned(),
176 source,
177 })
178}
179
180fn html_to_searchable_text(content: &str) -> String {
181 let without_tags = strip_html_tags(content);
182 decode_basic_entities(&without_tags)
183}
184
/// Removes everything between `<` and `>` from `content`.
///
/// Each `<` and each `>` is replaced by a single space so that text on either
/// side of a tag does not run together. A stray `>` outside a tag also becomes
/// a space, and an unclosed `<` discards the rest of the input.
fn strip_html_tags(content: &str) -> String {
    let mut in_tag = false;
    let mut text = String::with_capacity(content.len());

    for ch in content.chars() {
        if ch == '<' || ch == '>' {
            // `<` opens a tag, `>` closes it; both leave a separator behind.
            in_tag = ch == '<';
            text.push(' ');
        } else if !in_tag {
            text.push(ch);
        }
    }

    text
}
206
/// Decodes the small set of HTML entities that commonly appear around
/// verification codes: `&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&#x2F;` and `&#47;`.
/// A literal no-break space (U+00A0) is normalised to a plain space as well.
///
/// Decoding is done in a single left-to-right pass, so already-decoded output
/// is never decoded again (`&amp;lt;` becomes `&lt;`, not `<`). Anything that
/// is not a recognised entity, including a bare `&`, is copied unchanged.
fn decode_basic_entities(content: &str) -> String {
    const REPLACEMENTS: [(&str, char); 8] = [
        ("&nbsp;", ' '),
        ("\u{a0}", ' '),
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&#x2F;", '/'),
        ("&#x2f;", '/'),
        ("&#47;", '/'),
    ];

    let mut decoded = String::with_capacity(content.len());
    let mut rest = content;

    while let Some(start) = rest.find(|ch| ch == '&' || ch == '\u{a0}') {
        decoded.push_str(&rest[..start]);
        rest = &rest[start..];

        match REPLACEMENTS
            .iter()
            .find(|(entity, _)| rest.starts_with(entity))
        {
            Some((entity, replacement)) => {
                decoded.push(*replacement);
                rest = &rest[entity.len()..];
            }
            None => {
                // U+00A0 always matches the table, so this is a bare `&`.
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
216
/// Keeps only the ASCII digits of `raw`, dropping separators such as spaces
/// and dashes.
fn normalize_code(raw: &str) -> String {
    let mut digits = String::new();
    for ch in raw.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
        }
    }
    digits
}
220
/// Reports whether `code` should be rejected even though it has a valid
/// length: it is either the deny-listed value `177010` or consists solely of
/// zeros (which includes the empty string).
fn looks_like_false_positive(code: &str) -> bool {
    if code == "177010" {
        return true;
    }
    !code.chars().any(|ch| ch != '0')
}
224
#[cfg(test)]
mod tests {
    use super::{
        ExtractCodeOptions, collect_message_body_candidates, extract_verification_code,
        extract_verification_code_with_options,
    };
    use crate::{GmailMessage, GmailMessagePart};
    use base64::Engine;
    use base64::engine::general_purpose::URL_SAFE_NO_PAD;

    // Encodes a body without padding.
    fn encode(value: &str) -> String {
        URL_SAFE_NO_PAD.encode(value.as_bytes())
    }

    // Builds a childless part carrying `body` as its encoded data.
    fn leaf_part(part_id: &str, mime_type: &str, body: &str) -> GmailMessagePart {
        GmailMessagePart {
            part_id: Some(part_id.to_owned()),
            mime_type: mime_type.to_owned(),
            headers: Vec::new(),
            body: Some(crate::model::GmailMessagePartBody {
                data: Some(encode(body)),
                size: None,
                attachment_id: None,
            }),
            parts: Vec::new(),
        }
    }

    #[test]
    fn extracts_explicit_english_code() {
        let found = extract_verification_code("Your verification code: 123456");

        assert_eq!(found.as_deref(), Some("123456"));
    }

    #[test]
    fn extracts_chinese_verification_code() {
        let found = extract_verification_code("验证码: 876 543");

        assert_eq!(found.as_deref(), Some("876543"));
    }

    #[test]
    fn ignores_hex_color_before_real_code() {
        let found = extract_verification_code("style=\"color:#123456\" code is 778899");

        assert_eq!(found.as_deref(), Some("778899"));
    }

    #[test]
    fn supports_custom_digit_length() {
        let ten_digit_bounds = ExtractCodeOptions::new(9, 10);

        let found = extract_verification_code_with_options("OTP: 1234567890", ten_digit_bounds);

        assert_eq!(found.as_deref(), Some("1234567890"));
    }

    #[test]
    fn body_candidates_prefer_plain_text_over_html() {
        // HTML is listed first to show that order does not decide preference.
        let message = GmailMessage {
            id: "m1".to_owned(),
            thread_id: None,
            snippet: None,
            payload: Some(GmailMessagePart {
                part_id: Some("root".to_owned()),
                mime_type: "multipart/alternative".to_owned(),
                headers: Vec::new(),
                body: None,
                parts: vec![
                    leaf_part("1", "text/html", "<b>999999</b>"),
                    leaf_part("2", "text/plain", "Your code is 123456"),
                ],
            }),
            extra: Default::default(),
        };

        let candidates = collect_message_body_candidates(&message).expect("candidates");

        assert_eq!(candidates.len(), 1);
        let only = &candidates[0];
        assert_eq!(only.mime_type, "text/plain");
        assert!(only.text.contains("123456"));
    }
}