use std::collections::HashMap;
use base64::Engine;
use once_cell::sync::Lazy;
use regex::Regex;
use serde::Deserialize;
use serde_json::Value;
use crate::types::{Address, LangmailError, ParsedInput, PreprocessOptions, ProcessedEmail};
/// Preprocesses a Gmail API message given as JSON text, using
/// `PreprocessOptions::default()`.
///
/// This is a convenience wrapper around [`preprocess_gmail_with_options`];
/// see that function for the accepted input shapes and the error cases.
pub fn preprocess_gmail(msg_json: &str) -> Result<ProcessedEmail, LangmailError> {
preprocess_gmail_with_options(msg_json, &PreprocessOptions::default())
}
pub fn preprocess_gmail_with_options(
msg_json: &str,
options: &PreprocessOptions,
) -> Result<ProcessedEmail, LangmailError> {
let value: Value = serde_json::from_str(msg_json)
.map_err(|e| LangmailError::InvalidGmailMessage(format!("invalid JSON: {}", e)))?;
let message_value = match value.get("data") {
Some(data) if data.is_object() && data.get("payload").is_some_and(|p| p.is_object()) => {
data
}
_ => &value,
};
if !message_value.is_object() {
return Err(LangmailError::InvalidGmailMessage(
"expected a Gmail message object".to_string(),
));
}
let message: GmailMessage = serde_json::from_value(message_value.clone()).map_err(|e| {
LangmailError::InvalidGmailMessage(format!("invalid Gmail message shape: {}", e))
})?;
let payload = message.payload.ok_or_else(|| {
LangmailError::InvalidGmailMessage(
"message.payload is missing. Did you fetch with format: \"full\"?".to_string(),
)
})?;
let (html, text) = extract_bodies(&payload)?;
let headers = normalize_headers(payload.headers.as_deref().unwrap_or(&[]));
let input = ParsedInput {
html,
text,
subject: headers.get("subject").cloned(),
from: headers.get("from").and_then(|v| parse_address(v)),
to: headers
.get("to")
.map(|v| parse_address_list(v))
.unwrap_or_default(),
cc: headers
.get("cc")
.map(|v| parse_address_list(v))
.unwrap_or_default(),
date: headers.get("date").and_then(|v| parse_rfc2822_date(v)),
rfc_message_id: headers.get("message-id").map(|v| strip_angle_brackets(v)),
in_reply_to: headers.get("in-reply-to").and_then(|v| parse_id_list(v)),
references: headers.get("references").and_then(|v| parse_id_list(v)),
};
Ok(crate::preprocess_parsed(input, options))
}
/// The subset of a Gmail message object that this module reads.
#[derive(Debug, Deserialize)]
struct GmailMessage {
// Root MIME part. The caller reports an error when this is absent.
payload: Option<GmailPart>,
}
/// One node of the message's part tree; `parts` holds nested child parts.
#[derive(Debug, Deserialize)]
struct GmailPart {
// MIME type of this part (JSON key `mimeType`); compared case-insensitively
// against "text/html" and "text/plain" when looking for bodies.
#[serde(rename = "mimeType")]
mime_type: Option<String>,
// A non-empty filename marks the part as an attachment.
#[serde(default)]
filename: Option<String>,
// Headers of this part; used for the message headers on the root part and
// for the Content-Disposition check on any part.
#[serde(default)]
headers: Option<Vec<GmailHeader>>,
// Body content of this part, if any.
#[serde(default)]
body: Option<GmailBody>,
// Child parts, visited recursively in order.
#[serde(default)]
parts: Option<Vec<GmailPart>>,
}
/// A single name/value header pair. Entries missing either field are skipped
/// by the code that consumes them.
#[derive(Debug, Deserialize)]
struct GmailHeader {
name: Option<String>,
value: Option<String>,
}
/// Body of a message part.
#[derive(Debug, Deserialize)]
struct GmailBody {
// Inline content; decoded as unpadded URL-safe base64 when non-empty.
#[serde(default)]
data: Option<String>,
// Identifier (JSON key `attachmentId`) present when the content is not
// inline; a text body that only has this triggers an error in `visit_part`.
#[serde(default, rename = "attachmentId")]
attachment_id: Option<String>,
}
/// Walks the part tree rooted at `payload` and returns `(html, text)`: the
/// first decoded `text/html` body and the first decoded `text/plain` body
/// found, each `None` when no such part exists.
///
/// Any error raised while visiting the parts is returned unchanged.
fn extract_bodies(payload: &GmailPart) -> Result<(Option<String>, Option<String>), LangmailError> {
    let (mut html_body, mut text_body) = (None, None);
    visit_part(payload, &mut html_body, &mut text_body).map(|()| (html_body, text_body))
}
/// Depth-first visit of `part` and its children, filling `html` / `text` with
/// the first non-attachment `text/html` / `text/plain` body encountered.
///
/// A slot that is already filled is never overwritten. When a candidate part
/// has no inline data but does carry a non-empty attachment id, the walk stops
/// with `LangmailError::BodyRequiresAttachmentFetch`.
fn visit_part(
    part: &GmailPart,
    html: &mut Option<String>,
    text: &mut Option<String>,
) -> Result<(), LangmailError> {
    let mime = part
        .mime_type
        .as_deref()
        .unwrap_or_default()
        .to_ascii_lowercase();

    // Decide which output slot (if any) this part is allowed to fill.
    let slot: Option<&mut Option<String>> = if is_attachment_part(part) {
        None
    } else {
        match mime.as_str() {
            "text/html" if html.is_none() => Some(&mut *html),
            "text/plain" if text.is_none() => Some(&mut *text),
            _ => None,
        }
    };

    if let (Some(slot), Some(body)) = (slot, part.body.as_ref()) {
        let inline = body.data.as_deref().filter(|d| !d.is_empty());
        let attachment = body.attachment_id.as_deref().filter(|id| !id.is_empty());
        match (inline, attachment) {
            // Inline data always wins over an attachment reference.
            (Some(encoded), _) => *slot = Some(decode_base64url(encoded)),
            (None, Some(att_id)) => {
                return Err(LangmailError::BodyRequiresAttachmentFetch {
                    mime_type: mime,
                    attachment_id: att_id.to_string(),
                });
            }
            (None, None) => {}
        }
    }

    for child in part.parts.iter().flatten() {
        visit_part(child, html, text)?;
    }
    Ok(())
}
/// Reports whether `part` should be treated as an attachment.
///
/// A part is an attachment when it has a non-empty filename, or when one of
/// its headers is `Content-Disposition` (name compared case-insensitively)
/// with a value that, after leading whitespace, starts with `attachment`
/// in any letter case.
fn is_attachment_part(part: &GmailPart) -> bool {
    let has_filename = matches!(part.filename.as_deref(), Some(name) if !name.is_empty());

    has_filename
        || part.headers.iter().flatten().any(|header| {
            match (header.name.as_deref(), header.value.as_deref()) {
                (Some(name), Some(value)) => {
                    name.eq_ignore_ascii_case("content-disposition")
                        && value
                            .trim_start()
                            .to_ascii_lowercase()
                            .starts_with("attachment")
                }
                // Headers missing a name or a value cannot match.
                _ => false,
            }
        })
}
fn decode_base64url(data: &str) -> String {
let cleaned: String = data.chars().filter(|c| !c.is_whitespace()).collect();
let trimmed = cleaned.trim_end_matches('=');
let bytes = base64::engine::general_purpose::URL_SAFE_NO_PAD
.decode(trimmed)
.unwrap_or_default();
String::from_utf8_lossy(&bytes).into_owned()
}
/// Builds a lookup table from ASCII-lowercased header name to header value.
///
/// Headers missing a name or a value are skipped. When a name occurs more
/// than once, the last occurrence wins.
fn normalize_headers(headers: &[GmailHeader]) -> HashMap<String, String> {
    let mut normalized = HashMap::with_capacity(headers.len());
    normalized.extend(headers.iter().filter_map(|header| {
        let name = header.name.as_deref()?;
        let value = header.value.as_deref()?;
        Some((name.to_ascii_lowercase(), value.to_owned()))
    }));
    normalized
}
// Matches `"Display Name" <email>` or `Display Name <email>` over the whole
// string. Group 1 is the quoted name (without the quotes), group 2 the
// unquoted name, group 3 the text between the angle brackets.
static ANGLE_ADDRESS: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"^(?:"([^"]*)"|([^<]*?))\s*<([^>]+)>\s*$"#).unwrap());
// Matches the `email (Display Name)` form over the whole string.
// Group 1 is the address, group 2 the text inside the parentheses.
static COMMENT_ADDRESS: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^(\S+@\S+?)\s*\(([^)]*)\)\s*$").unwrap());
/// Parses a single mailbox string into an `Address`.
///
/// Three shapes are recognized, tried in this order:
/// 1. `Name <email>` / `"Name" <email>`
/// 2. `email (Name)`
/// 3. a bare `email` containing `@` and no whitespace
///
/// Returns `None` for blank input or anything that fits none of the shapes.
/// An empty display name is reported as `None`.
fn parse_address(raw: &str) -> Option<Address> {
    /// Assembles an `Address` from optional raw captures, trimming both parts
    /// and dropping a display name that is empty after trimming.
    fn build(name: Option<&str>, email: Option<&str>) -> Address {
        Address {
            name: name
                .map(str::trim)
                .filter(|n| !n.is_empty())
                .map(str::to_string),
            email: email.map(|e| e.trim().to_string()).unwrap_or_default(),
        }
    }

    let candidate = raw.trim();
    if candidate.is_empty() {
        return None;
    }

    if let Some(caps) = ANGLE_ADDRESS.captures(candidate) {
        // Group 1 is the quoted name, group 2 the unquoted one.
        let display = caps.get(1).or_else(|| caps.get(2)).map(|m| m.as_str());
        let email = caps.get(3).map(|m| m.as_str());
        return Some(build(display, email));
    }

    if let Some(caps) = COMMENT_ADDRESS.captures(candidate) {
        let email = caps.get(1).map(|m| m.as_str());
        let display = caps.get(2).map(|m| m.as_str());
        return Some(build(display, email));
    }

    let looks_bare = candidate.contains('@') && !candidate.chars().any(char::is_whitespace);
    looks_bare.then(|| Address {
        name: None,
        email: candidate.to_string(),
    })
}
fn parse_address_list(raw: &str) -> Vec<Address> {
split_address_list(raw)
.into_iter()
.filter_map(parse_address)
.collect()
}
/// Splits an address-list header value on top-level commas.
///
/// Commas inside a double-quoted string or inside `<...>` do not split.
/// A backslash escapes the character that follows it, so `\"` does not open
/// or close a quoted string, while the quote after an escaped backslash
/// (`\\"`) does. Segments that are empty or whitespace-only are dropped; the
/// returned slices are not trimmed.
fn split_address_list(raw: &str) -> Vec<&str> {
    let mut out = Vec::new();
    let mut start = 0usize;
    let mut in_quotes = false;
    let mut angle_depth: i32 = 0;
    // True when the previous character was an *unescaped* backslash. Tracking
    // this as state (rather than peeking at the previous char) is what makes
    // `\\"` terminate a quoted string correctly.
    let mut escaped = false;

    for (idx, ch) in raw.char_indices() {
        let was_escaped = std::mem::replace(&mut escaped, false);
        match ch {
            '\\' if !was_escaped => escaped = true,
            '"' if !was_escaped => in_quotes = !in_quotes,
            '<' if !in_quotes => angle_depth += 1,
            '>' if !in_quotes && angle_depth > 0 => angle_depth -= 1,
            ',' if !in_quotes && angle_depth == 0 => {
                let slice = &raw[start..idx];
                if !slice.trim().is_empty() {
                    out.push(slice);
                }
                start = idx + ch.len_utf8();
            }
            _ => {}
        }
    }

    let tail = &raw[start..];
    if !tail.trim().is_empty() {
        out.push(tail);
    }
    out
}
/// Parses an RFC 2822 / RFC 5322 style `Date` header value, such as
/// `Thu, 05 Feb 2026 10:00:00 +0000`, and returns it as a UTC timestamp
/// string produced by `crate::timestamp_to_iso8601_utc`.
///
/// The leading day-of-week is optional, seconds are optional, and any tokens
/// after the zone (for example a trailing `(UTC)` comment) are ignored.
/// Two-digit years below 50 map to 20xx and the rest to 19xx.
///
/// Returns `None` when the value is blank, has fewer than five fields, or any
/// field is malformed or out of range (for example day 31 in a 30-day month,
/// hour 24, or minute 60). Second 60 is accepted as a leap second.
fn parse_rfc2822_date(raw: &str) -> Option<String> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return None;
    }

    // Drop an optional leading "Mon," style day-of-week. Only a purely
    // alphabetic prefix counts, so a comma elsewhere is left alone.
    let after_dow = match trimmed.find(',') {
        Some(i) if trimmed[..i].chars().all(|c| c.is_ascii_alphabetic()) => {
            trimmed[i + 1..].trim()
        }
        _ => trimmed,
    };

    let mut tokens = after_dow.split_ascii_whitespace();
    let day_tok = tokens.next()?;
    let month_tok = tokens.next()?;
    let year_tok = tokens.next()?;
    let time_tok = tokens.next()?;
    let zone_tok = tokens.next()?;

    let day: u32 = day_tok.parse().ok()?;

    // Month names are ABNF literals and therefore case-insensitive.
    let month: u32 = match month_tok.to_ascii_lowercase().as_str() {
        "jan" => 1,
        "feb" => 2,
        "mar" => 3,
        "apr" => 4,
        "may" => 5,
        "jun" => 6,
        "jul" => 7,
        "aug" => 8,
        "sep" => 9,
        "oct" => 10,
        "nov" => 11,
        "dec" => 12,
        _ => return None,
    };

    let year_raw: i32 = year_tok.parse().ok()?;
    let year: i32 = match year_raw {
        // Obsolete two-digit years: 00-49 => 20xx, 50-99 => 19xx.
        0..=49 => year_raw + 2000,
        50..=99 => year_raw + 1900,
        100..=9999 => year_raw,
        // Negative or absurdly large years are not meaningful dates.
        _ => return None,
    };

    // Reject calendar days that do not exist instead of letting the day
    // arithmetic silently roll them into the following month.
    let is_leap = (year % 4 == 0 && year % 100 != 0) || year % 400 == 0;
    let month_len = match month {
        4 | 6 | 9 | 11 => 30,
        2 if is_leap => 29,
        2 => 28,
        _ => 31,
    };
    if day == 0 || day > month_len {
        return None;
    }

    // Time of day is HH:MM or HH:MM:SS.
    let mut time_parts = time_tok.split(':');
    let hour: u32 = time_parts.next()?.parse().ok()?;
    let minute: u32 = time_parts.next()?.parse().ok()?;
    let second: u32 = match time_parts.next() {
        Some(s) => s.parse().ok()?,
        None => 0,
    };
    if time_parts.next().is_some() || hour > 23 || minute > 59 || second > 60 {
        return None;
    }

    let tz_offset_minutes = parse_timezone(zone_tok)?;

    let days = days_from_civil(i64::from(year), i64::from(month), i64::from(day));
    // Local wall-clock time minus the zone offset gives UTC.
    let ts = days * 86_400 + i64::from(hour) * 3_600 + i64::from(minute) * 60 + i64::from(second)
        - i64::from(tz_offset_minutes) * 60;
    Some(crate::timestamp_to_iso8601_utc(ts))
}
/// Converts a `Date` header zone token into an offset from UTC in minutes
/// (positive east of UTC, negative west).
///
/// Accepts the named zones `GMT`, `UT`, `UTC`, `Z` and the North American
/// `EST`/`EDT`, `CST`/`CDT`, `MST`/`MDT`, `PST`/`PDT` in any letter case, or a
/// numeric `+HHMM` / `-HHMM` offset.
///
/// Returns `None` for anything else, including numeric offsets whose minutes
/// part is 60 or more.
fn parse_timezone(tz: &str) -> Option<i32> {
    const NAMED_ZONES: &[(&str, i32)] = &[
        ("GMT", 0),
        ("UT", 0),
        ("UTC", 0),
        ("Z", 0),
        ("EDT", -240),
        ("EST", -300),
        ("CDT", -300),
        ("CST", -360),
        ("MDT", -360),
        ("MST", -420),
        ("PDT", -420),
        ("PST", -480),
    ];
    if let Some(&(_, offset)) = NAMED_ZONES
        .iter()
        .find(|(name, _)| name.eq_ignore_ascii_case(tz))
    {
        return Some(offset);
    }

    let (sign, digits) = if let Some(rest) = tz.strip_prefix('+') {
        (1i32, rest)
    } else if let Some(rest) = tz.strip_prefix('-') {
        (-1i32, rest)
    } else {
        return None;
    };
    // Exactly four ASCII digits; this also makes the byte slicing below safe.
    if digits.len() != 4 || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }
    let hh: i32 = digits[..2].parse().ok()?;
    let mm: i32 = digits[2..].parse().ok()?;
    // The minutes field of an offset only goes up to 59.
    if mm > 59 {
        return None;
    }
    Some(sign * (hh * 60 + mm))
}
/// Returns the number of days between 1970-01-01 and the given proleptic
/// Gregorian date (negative for earlier dates).
///
/// The computation treats March as the first month of the year so that the
/// leap day falls at the end, and works in 400-year eras of 146,097 days.
fn days_from_civil(y: i64, m: i64, d: i64) -> i64 {
    // Shift January and February to the end of the previous year.
    let (year, shifted_month) = if m <= 2 { (y - 1, m + 9) } else { (y, m - 3) };

    // Floor division and remainder, so negative years land in the right era.
    let era = year.div_euclid(400);
    let year_of_era = year.rem_euclid(400);

    let day_of_year = (153 * shifted_month + 2) / 5 + d - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;

    // 719,468 is the day count from 0000-03-01 to 1970-01-01.
    era * 146_097 + day_of_era - 719_468
}
/// Trims `s` and, when the result is wrapped in a single `<...>` pair,
/// returns the text between the brackets.
///
/// If the text is not wrapped, or the inner text still contains `>`, the
/// trimmed input is returned unchanged.
fn strip_angle_brackets(s: &str) -> String {
    let candidate = s.trim();
    candidate
        .strip_prefix('<')
        .and_then(|rest| rest.strip_suffix('>'))
        .filter(|inner| !inner.contains('>'))
        .unwrap_or(candidate)
        .to_string()
}
fn parse_id_list(raw: &str) -> Option<Vec<String>> {
let ids: Vec<String> = raw
.split_ascii_whitespace()
.map(strip_angle_brackets)
.filter(|s| s.contains('@'))
.collect();
if ids.is_empty() {
None
} else {
Some(ids)
}
}
// Unit tests for the private parsing helpers in this file: address parsing,
// address-list splitting, date parsing, message-id handling, base64url
// decoding and attachment detection.
#[cfg(test)]
mod tests {
use super::*;
// --- parse_address ---
#[test]
fn parses_quoted_display_name() {
let a = parse_address(r#""Carol, Support" <carol@example.com>"#).unwrap();
assert_eq!(a.name.as_deref(), Some("Carol, Support"));
assert_eq!(a.email, "carol@example.com");
}
#[test]
fn parses_unquoted_display_name() {
let a = parse_address("Alice <alice@example.com>").unwrap();
assert_eq!(a.name.as_deref(), Some("Alice"));
assert_eq!(a.email, "alice@example.com");
}
#[test]
fn parses_comment_form() {
let a = parse_address("alice@example.com (Alice Example)").unwrap();
assert_eq!(a.name.as_deref(), Some("Alice Example"));
assert_eq!(a.email, "alice@example.com");
}
#[test]
fn parses_bare_email() {
let a = parse_address("alice@example.com").unwrap();
assert_eq!(a.name, None);
assert_eq!(a.email, "alice@example.com");
}
#[test]
fn parse_address_empty_returns_none() {
assert!(parse_address("").is_none());
assert!(parse_address(" ").is_none());
}
// --- parse_address_list ---
#[test]
fn splits_address_list_respecting_quotes() {
// The comma inside the quoted display name must not split the list.
let list = parse_address_list(r#""Carol, Support" <carol@example.com>, dev@example.com"#);
assert_eq!(list.len(), 2);
assert_eq!(list[0].name.as_deref(), Some("Carol, Support"));
assert_eq!(list[1].email, "dev@example.com");
assert_eq!(list[1].name, None);
}
// --- parse_rfc2822_date ---
#[test]
fn rfc2822_date_positive_offset() {
let iso = parse_rfc2822_date("Thu, 05 Feb 2026 10:00:00 +0000").unwrap();
assert_eq!(iso, "2026-02-05T10:00:00Z");
}
#[test]
fn rfc2822_date_negative_offset_shifts_to_utc() {
// 10:00 at UTC-5 is 15:00 UTC.
let iso = parse_rfc2822_date("Mon, 12 Jan 2026 10:00:00 -0500").unwrap();
assert_eq!(iso, "2026-01-12T15:00:00Z");
}
#[test]
fn rfc2822_date_gmt_alias() {
let iso = parse_rfc2822_date("Thu, 05 Feb 2026 10:00:00 GMT").unwrap();
assert_eq!(iso, "2026-02-05T10:00:00Z");
}
#[test]
fn rfc2822_date_malformed_returns_none() {
assert!(parse_rfc2822_date("not a real date").is_none());
assert!(parse_rfc2822_date("").is_none());
}
// --- strip_angle_brackets ---
#[test]
fn strip_angle_brackets_simple() {
assert_eq!(strip_angle_brackets("<abc@example.com>"), "abc@example.com");
assert_eq!(
strip_angle_brackets(" <abc@example.com> "),
"abc@example.com"
);
}
#[test]
fn strip_angle_brackets_no_wrap() {
assert_eq!(strip_angle_brackets("abc@example.com"), "abc@example.com");
}
// --- parse_id_list ---
#[test]
fn parse_id_list_splits_and_strips() {
let ids = parse_id_list("<root@example.com> <abc@example.com>").unwrap();
assert_eq!(ids, vec!["root@example.com", "abc@example.com"]);
}
#[test]
fn parse_id_list_drops_non_email_tokens() {
// Tokens without an '@' (here, the words of the comment) are discarded.
let ids =
parse_id_list("<root@example.com> (imported from archive) <abc@example.com>").unwrap();
assert_eq!(ids, vec!["root@example.com", "abc@example.com"]);
}
#[test]
fn parse_id_list_blank_is_none() {
assert!(parse_id_list(" ").is_none());
assert!(parse_id_list("").is_none());
}
// --- decode_base64url ---
#[test]
fn decode_base64url_roundtrip() {
assert_eq!(decode_base64url("SGVsbG8gd29ybGQ"), "Hello world");
}
#[test]
fn decode_base64url_url_safe_chars() {
// Input uses '-' from the URL-safe alphabet.
let html = decode_base64url("PGh0bWw-PGJvZHk-PC9ib2R5PjwvaHRtbD4");
assert_eq!(html, "<html><body></body></html>");
}
#[test]
fn decode_base64url_non_utf8_bytes_lossy() {
// "_w" decodes to the single byte 0xFF, which is not valid UTF-8.
let decoded = decode_base64url("_w");
assert_eq!(decoded, "\u{FFFD}");
assert!(!decoded.is_empty());
}
#[test]
fn decode_base64url_mixed_valid_and_invalid_utf8() {
let decoded = decode_base64url("SGn_");
assert!(decoded.starts_with("Hi"), "got: {:?}", decoded);
assert!(decoded.contains('\u{FFFD}'), "got: {:?}", decoded);
}
#[test]
fn rfc2822_date_empty_string_returns_none() {
assert!(parse_rfc2822_date("").is_none());
assert!(parse_rfc2822_date(" ").is_none());
}
// --- is_attachment_part ---
#[test]
fn is_attachment_part_ignores_empty_filename_string() {
// An empty filename string must not mark the part as an attachment.
let part = GmailPart {
mime_type: Some("text/plain".to_string()),
filename: Some(String::new()),
headers: None,
body: None,
parts: None,
};
assert!(!is_attachment_part(&part));
}
#[test]
fn parse_address_rejects_bare_token_with_internal_whitespace() {
assert!(parse_address("hello @ world").is_none());
assert!(parse_address("a@b extra text").is_none());
}
}