use chrono::{DateTime, NaiveDate, Utc};
use serde_json::Value;
/// Runs the full body-cleaning pipeline on a raw message body.
///
/// The stages are applied in order:
/// 1. [`drop_reply_chain`] removes quoted/forwarded history,
/// 2. [`drop_footer_noise`] removes everything from the first footer line on,
/// 3. the result is trimmed and passed through [`collapse_blank_runs`].
pub fn clean_body(raw: &str) -> String {
    // Footer detection runs on the already reply-stripped text.
    let body = drop_footer_noise(&drop_reply_chain(raw));
    collapse_blank_runs(body.trim())
}
/// Substrings that mark the start of footer boilerplate.
///
/// `drop_footer_noise` lowercases each line with `to_ascii_lowercase` and
/// checks it with `contains`, so every entry must be written in lowercase
/// and matches anywhere inside a line.
const FOOTER_TRIGGERS: &[&str] = &[
// Unsubscribe / "view online" / preference-management links.
"unsubscribe",
"view in browser",
"view this email in your browser",
"view it in your browser",
"update your email settings",
"manage your subscription",
"manage preferences",
"email preferences",
// "Why am I getting this" explanations.
"you are receiving this email because",
"you received this email because",
"you're receiving this email because",
"to stop receiving",
// Copyright lines; the trailing "20" is presumably the start of a 20xx year.
"all rights reserved",
"© 20",
"(c) 20",
"copyright 20",
// Sending-service branding.
"powered by mailchimp",
"sent via sendgrid",
// Legal / confidentiality disclaimers.
"this email and any files",
"confidentiality notice",
"if you are not the intended recipient",
"this communication may contain",
];
/// Cuts a message body at the point where quoted or forwarded history begins.
///
/// The body is scanned line by line and truncated (with trailing whitespace
/// removed) at the first of:
///
/// * an attribution line that starts with `On ` and contains ` wrote:`,
/// * a forward / original-message separator (see `SEPARATORS` below),
/// * the start of a run of three `>`-quoted lines. Blank lines inside the run
///   neither extend nor break it; any other non-quoted line resets it.
///
/// All matching is ASCII case-insensitive. When nothing matches, the input is
/// returned unchanged (no trimming is applied).
pub fn drop_reply_chain(s: &str) -> String {
    // Lowercase separators, matched as substrings of the trimmed line.
    // "----- original message" also covers longer dash runs such as
    // "-------- Original message --------"; the no-space variant covers the
    // Outlook style "-----Original Message-----".
    const SEPARATORS: [&str; 4] = [
        "---------- forwarded message",
        "----- original message",
        "-----original message",
        "--- forwarded by",
    ];
    // Number of quoted lines that must accumulate before the run is cut.
    const QUOTED_RUN_LIMIT: u32 = 3;

    let mut offset = 0usize;
    // (byte offset where the current quoted run started, quoted lines seen).
    let mut quoted_run: Option<(usize, u32)> = None;
    let mut cut: Option<usize> = None;

    for line in s.split_inclusive('\n') {
        // Offsets are sums of whole-line byte lengths, so they always fall on
        // char boundaries of `s`.
        let line_start = offset;
        offset += line.len();

        let trimmed = line.trim();
        let lower = trimmed.to_ascii_lowercase();

        let is_attribution = lower.starts_with("on ") && lower.contains(" wrote:");
        if is_attribution || SEPARATORS.iter().any(|marker| lower.contains(*marker)) {
            cut = Some(line_start);
            break;
        }

        if trimmed.starts_with('>') {
            let (run_start, seen) = match quoted_run {
                Some((run_start, seen)) => (run_start, seen + 1),
                None => (line_start, 1),
            };
            if seen >= QUOTED_RUN_LIMIT {
                // Cut at the first line of the run, not at the current line.
                cut = Some(run_start);
                break;
            }
            quoted_run = Some((run_start, seen));
        } else if !trimmed.is_empty() {
            // Real content between quoted lines: this was an inline quote.
            quoted_run = None;
        }
    }

    match cut {
        Some(end) => s[..end].trim_end().to_string(),
        None => s.to_string(),
    }
}
/// Truncates the body at the first line containing a footer trigger.
///
/// Each line is ASCII-lowercased and tested against every entry of
/// `FOOTER_TRIGGERS` as a substring. On the first hit, everything from the
/// start of that line onward is dropped and trailing whitespace is trimmed.
/// When no line matches, the input is returned unchanged.
pub fn drop_footer_noise(s: &str) -> String {
    let mut consumed = 0usize;
    let footer_start = s.split_inclusive('\n').find_map(|line| {
        // Byte offset of this line; whole-line lengths keep it on a char boundary.
        let start = consumed;
        consumed += line.len();

        let folded = line.to_ascii_lowercase();
        if FOOTER_TRIGGERS.iter().any(|needle| folded.contains(*needle)) {
            Some(start)
        } else {
            None
        }
    });

    match footer_start {
        Some(cut) => s[..cut].trim_end().to_string(),
        None => s.to_string(),
    }
}
/// Squeezes every run of blank lines down to a single empty line.
///
/// A line counts as blank when it is empty after trimming whitespace; such a
/// line is emitted as a bare `\n` (its whitespace is discarded). Line endings
/// are normalised to `\n` and all trailing newlines are removed from the
/// result.
pub fn collapse_blank_runs(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    let mut prev_blank = false;

    for line in s.lines() {
        let is_blank = line.trim().is_empty();
        if is_blank && prev_blank {
            // Second or later blank line of a run: already represented.
            continue;
        }
        if !is_blank {
            collapsed.push_str(line);
        }
        collapsed.push('\n');
        prev_blank = is_blank;
    }

    // Strip every trailing newline in one step.
    let kept = collapsed.trim_end_matches('\n').len();
    collapsed.truncate(kept);
    collapsed
}
/// Trims `body` and limits it to at most `max_chars` characters.
///
/// Length is measured in `char`s (Unicode scalar values), not bytes. When the
/// trimmed text is longer than `max_chars`, the first `max_chars` characters
/// are kept and a single `…` is appended; otherwise the trimmed text is
/// returned as is.
pub fn truncate_body(body: &str, max_chars: usize) -> String {
    let text = body.trim();
    // The char at index `max_chars` exists only when the text is too long;
    // its byte offset is exactly where the kept prefix ends.
    match text.char_indices().nth(max_chars) {
        None => text.to_string(),
        Some((cut, _)) => {
            let mut shortened = String::with_capacity(cut + '…'.len_utf8());
            shortened.push_str(&text[..cut]);
            shortened.push('…');
            shortened
        }
    }
}
/// Makes `s` safe to embed in a single line of Markdown.
///
/// Backslash, backtick, `*`, `_` and `|` are prefixed with a backslash, and
/// each `\n` or `\r` is replaced by one space (so `\r\n` becomes two spaces).
/// All other characters are copied through unchanged.
pub fn md_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for c in s.chars() {
        if matches!(c, '\n' | '\r') {
            // Line breaks are flattened rather than escaped.
            escaped.push(' ');
            continue;
        }
        if matches!(c, '\\' | '`' | '*' | '_' | '|') {
            escaped.push('\\');
        }
        escaped.push(c);
    }
    escaped
}
/// Extracts the e-mail address from a `From`-style header value.
///
/// Two shapes are recognised:
///
/// * `Display Name <user@host>` — the text between the last `<` and the first
///   `>` that follows it is returned (trimmed), provided it contains `@`.
/// * a bare `user@host` — returned when the trimmed input contains `@` and no
///   space.
///
/// Returns `None` when neither shape yields something containing `@`.
pub fn extract_email(from: &str) -> Option<String> {
    let s = from.trim();

    // Pair the last '<' with the first '>' after it. Searching for the last
    // '>' in the whole string would swallow trailing text such as a comment
    // that itself contains '>'.
    if let Some((_, after_open)) = s.rsplit_once('<') {
        if let Some((inner, _)) = after_open.split_once('>') {
            let inner = inner.trim();
            if inner.contains('@') {
                return Some(inner.to_string());
            }
        }
    }

    // No usable angle-bracket form: accept a bare address.
    if s.contains('@') && !s.contains(' ') {
        return Some(s.to_string());
    }
    None
}
/// Removes a leading `"Mon, "`-style weekday prefix.
///
/// The input is split at the first `", "`. If the part before it is a
/// three-letter English weekday abbreviation (ASCII case-insensitive), the
/// part after it is returned; otherwise — or when there is no `", "` — the
/// result is `None`.
fn strip_day_of_week_prefix(s: &str) -> Option<&str> {
    const DAYS: [&str; 7] = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"];

    s.split_once(", ")
        .filter(|(head, _)| DAYS.iter().any(|&day| head.eq_ignore_ascii_case(day)))
        .map(|(_, tail)| tail)
}
/// Finds and parses the timestamp of a message given as JSON.
///
/// The following fields are tried in order, and the first one that
/// `parse_date_value` can parse wins:
/// 1. `date`
/// 2. `internalDate`
/// 3. `data.internalDate`
///
/// Returns `None` when none of them is present and parseable.
pub fn parse_message_date(m: &Value) -> Option<DateTime<Utc>> {
    let top_level = |key: &str| m.get(key).and_then(parse_date_value);

    top_level("date")
        .or_else(|| top_level("internalDate"))
        .or_else(|| {
            m.get("data")?
                .get("internalDate")
                .and_then(parse_date_value)
        })
}
/// Converts one JSON value into a UTC timestamp.
///
/// A JSON integer is read as milliseconds since the Unix epoch. A JSON string
/// is trimmed and then tried, in order, as:
/// 1. an integer number of epoch milliseconds (no further formats are tried
///    once the text parses as an integer, even if it is out of range),
/// 2. RFC 3339,
/// 3. RFC 2822,
/// 4. `%d %b %Y %H:%M:%S %z` after removing a weekday prefix,
/// 5. a plain `%Y-%m-%d` date, taken as midnight UTC.
///
/// Empty strings, unparseable strings and all other JSON types give `None`.
fn parse_date_value(raw: &Value) -> Option<DateTime<Utc>> {
    let text = match raw.as_str() {
        Some(text) => text.trim(),
        // Not a string: only an integer number of milliseconds is accepted.
        None => return raw.as_i64().and_then(DateTime::from_timestamp_millis),
    };
    if text.is_empty() {
        return None;
    }
    if let Ok(millis) = text.parse::<i64>() {
        return DateTime::from_timestamp_millis(millis);
    }

    DateTime::parse_from_rfc3339(text)
        .or_else(|_| DateTime::parse_from_rfc2822(text))
        .ok()
        .or_else(|| {
            // Second chance for RFC 2822-like dates once the weekday is removed.
            strip_day_of_week_prefix(text).and_then(|rest| {
                DateTime::parse_from_str(rest, "%d %b %Y %H:%M:%S %z").ok()
            })
        })
        .map(|dt| dt.with_timezone(&Utc))
        .or_else(|| {
            NaiveDate::parse_from_str(text, "%Y-%m-%d")
                .ok()
                .and_then(|day| day.and_hms_opt(0, 0, 0))
                .map(|midnight| midnight.and_utc())
        })
}
// Unit tests are kept in the separate file named by `#[path]` and are only
// compiled for test builds.
#[cfg(test)]
#[path = "email_clean_tests.rs"]
mod tests;