use crate::sha1::Sha1;
use encoding_rs::Encoding;
/// Default sampling windows used by `digest_message` and `predigest_message`.
///
/// Each entry is `(offset, length)`: `predigest_with_spec` treats `offset` as a
/// percentage of the number of normalized lines and takes `length` consecutive
/// lines starting at that position.
pub const DIGEST_SPEC: &[(usize, usize)] = &[(20, 3), (60, 3)];
/// Expected length, in characters, of the hex string returned by `digest_with_spec`
/// (checked there with a `debug_assert_eq!`).
pub const HASH_SIZE: usize = 40;
/// One message entity: its header fields plus the still-encoded body bytes.
#[derive(Clone, Debug)]
struct Part {
    /// Header `(name, value)` pairs in order of appearance; folded continuation
    /// lines are already appended to the value they continue.
    headers: Vec<(String, String)>,
    /// Body bytes after newline normalization, before any transfer or charset decoding.
    body: Vec<u8>,
}

impl Part {
    /// Splits raw entity bytes into headers and body.
    ///
    /// Newlines are normalized to `\n` first. The first blank line separates the
    /// header block from the body. When there is no blank line, the input is
    /// treated as headers only if its first line looks like a header field, and
    /// as body only otherwise.
    fn parse(bytes: &[u8]) -> Self {
        let normalized = normalize_message_newlines(bytes);

        // Byte offsets where the header block ends and where the body begins.
        let (header_end, body_start) = if let Some(blank) = find_subslice(&normalized, b"\n\n") {
            (blank, blank + 2)
        } else {
            let starts_with_header = String::from_utf8_lossy(&normalized)
                .lines()
                .next()
                .map_or(false, valid_header_line);
            if starts_with_header {
                (normalized.len(), normalized.len())
            } else {
                (0, 0)
            }
        };

        let body = normalized[body_start..].to_vec();
        let header_text = String::from_utf8_lossy(&normalized[..header_end]);

        let mut headers: Vec<(String, String)> = Vec::new();
        for line in header_text.lines() {
            let is_continuation = line.starts_with(|ch| ch == ' ' || ch == '\t');
            if is_continuation {
                // A folded line extends the most recently parsed header, if any.
                if let Some((_, value)) = headers.last_mut() {
                    value.push(' ');
                    value.push_str(line.trim());
                }
            } else if let Some((name, value)) = line.split_once(':') {
                headers.push((name.trim().to_string(), value.trim().to_string()));
            }
            // Lines without a colon that are not continuations are skipped.
        }

        Self { headers, body }
    }

    /// Returns the value of the first header whose name matches `name`,
    /// compared ASCII case-insensitively.
    fn header(&self, name: &str) -> Option<&str> {
        self.headers
            .iter()
            .find_map(|(key, value)| key.eq_ignore_ascii_case(name).then(|| value.as_str()))
    }

    /// Parses the `Content-Type` header, falling back to `text/plain` when absent.
    fn content_type(&self) -> ContentType {
        let raw = self.header("Content-Type").unwrap_or("text/plain");
        ContentType::parse(raw)
    }

    /// Returns the trimmed `Content-Transfer-Encoding` value, or `7bit` when absent.
    fn transfer_encoding(&self) -> &str {
        match self.header("Content-Transfer-Encoding") {
            Some(value) => value.trim(),
            None => "7bit",
        }
    }
}
/// Returns a copy of `bytes` in which every `\r\n` pair and every lone `\r`
/// is replaced by a single `\n`.
fn normalize_message_newlines(bytes: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(bytes.len());
    let mut pending = bytes.iter().copied().peekable();
    while let Some(byte) = pending.next() {
        if byte != b'\r' {
            out.push(byte);
            continue;
        }
        // Swallow the `\n` of a CRLF pair so it produces only one newline.
        if pending.peek() == Some(&b'\n') {
            pending.next();
        }
        out.push(b'\n');
    }
    out
}
/// Returns the index of the first occurrence of `needle` in `haystack`.
///
/// An empty `needle` matches at index 0; a `needle` longer than `haystack`
/// never matches.
fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    if needle.len() > haystack.len() {
        return None;
    }
    let last_start = haystack.len() - needle.len();
    (0..=last_start).find(|&start| haystack[start..].starts_with(needle))
}
/// Reports whether `line` looks like a header field: a non-empty name made only
/// of ASCII letters, digits and `-`, followed by a colon.
fn valid_header_line(line: &str) -> bool {
    match line.split_once(':') {
        Some((name, _)) if !name.is_empty() => name
            .bytes()
            .all(|byte| byte.is_ascii_alphanumeric() || byte == b'-'),
        _ => false,
    }
}
/// A parsed `Content-Type` header value.
#[derive(Clone, Debug)]
struct ContentType {
    /// Lower-cased main type, e.g. `text` or `multipart`.
    main: String,
    /// Lower-cased subtype, e.g. `plain` or `html`.
    sub: String,
    /// Value of the `boundary` parameter, if present.
    boundary: Option<String>,
    /// Value of the `charset` parameter, if present.
    charset: Option<String>,
}

impl ContentType {
    /// Parses a header value such as `text/html; charset="utf-8"`.
    ///
    /// The media type is lower-cased and defaults to `text/plain` when it has no
    /// `/`. Parameter names are matched case-insensitively; parameter values are
    /// trimmed, stripped of surrounding double quotes and of NUL characters.
    /// When a parameter is repeated, the last occurrence wins.
    fn parse(value: &str) -> Self {
        let mut segments = value.split(';');
        let media = segments
            .next()
            .unwrap_or("text/plain")
            .trim()
            .to_lowercase();
        let (main, sub) = match media.split_once('/') {
            Some((main, sub)) => (main.trim().to_string(), sub.trim().to_string()),
            None => (String::from("text"), String::from("plain")),
        };

        let mut parsed = Self {
            main,
            sub,
            boundary: None,
            charset: None,
        };
        // Segments without `=` are not parameters and are ignored.
        for (key, raw) in segments.filter_map(|segment| segment.split_once('=')) {
            let cleaned = raw.trim().trim_matches('"').replace('\0', "");
            match key.trim().to_lowercase().as_str() {
                "boundary" => parsed.boundary = Some(cleaned),
                "charset" => parsed.charset = Some(cleaned),
                _ => {}
            }
        }
        parsed
    }
}
/// Computes the hex digest of a single message using the default `DIGEST_SPEC`.
///
/// Equivalent to `digest_with_spec(bytes, DIGEST_SPEC)`.
pub fn digest_message(bytes: &[u8]) -> String {
digest_with_spec(bytes, DIGEST_SPEC)
}
/// Computes the hex digest of a message using the sampling windows in `spec`.
///
/// The lines selected by `predigest_with_spec` are fed, in order and without
/// separators, into a `Sha1` hasher; its hex digest is returned.
pub fn digest_with_spec(bytes: &[u8], spec: &[(usize, usize)]) -> String {
    let selected = predigest_with_spec(bytes, spec);
    let mut hasher = Sha1::new();
    for line in &selected {
        hasher.update(line.as_bytes());
    }
    let hex = hasher.hexdigest();
    // Debug-only sanity check on the digest length.
    debug_assert_eq!(hex.len(), HASH_SIZE);
    hex
}
/// Returns the normalized lines that would be hashed for a message, using the
/// default `DIGEST_SPEC`.
///
/// Equivalent to `predigest_with_spec(bytes, DIGEST_SPEC)`.
pub fn predigest_message(bytes: &[u8]) -> Vec<String> {
predigest_with_spec(bytes, DIGEST_SPEC)
}
pub fn predigest_with_spec(bytes: &[u8], spec: &[(usize, usize)]) -> Vec<String> {
let root = Part::parse(bytes);
let mut payloads = Vec::new();
digest_payloads(&root, &mut payloads);
let mut lines = Vec::new();
for payload in payloads {
for line in payload.lines() {
let normalized = normalize(line);
if should_handle_line(&normalized) {
lines.push(normalized);
}
}
}
if lines.len() <= 4 {
lines
} else {
let mut selected = Vec::new();
for (offset, length) in spec {
for i in 0..*length {
let index = offset * lines.len() / 100 + i;
if let Some(line) = lines.get(index) {
selected.push(line.clone());
}
}
}
selected
}
}
/// Computes one digest per message found in an mbox-formatted byte buffer.
///
/// Messages are separated with `split_mbox` and digested with `digest_message`,
/// preserving their order.
pub fn digest_mbox(bytes: &[u8]) -> Vec<String> {
    let messages = split_mbox(bytes);
    let mut digests = Vec::with_capacity(messages.len());
    for message in &messages {
        digests.push(digest_message(message));
    }
    digests
}
/// Splits an mbox-formatted buffer into its individual messages.
///
/// Lines starting with `From ` are separators and are dropped. The lines of each
/// message are re-joined with `\n` (input `\r\n` and lone `\r` count as line
/// breaks). Content before the first separator stays part of the first message.
/// When the input has no separator at all, it is returned as a single message,
/// even if empty.
///
/// The split works on raw bytes, so message content that is not valid UTF-8
/// (for example 8-bit Latin-1 bodies) is passed through unchanged instead of
/// being replaced with U+FFFD before its charset is known.
pub fn split_mbox(bytes: &[u8]) -> Vec<Vec<u8>> {
    /// Splits `bytes` into lines on `\r\n`, `\r` or `\n`; terminators are
    /// removed and a trailing terminator does not produce an extra empty line.
    fn raw_lines(bytes: &[u8]) -> Vec<&[u8]> {
        let mut lines = Vec::new();
        let mut start = 0;
        let mut index = 0;
        while index < bytes.len() {
            match bytes[index] {
                b'\n' => {
                    lines.push(&bytes[start..index]);
                    index += 1;
                    start = index;
                }
                b'\r' => {
                    lines.push(&bytes[start..index]);
                    index += 1;
                    if bytes.get(index) == Some(&b'\n') {
                        index += 1;
                    }
                    start = index;
                }
                _ => index += 1,
            }
        }
        if start < bytes.len() {
            lines.push(&bytes[start..]);
        }
        lines
    }

    let mut messages = Vec::new();
    let mut current: Vec<&[u8]> = Vec::new();
    let mut seen_boundary = false;
    for line in raw_lines(bytes) {
        if line.starts_with(b"From ") {
            // Only flush once a separator has been seen, so that any preamble
            // stays attached to the first message.
            if seen_boundary && !current.is_empty() {
                messages.push(current.join(&b'\n'));
                current.clear();
            }
            seen_boundary = true;
            continue;
        }
        current.push(line);
    }
    if !current.is_empty() || !seen_boundary {
        messages.push(current.join(&b'\n'));
    }
    messages
}
fn digest_payloads(part: &Part, out: &mut Vec<String>) {
let content_type = part.content_type();
if content_type.main == "multipart" {
if let Some(boundary) = content_type.boundary {
for child in split_multipart(&part.body, &boundary) {
digest_payloads(&Part::parse(&child), out);
}
}
return;
}
if content_type.main == "text" {
let decoded_bytes = decode_transfer(&part.body, part.transfer_encoding());
let charset = content_type.charset.as_deref().unwrap_or("ascii");
let payload = decode_charset(&decoded_bytes, charset);
if content_type.sub == "html" {
out.push(normalize_html_part(&payload));
} else {
out.push(payload);
}
} else {
out.push(String::from_utf8_lossy(&part.body).to_string());
}
}
/// Splits a multipart body into the raw bytes of its child parts.
///
/// A line equal to `--boundary` (ignoring trailing whitespace) starts a new part;
/// a line equal to `--boundary--` ends the multipart body. Text before the first
/// delimiter and after the closing delimiter is discarded. Part lines are
/// re-joined with `\n` (input `\r\n` and lone `\r` count as line breaks). A
/// missing closing delimiter is tolerated: the trailing part is still returned.
///
/// The split works on raw bytes, so child parts that are not valid UTF-8 reach
/// transfer/charset decoding unchanged instead of having their bytes replaced
/// with U+FFFD first.
fn split_multipart(bytes: &[u8], boundary: &str) -> Vec<Vec<u8>> {
    /// Splits `bytes` into lines on `\r\n`, `\r` or `\n`; terminators are
    /// removed and a trailing terminator does not produce an extra empty line.
    fn raw_lines(bytes: &[u8]) -> Vec<&[u8]> {
        let mut lines = Vec::new();
        let mut start = 0;
        let mut index = 0;
        while index < bytes.len() {
            match bytes[index] {
                b'\n' => {
                    lines.push(&bytes[start..index]);
                    index += 1;
                    start = index;
                }
                b'\r' => {
                    lines.push(&bytes[start..index]);
                    index += 1;
                    if bytes.get(index) == Some(&b'\n') {
                        index += 1;
                    }
                    start = index;
                }
                _ => index += 1,
            }
        }
        if start < bytes.len() {
            lines.push(&bytes[start..]);
        }
        lines
    }

    /// Drops trailing ASCII whitespace (including vertical tab) from `line`.
    fn trim_trailing_ws(line: &[u8]) -> &[u8] {
        let end = line
            .iter()
            .rposition(|byte| !(byte.is_ascii_whitespace() || *byte == 0x0b))
            .map_or(0, |last| last + 1);
        &line[..end]
    }

    let marker = format!("--{}", boundary).into_bytes();
    let closing = format!("--{}--", boundary).into_bytes();

    let mut parts = Vec::new();
    let mut current: Vec<&[u8]> = Vec::new();
    let mut inside = false;
    let mut closed = false;
    for line in raw_lines(bytes) {
        let delimiter = trim_trailing_ws(line);
        if delimiter == marker.as_slice() {
            if inside && !current.is_empty() {
                parts.push(current.join(&b'\n'));
                current.clear();
            }
            inside = true;
            continue;
        }
        if delimiter == closing.as_slice() {
            if inside && !current.is_empty() {
                parts.push(current.join(&b'\n'));
            }
            closed = true;
            break;
        }
        if inside {
            current.push(line);
        }
    }
    // Unterminated multipart body: keep whatever was collected for the last part.
    if inside && !closed && !current.is_empty() {
        parts.push(current.join(&b'\n'));
    }
    parts
}
pub fn normalize(input: &str) -> String {
let without_nuls = input.replace('\0', "");
let mut out = String::new();
let mut token = String::new();
for ch in without_nuls.chars() {
if ch.is_whitespace() {
push_normalized_token(&mut out, &token);
token.clear();
} else {
token.push(ch);
}
}
push_normalized_token(&mut out, &token);
out.trim().to_string()
}
/// Appends `token` to `out` unless it is unwanted.
///
/// A token is unwanted when it is empty, is ten or more characters long, or
/// looks like an e-mail address or a URL.
fn push_normalized_token(out: &mut String, token: &str) {
    let unwanted = token.is_empty()
        || token.chars().count() >= 10
        || looks_like_email(token)
        || looks_like_url(token);
    if !unwanted {
        out.push_str(token);
    }
}
/// Reports whether a whitespace-free `token` looks like an e-mail address:
/// it contains an `@` with at least one character on each side.
///
/// Every `@` is considered, not just the first, so a token such as
/// `@user@host` (leading `@`, then a valid one) is recognised.
fn looks_like_email(token: &str) -> bool {
    let bytes = token.as_bytes();
    // `@` is ASCII and never occurs inside a multi-byte UTF-8 sequence, so a
    // byte-level search over the interior of the token is exact.
    bytes.len() >= 3 && bytes[1..bytes.len() - 1].contains(&b'@')
}
/// Reports whether a whitespace-free `token` looks like a URL: one or more ASCII
/// letters, a colon, and at least one further character.
///
/// Requiring something after the colon keeps ordinary words that merely end in
/// a colon (e.g. `Note:`) from being classified as URLs and dropped.
fn looks_like_url(token: &str) -> bool {
    match token.split_once(':') {
        Some((scheme, rest)) => {
            !scheme.is_empty()
                && !rest.is_empty()
                && scheme.chars().all(|ch| ch.is_ascii_alphabetic())
        }
        None => false,
    }
}
/// Reports whether a normalized line is long enough to take part in the digest.
///
/// The length is measured in characters, consistent with the token-length check
/// in `push_normalized_token`, so multi-byte text is not accepted earlier than
/// ASCII text of the same visible length.
fn should_handle_line(line: &str) -> bool {
    /// Minimum number of characters a normalized line must have.
    const MIN_LINE_LENGTH: usize = 8;
    // `nth` stops as soon as the threshold is reached instead of walking the whole line.
    line.chars().nth(MIN_LINE_LENGTH - 1).is_some()
}
/// Extracts the visible text of an HTML document.
///
/// Everything between `<` and the next `>` is treated as a tag and dropped. Text
/// between tags is trimmed, entity-decoded by `push_html_text`, and the non-empty
/// pieces are joined with single spaces. Text inside `script` and `style`
/// elements is discarded.
pub fn normalize_html_part(input: &str) -> String {
    let mut pieces: Vec<String> = Vec::new();
    let mut text = String::new();
    let mut tag = String::new();
    let mut inside_tag = false;
    // False while inside a `script`/`style` element.
    let mut keep = true;

    for ch in input.chars() {
        if inside_tag {
            if ch != '>' {
                tag.push(ch);
                continue;
            }
            let name = tag_name(&tag);
            if name == "script" || name == "style" {
                // An opening tag stops collection; the closing tag resumes it.
                keep = tag.trim_start().starts_with('/');
            }
            inside_tag = false;
        } else if ch == '<' {
            push_html_text(&mut pieces, &text, keep);
            text.clear();
            tag.clear();
            inside_tag = true;
        } else {
            text.push(ch);
        }
    }

    push_html_text(&mut pieces, &text, keep);
    pieces.join(" ")
}
/// Appends the trimmed, entity-decoded `text` to `data` when `collect` is set
/// and the trimmed text is not empty.
fn push_html_text(data: &mut Vec<String>, text: &str, collect: bool) {
    if !collect {
        return;
    }
    let trimmed = text.trim();
    if !trimmed.is_empty() {
        data.push(html_unescape(trimmed));
    }
}
/// Returns the lower-cased element name from the text between `<` and `>`.
///
/// Leading `/` characters (closing tags) are skipped, attributes are ignored,
/// and `/` characters around the name (self-closing tags) are stripped.
fn tag_name(tag: &str) -> String {
    match tag.trim_start_matches('/').split_whitespace().next() {
        Some(word) => word.trim_matches('/').to_lowercase(),
        None => String::new(),
    }
}
/// Decodes the small set of HTML character references this module understands:
/// the named references `nbsp`, `amp`, `lt`, `gt`, `quot` and the numeric
/// reference `#39` (apostrophe). `nbsp` decodes to a plain space.
///
/// Decoding is a single left-to-right pass, so decoded output is never decoded
/// a second time: an escaped ampersand followed by `lt;` yields the literal
/// text of the `lt` reference rather than `<`. Unknown references and bare
/// ampersands are copied through unchanged.
fn html_unescape(value: &str) -> String {
    // Reference bodies (the text after the ampersand, including the terminating
    // semicolon) paired with the character they stand for.
    const ENTITIES: [(&str, char); 6] = [
        ("nbsp;", ' '),
        ("amp;", '&'),
        ("lt;", '<'),
        ("gt;", '>'),
        ("quot;", '"'),
        ("#39;", '\''),
    ];

    let mut out = String::with_capacity(value.len());
    let mut rest = value;
    while let Some(at) = rest.find('&') {
        out.push_str(&rest[..at]);
        let after = &rest[at + 1..];
        let hit = ENTITIES
            .iter()
            .find_map(|&(body, ch)| after.strip_prefix(body).map(|tail| (ch, tail)));
        match hit {
            Some((ch, tail)) => {
                out.push(ch);
                rest = tail;
            }
            None => {
                // Not a known reference: keep the ampersand literally.
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}
/// Undoes the content transfer encoding named by `encoding`.
///
/// The name is trimmed and compared case-insensitively. `quoted-printable`
/// (or `quopri`) and `base64` are decoded; any other value returns a copy of
/// `bytes` unchanged.
fn decode_transfer(bytes: &[u8], encoding: &str) -> Vec<u8> {
    let label = encoding.trim().to_lowercase();
    if label == "quoted-printable" || label == "quopri" {
        decode_quoted_printable(bytes)
    } else if label == "base64" {
        decode_base64(bytes)
    } else {
        bytes.to_vec()
    }
}
fn decode_quoted_printable(bytes: &[u8]) -> Vec<u8> {
let mut out = Vec::with_capacity(bytes.len());
let mut i = 0;
while i < bytes.len() {
if bytes[i] == b'=' {
if i + 1 < bytes.len() && (bytes[i + 1] == b'\n' || bytes[i + 1] == b'\r') {
i += 2;
if i < bytes.len() && bytes[i - 1] == b'\r' && bytes[i] == b'\n' {
i += 1;
}
continue;
}
if i + 2 < bytes.len()
&& let (Some(hi), Some(lo)) = (hex_val(bytes[i + 1]), hex_val(bytes[i + 2]))
{
out.push((hi << 4) | lo);
i += 3;
continue;
}
}
out.push(bytes[i]);
i += 1;
}
out
}
fn decode_base64(bytes: &[u8]) -> Vec<u8> {
let mut out = Vec::new();
let mut buffer = 0u32;
let mut bits = 0u8;
for byte in bytes.iter().copied().filter(|b| !b.is_ascii_whitespace()) {
if byte == b'=' {
break;
}
let Some(value) = base64_val(byte) else {
continue;
};
buffer = (buffer << 6) | value as u32;
bits += 6;
if bits >= 8 {
bits -= 8;
out.push(((buffer >> bits) & 0xff) as u8);
}
}
out
}
/// Returns the 6-bit value of a standard base64 alphabet character, or `None`
/// for any other byte (including the `=` padding character).
fn base64_val(byte: u8) -> Option<u8> {
    const ALPHABET: &[u8; 64] =
        b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    ALPHABET
        .iter()
        .position(|&symbol| symbol == byte)
        .map(|index| index as u8)
}
/// Returns the value of an ASCII hexadecimal digit (either case), or `None`
/// for any other byte.
fn hex_val(byte: u8) -> Option<u8> {
    // Bytes >= 0x80 map to non-digit characters, so they yield `None` as well.
    char::from(byte).to_digit(16).map(|digit| digit as u8)
}
/// Decodes `bytes` to text according to the charset label `charset`.
///
/// The label is lower-cased, `_` is replaced by `-`, and NUL characters are
/// removed before matching. ASCII and the quoted-printable pseudo-charsets drop
/// non-ASCII bytes, UTF-8 drops invalid sequences, and Latin-1 maps each byte to
/// the code point of the same value. Any other label is tried through
/// `decode_registered_charset`, falling back to ASCII-with-ignore when that
/// returns `None`.
fn decode_charset(bytes: &[u8], charset: &str) -> String {
    let label: String = charset
        .to_lowercase()
        .chars()
        .filter(|&ch| ch != '\0')
        .map(|ch| if ch == '_' { '-' } else { ch })
        .collect();

    match label.as_str() {
        "utf8" | "utf-8" => decode_utf8_ignore(bytes),
        "iso-8859-1" | "latin-1" | "latin1" => bytes.iter().copied().map(char::from).collect(),
        "ascii" | "us-ascii" | "quopri-codec" | "quopri" | "quoted-printable"
        | "quotedprintable" => decode_ascii_ignore(bytes),
        other => match decode_registered_charset(bytes, other) {
            Some(text) => text,
            None => decode_ascii_ignore(bytes),
        },
    }
}
/// Decodes `bytes` with the `encoding_rs` encoding registered under the label
/// `charset`.
///
/// Returns `None` when the label is unknown or when
/// `decode_without_bom_handling_and_without_replacement` yields `None` for the input.
fn decode_registered_charset(bytes: &[u8], charset: &str) -> Option<String> {
    let encoding = Encoding::for_label(charset.as_bytes())?;
    let decoded = encoding.decode_without_bom_handling_and_without_replacement(bytes)?;
    Some(decoded.into_owned())
}
/// Decodes `bytes` as ASCII, silently dropping every byte that is not ASCII.
fn decode_ascii_ignore(bytes: &[u8]) -> String {
    let mut text = String::with_capacity(bytes.len());
    for &byte in bytes {
        if byte.is_ascii() {
            text.push(char::from(byte));
        }
    }
    text
}
/// Decodes `bytes` as UTF-8, silently dropping every invalid or incomplete
/// sequence instead of substituting a replacement character.
fn decode_utf8_ignore(mut bytes: &[u8]) -> String {
    let mut decoded = String::with_capacity(bytes.len());
    loop {
        let error = match std::str::from_utf8(bytes) {
            Ok(rest) => {
                decoded.push_str(rest);
                return decoded;
            }
            Err(error) => error,
        };
        let (valid, invalid) = bytes.split_at(error.valid_up_to());
        decoded.push_str(
            std::str::from_utf8(valid).expect("bytes before valid_up_to are valid UTF-8"),
        );
        // `error_len` is `None` for a truncated trailing sequence; skip one byte
        // at a time in that case until the remainder is consumed.
        bytes = &invalid[error.error_len().unwrap_or(1)..];
    }
}
// Unit tests for the public digest, predigest and normalization entry points.
#[cfg(test)]
mod tests {
use super::{digest_message, normalize, normalize_html_part, predigest_message};
// Pins the digest of a headerless one-line message to a fixed reference value
// (presumably produced by the Python implementation, per the test name).
#[test]
fn digest_matches_python_simple() {
assert_eq!(
digest_message(b"That's some good ham right there"),
"0e01d5b816fe609f991576834db4da3c182bcef6"
);
}
// A NUL byte inside the message must not change the digest.
#[test]
fn digest_removes_nulls() {
assert_eq!(
digest_message(b"That's some good ham rig\0ht there"),
"0e01d5b816fe609f991576834db4da3c182bcef6"
);
}
// Short messages are returned whole; a 100-line message is sampled at the
// 20% and 60% offsets, three lines each.
#[test]
fn predigest_atomic_and_pieced() {
assert_eq!(
predigest_message(b"All this message\nShould be included\nIn the predigest"),
vec!["Allthismessage", "Shouldbeincluded", "Inthepredigest"]
);
let mut msg = String::new();
for i in 0..100 {
msg.push_str(&format!("Line{} test test test\n", i));
}
assert_eq!(
predigest_message(msg.as_bytes()),
vec![
"Line20testtesttest",
"Line21testtesttest",
"Line22testtesttest",
"Line60testtesttest",
"Line61testtesttest",
"Line62testtesttest",
]
);
}
// E-mail addresses, URLs and long tokens are dropped; whitespace is removed.
#[test]
fn normalizes_tokens() {
assert_eq!(normalize("Test test@example.com Test2"), "TestTest2");
assert_eq!(normalize("Test http://example.com Test2"), "TestTest2");
assert_eq!(normalize("Test 3sddkf9jdkd9 Test2"), "TestTest2");
}
// Content of <style> and <script> elements (any case) is not part of the text.
#[test]
fn strips_html_script_and_style() {
assert_eq!(
normalize_html_part(
r#"<html><style>style</style><SCRIPT>script</SCRIPT><body>This is a test.</body></html>"#
),
"This is a test."
);
}
}