use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, HashSet};
/// Decodes RFC 2231 extended (`name*`) and continued (`name*N` / `name*N*`)
/// parameters, returning a new parameter list.
///
/// Processing happens in four passes:
///
/// 1. Every key is classified with `classify_key`. Standalone extended values
///    (`name*`) are decoded immediately. Continuation segments are grouped by
///    their ASCII-lowercased base name; a placeholder is reserved in the output
///    at the position where the first segment of the group was seen. Keys that
///    do not classify are copied through unchanged.
/// 2. Each continuation group is reassembled in ascending index order. Assembly
///    stops at the first missing index (a group without segment 0 therefore
///    yields an empty value). A duplicated index keeps the first value seen.
/// 3. Any entry that was *not* produced by RFC 2231 extended decoding but shares
///    a name (ASCII case-insensitively) with one that was is dropped, so the
///    extended form wins over a plain duplicate.
/// 4. Remaining non-extended values that contain both `=?` and `?=` are passed
///    through `crate::codec::decode::decode_rfc2047` as a fallback.
///
/// The relative order of surviving parameters follows the input order.
// The function is a single linear pipeline; splitting it would scatter the
// shared index bookkeeping, hence the lint exemption.
#[allow(clippy::too_many_lines)]
pub fn decode_rfc2231_params(params: &[(String, String)]) -> Vec<(String, String)> {
    // (lowercased base name, slot index in `result`, index -> (raw value, encoded flag))
    type ContinuationGroup = (String, usize, BTreeMap<u32, (String, bool)>);

    let mut result: Vec<(String, String)> = Vec::with_capacity(params.len());
    let mut continuations: Vec<ContinuationGroup> = Vec::new();
    // Positions in `result` whose value came from RFC 2231 extended decoding.
    let mut rfc2231_decoded: HashSet<usize> = HashSet::new();

    // Pass 1: classify keys, decode standalone values, collect continuation segments.
    for (key, value) in params {
        if let Some(classification) = classify_key(key) {
            match classification {
                KeyClass::StandaloneEncoded { base_name } => {
                    let decoded = decode_charset_value(value);
                    rfc2231_decoded.insert(result.len());
                    result.push((base_name, decoded));
                }
                KeyClass::Continuation {
                    base_name,
                    index,
                    encoded,
                } => {
                    let lower = base_name.to_ascii_lowercase();
                    let group = continuations.iter_mut().find(|(name, _, _)| *name == lower);
                    if let Some((_, _, segments)) = group {
                        match segments.entry(index) {
                            Entry::Vacant(e) => {
                                e.insert((value.clone(), encoded));
                            }
                            Entry::Occupied(_) => {
                                tracing::warn!(
                                    base_name = lower.as_str(),
                                    index = index,
                                    "RFC 2231 Section 3: duplicate continuation index {}, keeping first value",
                                    index,
                                );
                            }
                        }
                    } else {
                        // First segment of a new group: reserve its output slot now so
                        // the reassembled value keeps this position in the result.
                        let insert_pos = result.len();
                        result.push((String::new(), String::new()));
                        let mut segments = BTreeMap::new();
                        segments.insert(index, (value.clone(), encoded));
                        continuations.push((lower, insert_pos, segments));
                    }
                }
            }
        } else {
            result.push((key.clone(), value.clone()));
        }
    }

    // Pass 2: reassemble each continuation group into its reserved slot.
    for (lower_name, insert_pos, segments) in continuations {
        let first_encoded = segments.get(&0).is_some_and(|(_, enc)| *enc);
        let mut charset: Option<String> = None;
        let mut raw_bytes = Vec::new();
        let mut expected_idx: u32 = 0;
        // BTreeMap iteration is ordered by index, so any jump past `expected_idx` is a gap.
        for (idx, (value, is_encoded)) in &segments {
            if *idx > expected_idx {
                tracing::warn!(
                    base_name = lower_name.as_str(),
                    expected = expected_idx,
                    actual = *idx,
                    "RFC 2231 Section 3: gap in continuation (expected index {}, found {}); \
                     truncating assembly at segment {}",
                    expected_idx,
                    idx,
                    expected_idx,
                );
                break;
            }
            expected_idx = idx + 1;
            if *idx == 0 && first_encoded && *is_encoded {
                // Only segment 0 carries the charset'language' prefix.
                let (cs, bytes) = split_charset_value(value);
                charset = cs;
                raw_bytes.extend_from_slice(&bytes);
            } else if *is_encoded {
                raw_bytes.extend_from_slice(&percent_decode(value));
            } else {
                raw_bytes.extend_from_slice(value.as_bytes());
            }
        }
        let decoded = match &charset {
            Some(cs) => decode_bytes_with_charset(cs, &raw_bytes),
            None => String::from_utf8_lossy(&raw_bytes).into_owned(),
        };
        let original_base = find_original_base_name(params, &lower_name);
        // A group whose first segment uses the extended syntax (`name*0*`) is an
        // RFC 2231 extended value even when its charset field is left blank, which
        // RFC 2231 Section 4 explicitly permits. Keying this on the presence of a
        // charset would let such a value lose to a plain duplicate and be
        // re-decoded as RFC 2047, unlike the standalone `name*` form above.
        if first_encoded {
            rfc2231_decoded.insert(insert_pos);
        }
        result[insert_pos] = (original_base, decoded);
    }

    // Pass 3: drop plain entries that are shadowed by an RFC 2231-decoded entry.
    let mut rfc2231_names: HashSet<String> = HashSet::new();
    for &idx in &rfc2231_decoded {
        if let Some((key, _)) = result.get(idx) {
            rfc2231_names.insert(key.to_ascii_lowercase());
        }
    }
    if !rfc2231_names.is_empty() {
        let mut new_result: Vec<(String, String)> = Vec::with_capacity(result.len());
        let mut new_decoded: HashSet<usize> = HashSet::new();
        for (i, entry) in result.into_iter().enumerate() {
            let dominated = !rfc2231_decoded.contains(&i)
                && rfc2231_names.contains(&entry.0.to_ascii_lowercase());
            if !dominated {
                // Re-map the "decoded" marker to the entry's new position.
                if rfc2231_decoded.contains(&i) {
                    new_decoded.insert(new_result.len());
                }
                new_result.push(entry);
            }
        }
        result = new_result;
        rfc2231_decoded = new_decoded;
    }

    // Pass 4: RFC 2047 encoded-word fallback for values that were not RFC 2231-decoded.
    for (i, (_key, value)) in result.iter_mut().enumerate() {
        if rfc2231_decoded.contains(&i) {
            continue;
        }
        if value.contains("=?") && value.contains("?=") {
            *value = crate::codec::decode::decode_rfc2047(value.as_bytes());
        }
    }
    result
}
/// Classification of a parameter key that uses RFC 2231 `*` syntax.
enum KeyClass {
    /// `name*`: a single extended value (`charset'language'percent-encoded`).
    StandaloneEncoded { base_name: String },
    /// `name*N` or `name*N*`: segment `N` of a continued value.
    Continuation {
        /// Text before the first `*`, in its original case.
        base_name: String,
        /// Decimal segment index.
        index: u32,
        /// `true` when the key ends with a trailing `*` (segment is percent-encoded).
        encoded: bool,
    },
}

/// Classifies `key` by its RFC 2231 suffix.
///
/// Returns `None` when the key contains no `*`, or when the text after the
/// first `*` is not a valid continuation index. An index is valid only if it
/// consists solely of ASCII digits, has no leading zero (other than `0`
/// itself) and fits in a `u32`.
fn classify_key(key: &str) -> Option<KeyClass> {
    let star_pos = key.find('*')?;
    let base_name = key[..star_pos].to_owned();
    let suffix = &key[star_pos + 1..];
    if suffix.is_empty() {
        return Some(KeyClass::StandaloneEncoded { base_name });
    }
    let (digits, is_encoded) = match suffix.strip_suffix('*') {
        Some(stripped) => (stripped, true),
        None => (suffix, false),
    };
    // `str::parse::<u32>` accepts a leading `+`, so `name*+1` would otherwise be
    // treated as segment 1. Insist on plain decimal digits before parsing.
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }
    // Leading zeroes (`*00`, `*01`) are rejected rather than normalized so they
    // cannot collide with the canonical spelling of the same index.
    if digits.len() > 1 && digits.starts_with('0') {
        return None;
    }
    // Overflowing indices fail to parse and are rejected as well.
    let index: u32 = digits.parse().ok()?;
    Some(KeyClass::Continuation {
        base_name,
        index,
        encoded: is_encoded,
    })
}
/// Decodes a standalone extended value of the form
/// `charset'language'percent-encoded-text` into a `String`.
///
/// When no charset label is available the bytes are interpreted as UTF-8,
/// with invalid sequences replaced.
fn decode_charset_value(value: &str) -> String {
    let (label, payload) = split_charset_value(value);
    label.map_or_else(
        || String::from_utf8_lossy(&payload).into_owned(),
        |cs| decode_bytes_with_charset(&cs, &payload),
    )
}
/// Splits `charset'language'text` into the charset label and the
/// percent-decoded bytes of `text`.
///
/// The language field is discarded. An empty charset field yields `None`.
/// If the value does not contain two `'` delimiters, it is returned verbatim
/// (not percent-decoded) with no charset.
fn split_charset_value(value: &str) -> (Option<String>, Vec<u8>) {
    // At most three fields: anything after the second quote, including further
    // quotes, belongs to the encoded text.
    let mut fields = value.splitn(3, '\'');
    match (fields.next(), fields.next(), fields.next()) {
        (Some(charset), Some(_language), Some(encoded_part)) => {
            let label = if charset.is_empty() {
                None
            } else {
                Some(charset.to_owned())
            };
            (label, percent_decode(encoded_part))
        }
        _ => (None, value.as_bytes().to_vec()),
    }
}
/// Decodes `%XX` escapes in `input` into raw bytes.
///
/// A `%` that is not followed by two hexadecimal digits (either case) is
/// copied through literally, as is every other byte.
fn percent_decode(input: &str) -> Vec<u8> {
    let mut remaining = input.as_bytes();
    let mut decoded = Vec::with_capacity(remaining.len());
    while let Some((&head, tail)) = remaining.split_first() {
        if head == b'%' {
            if let [hi, lo, after @ ..] = tail {
                if let (Some(high), Some(low)) = (hex_val(*hi), hex_val(*lo)) {
                    decoded.push((high << 4) | low);
                    remaining = after;
                    continue;
                }
            }
        }
        // Ordinary byte, or a malformed/truncated escape: keep it as-is.
        decoded.push(head);
        remaining = tail;
    }
    decoded
}
/// Returns the numeric value (0–15) of an ASCII hexadecimal digit, accepting
/// both upper and lower case, or `None` for any other byte.
fn hex_val(b: u8) -> Option<u8> {
    if b.is_ascii_digit() {
        Some(b - b'0')
    } else if b.is_ascii_hexdigit() {
        // Only `A-F` / `a-f` remain here; fold case before offsetting.
        Some(b.to_ascii_lowercase() - b'a' + 10)
    } else {
        None
    }
}
/// Converts `bytes` to a `String` according to the charset label `charset`.
///
/// `utf-8` / `utf8` (ASCII case-insensitive) are handled directly with lossy
/// UTF-8 conversion. Other labels are looked up through `encoding_rs` and
/// decoded without BOM handling, so leading BOM-like bytes do not change the
/// encoding used. Labels `encoding_rs` does not recognise fall back to lossy UTF-8.
fn decode_bytes_with_charset(charset: &str, bytes: &[u8]) -> String {
    let is_utf8 = charset.eq_ignore_ascii_case("utf-8") || charset.eq_ignore_ascii_case("utf8");
    if !is_utf8 {
        if let Some(encoding) = encoding_rs::Encoding::for_label(charset.as_bytes()) {
            return encoding.decode_without_bom_handling(bytes).0.into_owned();
        }
    }
    String::from_utf8_lossy(bytes).into_owned()
}
/// Returns the base name (text before the first `*`) of the first key in
/// `params` that contains a `*` and whose base name matches `lower_name`
/// ASCII case-insensitively, preserving that key's original case.
///
/// Falls back to `lower_name` itself when no such key exists.
fn find_original_base_name(params: &[(String, String)], lower_name: &str) -> String {
    params
        .iter()
        .filter_map(|(key, _)| key.split_once('*').map(|(base, _)| base))
        .find(|base| base.eq_ignore_ascii_case(lower_name))
        .unwrap_or(lower_name)
        .to_owned()
}
/// Unit tests for RFC 2231 parameter decoding and its helpers.
#[cfg(test)]
// Tests may unwrap/expect freely: a panic is the desired failure mode here.
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    /// Builds an owned parameter list from borrowed `(key, value)` pairs.
    fn p(pairs: &[(&str, &str)]) -> Vec<(String, String)> {
        pairs
            .iter()
            .map(|&(k, v)| (k.to_owned(), v.to_owned()))
            .collect()
    }

    #[test]
    fn plain_passthrough() {
        let params = p(&[("charset", "utf-8"), ("name", "file.txt")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result, p(&[("charset", "utf-8"), ("name", "file.txt")]));
    }

    #[test]
    fn standalone_charset_encoded() {
        let params = p(&[("title*", "us-ascii'en-us'This%20is%20fun")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "title");
        assert_eq!(result[0].1, "This is fun");
    }

    #[test]
    fn continuation_reassembly() {
        let params = p(&[("name*0", "first"), ("name*1", "second")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "name");
        assert_eq!(result[0].1, "firstsecond");
    }

    #[test]
    fn charset_continuation_combined() {
        let params = p(&[
            ("title*0*", "us-ascii'en'This%20is"),
            ("title*1*", "%20fun"),
        ]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "title");
        assert_eq!(result[0].1, "This is fun");
    }

    #[test]
    fn out_of_order_indices() {
        let params = p(&[("name*1", "second"), ("name*0", "first")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "name");
        assert_eq!(result[0].1, "firstsecond");
    }

    #[test]
    fn non_utf8_charset_iso8859_1() {
        let params = p(&[("title*", "iso-8859-1'en'caf%E9")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "title");
        assert_eq!(result[0].1, "café");
    }

    #[test]
    fn unknown_charset_lossy_fallback() {
        let params = p(&[("title*", "x-nonexistent'en'hello%20world")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "title");
        assert_eq!(result[0].1, "hello world");
    }

    #[test]
    fn mixed_plain_and_encoded_ordering() {
        let params = p(&[
            ("charset", "utf-8"),
            ("name*0", "long"),
            ("name*1", "file.txt"),
            ("disposition", "inline"),
        ]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 3);
        assert_eq!(result[0].0, "charset");
        assert_eq!(result[0].1, "utf-8");
        assert_eq!(result[1].0, "name");
        assert_eq!(result[1].1, "longfile.txt");
        assert_eq!(result[2].0, "disposition");
        assert_eq!(result[2].1, "inline");
    }

    #[test]
    fn empty_params() {
        let params: Vec<(String, String)> = Vec::new();
        let result = decode_rfc2231_params(&params);
        assert!(result.is_empty());
    }

    #[test]
    fn missing_language_tag() {
        let params = p(&[("title*", "utf-8''hello%20world")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "title");
        assert_eq!(result[0].1, "hello world");
    }

    #[test]
    fn malformed_value_no_quotes() {
        let params = p(&[("title*", "just-some-value")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "title");
        assert_eq!(result[0].1, "just-some-value");
    }

    #[test]
    fn case_insensitive_key_grouping() {
        let params = p(&[("Name*0", "hello"), ("NAME*1", " world")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "Name");
        assert_eq!(result[0].1, "hello world");
    }

    #[test]
    fn percent_decode_basic() {
        assert_eq!(percent_decode("hello%20world"), b"hello world");
        assert_eq!(percent_decode("%2A%2A%2A"), b"***");
        assert_eq!(percent_decode("no-encoding"), b"no-encoding");
    }

    #[test]
    fn percent_decode_truncated_sequence() {
        assert_eq!(percent_decode("abc%2"), b"abc%2");
        assert_eq!(percent_decode("abc%"), b"abc%");
    }

    #[test]
    fn percent_decode_invalid_hex() {
        assert_eq!(percent_decode("%GG"), b"%GG");
    }

    #[test]
    fn continuation_missing_segment_0() {
        let params = p(&[("name*1", "world")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "name");
        assert_eq!(result[0].1, "");
    }

    #[test]
    fn continuation_mixed_encoded_plain() {
        let params = p(&[("name*0*", "utf-8'en'caf%C3%A9"), ("name*1", ".txt")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "name");
        assert_eq!(result[0].1, "café.txt");
    }

    #[test]
    fn standalone_empty_charset_empty_language() {
        let params = p(&[("title*", "''hello%20world")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "title");
        assert_eq!(result[0].1, "hello world");
    }

    #[test]
    fn empty_base_name_key() {
        let params = p(&[("*0", "value")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "");
        assert_eq!(result[0].1, "value");
    }

    #[test]
    fn continuation_gap_truncates_at_first_gap() {
        let params = p(&[("name*0", "first"), ("name*2", "third")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "name");
        assert_eq!(result[0].1, "first");
    }

    #[test]
    fn continuation_gap_mid_sequence_truncates() {
        let params = p(&[("f*0", "A"), ("f*1", "B"), ("f*3", "D")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].1, "AB");
    }

    #[test]
    fn continuation_no_gap() {
        let params = p(&[("f*0", "A"), ("f*1", "B"), ("f*2", "C")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].1, "ABC");
    }

    #[test]
    fn rfc2047_encoded_word_fallback_base64() {
        let params = p(&[("filename", "=?UTF-8?B?dGVzdC50eHQ=?=")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "filename");
        assert_eq!(result[0].1, "test.txt");
    }

    #[test]
    fn rfc2047_encoded_word_fallback_quoted_printable() {
        let params = p(&[("filename", "=?UTF-8?Q?caf=C3=A9.txt?=")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "filename");
        assert_eq!(result[0].1, "caf\u{e9}.txt");
    }

    #[test]
    fn rfc2047_fallback_does_not_corrupt_plain_values() {
        let params = p(&[("filename", "report.pdf")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result[0].1, "report.pdf");
    }

    #[test]
    fn non_utf8_charset_preserves_leading_feff() {
        // The payload starts with the UTF-8 BOM byte sequence but is declared as
        // windows-1252. The declared charset must win: no BOM sniffing, no stripping.
        let bom_bytes_hex = "%EF%BB%BF%C0";
        let params = p(&[("title*", &format!("windows-1252''{bom_bytes_hex}"))]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        // Each byte maps to its own windows-1252 character ("ï»¿À"); escapes are
        // used so tooling cannot mistake the literal for a mojibake BOM.
        assert_eq!(result[0].1, "\u{ef}\u{bb}\u{bf}\u{c0}");
    }

    #[test]
    fn rfc2231_decoded_value_not_double_decoded_as_rfc2047() {
        let params = p(&[("name*", "utf-8''%3D%3FUTF-8%3FB%3FdGVzdA%3D%3D%3F%3D")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "name");
        assert_eq!(result[0].1, "=?UTF-8?B?dGVzdA==?=");
    }

    #[test]
    fn duplicate_continuation_index_keeps_first() {
        let params = p(&[("name*0", "correct"), ("name*0", "wrong")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "name");
        assert_eq!(result[0].1, "correct");
    }

    #[test]
    fn rfc2231_section5_encoded_overrides_plain_duplicate() {
        let params = p(&[
            ("name", "fallback.txt"),
            ("name*", "utf-8''encoded%2Dname.txt"),
        ]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(
            result,
            p(&[("name", "encoded-name.txt")]),
            "RFC 2231 Section 5: encoded form should override plain duplicate"
        );
    }

    #[test]
    fn rfc2231_section5_plain_without_encoded_kept() {
        let params = p(&[("name", "plain.txt")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result, p(&[("name", "plain.txt")]));
    }

    #[test]
    fn rfc2231_section5_encoded_without_plain_kept() {
        let params = p(&[("name*", "utf-8''encoded.txt")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result, p(&[("name", "encoded.txt")]));
    }

    #[test]
    fn rfc2231_section5_case_insensitive_dedup() {
        let params = p(&[("Name", "fallback.txt"), ("NAME*", "utf-8''encoded.txt")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(
            result.len(),
            1,
            "RFC 2231 Section 5: case-insensitive dedup should produce one entry; got {result:?}"
        );
        assert_eq!(result[0].1, "encoded.txt");
    }

    #[test]
    fn rfc2231_section5_continuation_overrides_plain() {
        let params = p(&[
            ("name", "fallback.txt"),
            ("name*0*", "utf-8''encoded"),
            ("name*1", ".txt"),
        ]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(
            result.len(),
            1,
            "RFC 2231 Section 5: continuation group should override plain; got {result:?}"
        );
        assert_eq!(result[0].1, "encoded.txt");
    }

    #[test]
    fn rfc2231_section5_plain_before_encoded_preserves_order() {
        let params = p(&[
            ("charset", "utf-8"),
            ("name", "fallback.txt"),
            ("name*", "utf-8''real%2Dname.txt"),
            ("disposition", "attachment"),
        ]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 3);
        assert_eq!(result[0], ("charset".to_owned(), "utf-8".to_owned()));
        assert_eq!(
            result[2],
            ("disposition".to_owned(), "attachment".to_owned())
        );
        let name_entries: Vec<_> = result.iter().filter(|(k, _)| k == "name").collect();
        assert_eq!(name_entries.len(), 1);
        assert_eq!(name_entries[0].1, "real-name.txt");
    }

    #[test]
    fn split_charset_value_one_quote_only() {
        let (cs, bytes) = split_charset_value("utf-8'hello");
        assert!(cs.is_none());
        assert_eq!(bytes, b"utf-8'hello");
    }

    #[test]
    fn percent_decode_lowercase_hex() {
        assert_eq!(percent_decode("%2a%2b%2c"), b"*+,");
        assert_eq!(percent_decode("caf%c3%a9"), "café".as_bytes());
    }

    #[test]
    fn find_original_base_name_fallback_to_lowercase() {
        let params = vec![
            ("plain_key".to_owned(), "value".to_owned()),
            ("another".to_owned(), "value2".to_owned()),
        ];
        let result = find_original_base_name(&params, "nonexistent");
        assert_eq!(result, "nonexistent");
    }

    #[test]
    fn find_original_base_name_no_star_keys() {
        let params = vec![("charset".to_owned(), "utf-8".to_owned())];
        let result = find_original_base_name(&params, "charset");
        assert_eq!(result, "charset");
    }

    #[test]
    fn standalone_charset_no_encoded_bytes() {
        let params = p(&[("filename*", "us-ascii'en'plain-text-file.txt")]);
        let result = decode_rfc2231_params(&params);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "filename");
        assert_eq!(result[0].1, "plain-text-file.txt");
    }

    #[test]
    fn spec_audit_m6_leading_zeroes_in_continuation_rejected() {
        let params = p(&[("name*00", "first"), ("name*01", "second")]);
        let result = decode_rfc2231_params(&params);
        let has_reassembled = result
            .iter()
            .any(|(k, v)| k == "name" && v == "firstsecond");
        assert!(
            !has_reassembled,
            "Leading zeroes in continuation indices (*00, *01) should be rejected \
             per RFC 2231 Section 3, not silently normalized; got {result:?}"
        );
    }

    #[test]
    fn spec_audit_m6_leading_zero_collision() {
        let params = p(&[("name*0", "correct"), ("name*00", "wrong")]);
        let result = decode_rfc2231_params(&params);
        let name_value = result.iter().find(|(k, _)| k == "name");
        assert!(
            name_value.is_some(),
            "Expected a 'name' parameter in the result"
        );
        assert_eq!(
            name_value.unwrap().1,
            "correct",
            "name*0 (valid) should take precedence over name*00 (leading zero); \
             got {result:?}"
        );
    }
}