use base64::Engine as _;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use encoding_rs::Encoding;
/// Text encodings that runtime byte streams can be decoded from or encoded to.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum RuntimeTextEncoding {
/// UTF-8.
Utf8,
/// On Windows, the code page returned by `GetACP`; handled as UTF-8 on other targets.
System,
/// On Windows, the code page returned by `GetOEMCP`; handled as UTF-8 on other targets.
Oem,
/// GBK, resolved through `encoding_rs` using the label `gbk`.
Gbk,
/// GB18030, resolved through `encoding_rs` using the label `gb18030`.
Gb18030,
/// Latin-1: each byte corresponds to the Unicode code point with the same value.
Latin1,
/// Raw bytes carried as standard-alphabet base64 text.
Base64,
}
/// Result of decoding a byte buffer with one of the [`RuntimeTextEncoding`] variants.
pub(crate) struct DecodedRuntimeText {
/// The decoded text (for the base64 encoding, the base64 rendering of the bytes).
pub(crate) text: String,
/// Label describing the encoding, e.g. `utf-8`, `gbk`, `latin1`, `base64`,
/// or `windows-<code page>` / `windows-oem-<code page>` on Windows.
pub(crate) encoding: String,
/// `true` when the decoder reported malformed input and `text` is a lossy rendering.
pub(crate) lossy: bool,
/// Base64 of the original bytes; populated when `lossy` is `true` and always for
/// the base64 encoding, `None` otherwise.
pub(crate) base64: Option<String>,
}
impl RuntimeTextEncoding {
pub(crate) fn parse(label: &str) -> Result<Self, String> {
let normalized_label = label.trim().to_ascii_lowercase();
match normalized_label.as_str() {
"utf-8" | "utf8" => Ok(Self::Utf8),
"system" | "ansi" | "acp" => Ok(Self::System),
"oem" | "console" => Ok(Self::Oem),
"gbk" | "cp936" | "windows-936" => Ok(Self::Gbk),
"gb18030" => Ok(Self::Gb18030),
"latin1" | "latin-1" | "iso-8859-1" => Ok(Self::Latin1),
"bytes" | "base64" => Ok(Self::Base64),
_ => Err(format!("unsupported text encoding `{label}`")),
}
}
pub(crate) fn requested_label(self) -> &'static str {
match self {
Self::Utf8 => "utf-8",
Self::System => "system",
Self::Oem => "oem",
Self::Gbk => "gbk",
Self::Gb18030 => "gb18030",
Self::Latin1 => "latin1",
Self::Base64 => "base64",
}
}
}
/// Returns the encoding used when none is requested: `System` on Windows,
/// `Utf8` on every other target.
pub(crate) fn default_runtime_text_encoding() -> RuntimeTextEncoding {
    if cfg!(windows) {
        RuntimeTextEncoding::System
    } else {
        RuntimeTextEncoding::Utf8
    }
}
pub(crate) fn decode_runtime_text(
bytes: &[u8],
encoding: RuntimeTextEncoding,
) -> DecodedRuntimeText {
match encoding {
RuntimeTextEncoding::Utf8 => decode_utf8(bytes),
RuntimeTextEncoding::System => decode_system_text(bytes),
RuntimeTextEncoding::Oem => decode_oem_text(bytes),
RuntimeTextEncoding::Gbk => decode_with_encoding_rs(bytes, "gbk", b"gbk"),
RuntimeTextEncoding::Gb18030 => decode_with_encoding_rs(bytes, "gb18030", b"gb18030"),
RuntimeTextEncoding::Latin1 => decode_latin1(bytes),
RuntimeTextEncoding::Base64 => decode_as_base64(bytes),
}
}
/// Encodes `text` into bytes using the requested `encoding`.
///
/// For `Base64`, `text` is interpreted as standard base64 and the decoded raw
/// bytes are returned.
///
/// # Errors
///
/// Returns an error message when `text` cannot be represented in the target
/// encoding without substituting characters, or when base64 decoding fails.
pub(crate) fn encode_runtime_text(
    text: &str,
    encoding: RuntimeTextEncoding,
) -> Result<Vec<u8>, String> {
    match encoding {
        RuntimeTextEncoding::Utf8 => Ok(text.as_bytes().to_vec()),
        RuntimeTextEncoding::System => encode_system_text(text),
        RuntimeTextEncoding::Oem => encode_oem_text(text),
        RuntimeTextEncoding::Gbk => encode_with_encoding_rs(text, "gbk", b"gbk"),
        RuntimeTextEncoding::Gb18030 => encode_with_encoding_rs(text, "gb18030", b"gb18030"),
        RuntimeTextEncoding::Latin1 => {
            // Every other encoder refuses to substitute unrepresentable characters;
            // reject them here too instead of silently emitting `?`.
            if text.chars().any(|ch| u32::from(ch) > 0xff) {
                Err("latin1 encode required replacement".to_string())
            } else {
                Ok(encode_latin1(text))
            }
        }
        RuntimeTextEncoding::Base64 => BASE64_STANDARD
            .decode(text.as_bytes())
            .map_err(|error| format!("base64 decode failed: {error}")),
    }
}
/// Decodes `bytes` as UTF-8.
///
/// Valid input is returned verbatim. Invalid input is rendered with replacement
/// characters, flagged as lossy, and accompanied by the base64 of the original bytes.
fn decode_utf8(bytes: &[u8]) -> DecodedRuntimeText {
    let label = "utf-8".to_string();
    if let Ok(valid) = std::str::from_utf8(bytes) {
        return DecodedRuntimeText {
            text: valid.to_owned(),
            encoding: label,
            lossy: false,
            base64: None,
        };
    }
    DecodedRuntimeText {
        text: String::from_utf8_lossy(bytes).into_owned(),
        encoding: label,
        lossy: true,
        base64: Some(BASE64_STANDARD.encode(bytes)),
    }
}
/// Decodes `bytes` with the `encoding_rs` encoding registered under `lookup_label`.
///
/// `actual_label` is reported in the result when the requested encoding was the
/// one actually used. `Encoding::decode` performs BOM sniffing, so input that
/// starts with a UTF-8 or UTF-16 BOM is decoded with that encoding instead; in
/// that case (and when `lookup_label` is unknown and UTF-8 is used as a
/// fallback) the lower-cased name of the encoding that was really applied is
/// reported, so the label never misdescribes the decoded text.
///
/// Malformed sequences are replaced, `lossy` is set, and the base64 of the
/// original bytes is attached.
fn decode_with_encoding_rs(
    bytes: &[u8],
    actual_label: &str,
    lookup_label: &'static [u8],
) -> DecodedRuntimeText {
    let requested = Encoding::for_label(lookup_label);
    let encoding = requested.unwrap_or(encoding_rs::UTF_8);
    // `used` is the encoding that was really applied after BOM sniffing.
    let (text, used, had_errors) = encoding.decode(bytes);
    let reported_label = if requested == Some(used) {
        actual_label.to_string()
    } else {
        used.name().to_ascii_lowercase()
    };
    DecodedRuntimeText {
        text: text.into_owned(),
        encoding: reported_label,
        lossy: had_errors,
        base64: had_errors.then(|| BASE64_STANDARD.encode(bytes)),
    }
}
/// Encodes `text` with the `encoding_rs` encoding registered under `lookup_label`
/// (UTF-8 when the label is unknown).
///
/// # Errors
///
/// Returns an error mentioning `actual_label` when the encoder reports that it
/// had to substitute unmappable characters.
fn encode_with_encoding_rs(
    text: &str,
    actual_label: &str,
    lookup_label: &'static [u8],
) -> Result<Vec<u8>, String> {
    let encoder = Encoding::for_label(lookup_label).unwrap_or(encoding_rs::UTF_8);
    match encoder.encode(text) {
        (_, _, true) => Err(format!("{actual_label} encode failed without replacement")),
        (encoded, _, false) => Ok(encoded.into_owned()),
    }
}
/// Decodes `bytes` as Latin-1 by mapping each byte to the code point with the
/// same value. This cannot fail, so the result is never lossy.
fn decode_latin1(bytes: &[u8]) -> DecodedRuntimeText {
    let mut text = String::with_capacity(bytes.len());
    text.extend(bytes.iter().copied().map(char::from));
    DecodedRuntimeText {
        text,
        encoding: "latin1".to_string(),
        lossy: false,
        base64: None,
    }
}
/// Encodes `text` as Latin-1, one byte per character.
///
/// Characters above U+00FF have no Latin-1 representation and are emitted as `?`.
fn encode_latin1(text: &str) -> Vec<u8> {
    let mut encoded = Vec::new();
    for ch in text.chars() {
        // Code points 0..=0xFF fit in a byte; anything larger falls back to `?`.
        encoded.push(u8::try_from(u32::from(ch)).unwrap_or(b'?'));
    }
    encoded
}
/// Represents `bytes` as standard base64. The same string is exposed as both the
/// text and the base64 payload, and the result is never lossy.
fn decode_as_base64(bytes: &[u8]) -> DecodedRuntimeText {
    let payload = BASE64_STANDARD.encode(bytes);
    DecodedRuntimeText {
        base64: Some(payload.clone()),
        text: payload,
        encoding: String::from("base64"),
        lossy: false,
    }
}
/// Decodes `bytes` using the Windows code page reported by `GetACP`, labelled
/// `windows-<code page>`.
#[cfg(windows)]
fn decode_system_text(bytes: &[u8]) -> DecodedRuntimeText {
    // SAFETY: `GetACP` takes no arguments.
    let acp = unsafe { windows_sys::Win32::Globalization::GetACP() };
    let label = format!("windows-{acp}");
    decode_windows_code_page(bytes, acp, &label)
}

/// Decodes `bytes` as UTF-8 on non-Windows targets.
#[cfg(not(windows))]
fn decode_system_text(bytes: &[u8]) -> DecodedRuntimeText {
    let decoded = decode_utf8(bytes);
    decoded
}
/// Decodes `bytes` using the Windows code page reported by `GetOEMCP`, labelled
/// `windows-oem-<code page>`.
#[cfg(windows)]
fn decode_oem_text(bytes: &[u8]) -> DecodedRuntimeText {
    // SAFETY: `GetOEMCP` takes no arguments.
    let oem_cp = unsafe { windows_sys::Win32::Globalization::GetOEMCP() };
    let label = format!("windows-oem-{oem_cp}");
    decode_windows_code_page(bytes, oem_cp, &label)
}

/// Decodes `bytes` as UTF-8 on non-Windows targets.
#[cfg(not(windows))]
fn decode_oem_text(bytes: &[u8]) -> DecodedRuntimeText {
    let decoded = decode_utf8(bytes);
    decoded
}
/// Encodes `text` with the Windows code page reported by `GetACP`; errors are
/// labelled `windows-<code page>`.
#[cfg(windows)]
fn encode_system_text(text: &str) -> Result<Vec<u8>, String> {
    // SAFETY: `GetACP` takes no arguments.
    let acp = unsafe { windows_sys::Win32::Globalization::GetACP() };
    let label = format!("windows-{acp}");
    encode_windows_code_page(text, acp, &label)
}

/// Returns the UTF-8 bytes of `text` on non-Windows targets; never fails.
#[cfg(not(windows))]
fn encode_system_text(text: &str) -> Result<Vec<u8>, String> {
    let utf8 = Vec::from(text.as_bytes());
    Ok(utf8)
}
/// Encodes `text` with the Windows code page reported by `GetOEMCP`; errors are
/// labelled `windows-oem-<code page>`.
#[cfg(windows)]
fn encode_oem_text(text: &str) -> Result<Vec<u8>, String> {
    // SAFETY: `GetOEMCP` takes no arguments.
    let oem_cp = unsafe { windows_sys::Win32::Globalization::GetOEMCP() };
    let label = format!("windows-oem-{oem_cp}");
    encode_windows_code_page(text, oem_cp, &label)
}

/// Returns the UTF-8 bytes of `text` on non-Windows targets; never fails.
#[cfg(not(windows))]
fn encode_oem_text(text: &str) -> Result<Vec<u8>, String> {
    let utf8 = Vec::from(text.as_bytes());
    Ok(utf8)
}
/// Decodes `bytes` from the Windows code page `code_page` using `MultiByteToWideChar`.
///
/// A strict conversion (`MB_ERR_INVALID_CHARS`) is attempted first; if it reports
/// no usable length, the conversion is repeated with no flags and the result is
/// marked lossy. If the API cannot produce any output at all, the bytes are
/// rendered with a lossy UTF-8 decode instead. Every lossy result carries the
/// base64 of the original bytes. `actual_label` is copied into the result's
/// `encoding` field.
#[cfg(windows)]
fn decode_windows_code_page(
bytes: &[u8],
code_page: u32,
actual_label: &str,
) -> DecodedRuntimeText {
// Empty input is returned as clean empty text without calling the API.
if bytes.is_empty() {
return DecodedRuntimeText {
text: String::new(),
encoding: actual_label.to_string(),
lossy: false,
base64: None,
};
}
// The API takes an `i32` length, so the input length is clamped to `i32::MAX`.
// NOTE(review): bytes past that limit are not converted and the result is not
// flagged lossy for that reason alone — confirm inputs this large cannot occur.
let byte_len = bytes.len().min(i32::MAX as usize) as i32;
// Strict probe: null output buffer with zero capacity, invalid bytes rejected.
// SAFETY: `bytes.as_ptr()` is valid for `byte_len` bytes (`byte_len <= bytes.len()`),
// and no output buffer is supplied.
let strict_len = unsafe {
windows_sys::Win32::Globalization::MultiByteToWideChar(
code_page,
windows_sys::Win32::Globalization::MB_ERR_INVALID_CHARS,
bytes.as_ptr(),
byte_len,
std::ptr::null_mut(),
0,
)
};
// A positive strict length means the strict flags can be reused for the real
// conversion; otherwise fall back to flag-less conversion and mark it lossy.
let (flags, lossy) = if strict_len > 0 {
(
windows_sys::Win32::Globalization::MB_ERR_INVALID_CHARS,
false,
)
} else {
(0, true)
};
// Required UTF-16 length: reuse the strict answer or probe again without flags.
let wide_len = if strict_len > 0 {
strict_len
} else {
// SAFETY: same pointer/length invariants as the strict probe above.
unsafe {
windows_sys::Win32::Globalization::MultiByteToWideChar(
code_page,
0,
bytes.as_ptr(),
byte_len,
std::ptr::null_mut(),
0,
)
}
};
// Neither probe produced a length: render the bytes with a lossy UTF-8 decode.
if wide_len <= 0 {
let fallback = String::from_utf8_lossy(bytes).to_string();
return DecodedRuntimeText {
text: fallback,
encoding: actual_label.to_string(),
lossy: true,
base64: Some(BASE64_STANDARD.encode(bytes)),
};
}
let mut wide = vec![0u16; wide_len as usize];
// SAFETY: `bytes.as_ptr()` is valid for `byte_len` bytes and `wide` holds exactly
// `wide_len` `u16` elements, matching the capacity passed to the API.
let written = unsafe {
windows_sys::Win32::Globalization::MultiByteToWideChar(
code_page,
flags,
bytes.as_ptr(),
byte_len,
wide.as_mut_ptr(),
wide_len,
)
};
// The real conversion failed even though a length was reported: same fallback.
if written <= 0 {
let fallback = String::from_utf8_lossy(bytes).to_string();
return DecodedRuntimeText {
text: fallback,
encoding: actual_label.to_string(),
lossy: true,
base64: Some(BASE64_STANDARD.encode(bytes)),
};
}
// Keep only the UTF-16 units the API reported as written.
wide.truncate(written as usize);
DecodedRuntimeText {
text: String::from_utf16_lossy(&wide),
encoding: actual_label.to_string(),
lossy,
base64: if lossy {
Some(BASE64_STANDARD.encode(bytes))
} else {
None
},
}
}
/// Encodes `text` into the Windows code page `code_page` using `WideCharToMultiByte`.
///
/// UTF-8 (code page 65001) is handled directly from the `&str` bytes:
/// `WideCharToMultiByte` rejects a non-null `lpUsedDefaultChar` for that code
/// page, which would otherwise make every encode fail on systems whose active
/// code page is UTF-8.
///
/// # Errors
///
/// Returns a message prefixed with `actual_label` when the input is too long for
/// the API, when the API reports failure, or when the conversion needed the
/// code page's default replacement character.
#[cfg(windows)]
fn encode_windows_code_page(
    text: &str,
    code_page: u32,
    actual_label: &str,
) -> Result<Vec<u8>, String> {
    // Code page identifier for UTF-8 (`CP_UTF8`).
    const CP_UTF8: u32 = 65001;

    if text.is_empty() {
        return Ok(Vec::new());
    }
    if code_page == CP_UTF8 {
        // A Rust `str` is already valid UTF-8, so no conversion is required.
        return Ok(text.as_bytes().to_vec());
    }
    let wide: Vec<u16> = text.encode_utf16().collect();
    // The API takes an `i32` length; refuse oversized input rather than silently
    // encoding only a prefix of it.
    let wide_len = i32::try_from(wide.len())
        .map_err(|_| format!("{actual_label} encode failed: input too long"))?;
    // Size query: null output buffer with zero capacity.
    // SAFETY: `wide.as_ptr()` is valid for `wide_len` UTF-16 units; no output
    // buffer is supplied and both optional pointers are null.
    let byte_len = unsafe {
        windows_sys::Win32::Globalization::WideCharToMultiByte(
            code_page,
            0,
            wide.as_ptr(),
            wide_len,
            std::ptr::null_mut(),
            0,
            std::ptr::null(),
            std::ptr::null_mut(),
        )
    };
    if byte_len <= 0 {
        return Err(format!("{actual_label} encode failed"));
    }
    let mut bytes = vec![0u8; byte_len as usize];
    let mut used_default_char = 0;
    // SAFETY: `wide.as_ptr()` is valid for `wide_len` units, `bytes` holds exactly
    // `byte_len` bytes, and `used_default_char` is a live local for the out-flag.
    let written = unsafe {
        windows_sys::Win32::Globalization::WideCharToMultiByte(
            code_page,
            0,
            wide.as_ptr(),
            wide_len,
            bytes.as_mut_ptr(),
            byte_len,
            std::ptr::null(),
            &mut used_default_char,
        )
    };
    if written <= 0 {
        return Err(format!("{actual_label} encode failed"));
    }
    // A non-zero flag means at least one character was replaced by the default
    // character, i.e. the encoding would be lossy.
    if used_default_char != 0 {
        return Err(format!("{actual_label} encode required replacement"));
    }
    bytes.truncate(written as usize);
    Ok(bytes)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Valid UTF-8 is returned verbatim with no lossy flag and no base64 payload.
    #[test]
    fn decode_utf8_reports_clean_text() {
        let decoded = decode_runtime_text("hello".as_bytes(), RuntimeTextEncoding::Utf8);
        assert_eq!(decoded.text, "hello");
        assert_eq!(decoded.encoding, "utf-8");
        assert!(!decoded.lossy);
        assert!(decoded.base64.is_none());
    }

    /// Invalid UTF-8 yields replacement characters plus the exact original bytes
    /// as base64, so callers can recover the raw data.
    #[test]
    fn decode_invalid_utf8_keeps_base64_fallback() {
        let decoded = decode_runtime_text(&[0xff, 0xfe], RuntimeTextEncoding::Utf8);
        assert_eq!(decoded.text, "\u{fffd}\u{fffd}");
        assert_eq!(decoded.encoding, "utf-8");
        assert!(decoded.lossy);
        assert_eq!(decoded.base64.as_deref(), Some("//4="));
    }

    /// GB18030 text survives an encode/decode round trip without loss.
    #[test]
    fn decode_gb18030_chinese_text() {
        let bytes = encode_runtime_text("中文", RuntimeTextEncoding::Gb18030)
            .expect("gb18030 encode should succeed");
        let decoded = decode_runtime_text(&bytes, RuntimeTextEncoding::Gb18030);
        assert_eq!(decoded.text, "中文");
        assert_eq!(decoded.encoding, "gb18030");
        assert!(!decoded.lossy);
        assert!(decoded.base64.is_none());
    }

    /// Latin-1 maps each byte to the code point with the same value.
    #[test]
    fn decode_latin1_preserves_byte_values() {
        let decoded = decode_runtime_text(&[0x41, 0xe9], RuntimeTextEncoding::Latin1);
        assert_eq!(decoded.text, "Aé");
        assert_eq!(decoded.encoding, "latin1");
        assert!(!decoded.lossy);
        assert!(decoded.base64.is_none());
    }

    /// The base64 "encoding" exposes the raw bytes as both text and payload.
    #[test]
    fn decode_base64_preserves_raw_bytes() {
        let decoded = decode_runtime_text(&[0, 1, 2], RuntimeTextEncoding::Base64);
        assert_eq!(decoded.text, "AAEC");
        assert_eq!(decoded.encoding, "base64");
        assert!(!decoded.lossy);
        assert_eq!(decoded.base64.as_deref(), Some("AAEC"));
    }
}