use alloc::string::String;
/// Single-byte text encodings that this module can detect and decode.
///
/// The variant selects how bytes at or above `0x80` are turned into
/// characters; bytes below `0x80` decode to the same character under every
/// variant.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Codepage {
    /// Chosen by detection when the input holds no byte at or above `0x80`
    /// (this includes empty and all-NUL input). Decoding treats it exactly
    /// like [`Codepage::Latin1`].
    Ascii,
    /// IBM PC code page 437: bytes at or above `0x80` are looked up in the
    /// CP437 table (accented letters, box-drawing and block glyphs, Greek
    /// and math symbols).
    Cp437,
    /// Latin-1: every byte decodes to the Unicode character whose code point
    /// equals the byte value.
    Latin1,
}
impl Codepage {
    /// Guesses the codepage of a single byte string.
    ///
    /// Equivalent to calling [`Codepage::detect_from_fields`] with `bytes` as
    /// the only field.
    #[inline]
    pub fn detect(bytes: &[u8]) -> Codepage {
        let only_field = [bytes];
        Self::detect_from_fields(&only_field)
    }

    /// Guesses one codepage for a group of byte strings by pooling their bytes.
    ///
    /// NUL bytes are ignored everywhere. The remaining bytes of all `fields`
    /// are classified together:
    ///
    /// * no byte at or above `0x80` (or no bytes at all) gives [`Codepage::Ascii`];
    /// * any byte in `0x80..=0x9F`, any byte in `0xB0..=0xB2`, three or more
    ///   bytes in `0xDB..=0xDF`, or more than half of the non-NUL bytes being
    ///   at or above `0x80` gives [`Codepage::Cp437`];
    /// * anything else gives [`Codepage::Latin1`].
    pub fn detect_from_fields(fields: &[&[u8]]) -> Codepage {
        let mut total = 0usize;
        let mut high_total = 0usize;
        let mut c1_hits = 0usize;
        let mut shade_hits = 0usize;
        let mut block_hits = 0usize;

        // Treat every field as one long stream; NUL is padding, not content.
        let payload = fields
            .iter()
            .flat_map(|field| field.iter().copied())
            .filter(|&byte| byte != 0);

        for byte in payload {
            total += 1;
            match byte {
                // CP437 letters that occupy the C1 control range.
                0x80..=0x9F => {
                    high_total += 1;
                    c1_hits += 1;
                }
                // CP437 shade glyphs (░ ▒ ▓).
                0xB0..=0xB2 => {
                    high_total += 1;
                    shade_hits += 1;
                }
                // CP437 block glyphs (█ ▄ ▌ ▐ ▀). These bytes are also
                // Latin-1 letters such as Ü and ß, hence the threshold below.
                0xDB..=0xDF => {
                    high_total += 1;
                    block_hits += 1;
                }
                // Any other byte with the top bit set.
                0xA0..=0xFF => high_total += 1,
                _ => {}
            }
        }

        // `high_total` can never exceed `total`, so this also covers the case
        // where there was no non-NUL byte at all.
        if high_total == 0 {
            return Codepage::Ascii;
        }

        let mostly_high = high_total * 2 > total;
        let looks_like_cp437 = c1_hits > 0 || shade_hits > 0 || block_hits >= 3 || mostly_high;
        if looks_like_cp437 {
            Codepage::Cp437
        } else {
            Codepage::Latin1
        }
    }

    /// Decodes every byte of `bytes` into one character according to `self`.
    ///
    /// Nothing is stripped: NUL bytes and spaces are decoded like any other
    /// byte. [`Codepage::Ascii`] and [`Codepage::Latin1`] map each byte to the
    /// character with the same code point; [`Codepage::Cp437`] does the same
    /// below `0x80` and uses the CP437 table from `0x80` upwards.
    pub fn decode(self, bytes: &[u8]) -> String {
        let mut text = String::with_capacity(bytes.len());
        match self {
            Codepage::Ascii | Codepage::Latin1 => {
                text.extend(bytes.iter().map(|&byte| char::from(byte)));
            }
            Codepage::Cp437 => {
                text.extend(bytes.iter().map(|&byte| match byte.checked_sub(0x80) {
                    // Offset is at most 0x7F, so it always indexes the 128-entry table.
                    Some(offset) => CP437_HIGH[usize::from(offset)],
                    None => char::from(byte),
                }));
            }
        }
        text
    }

    /// Decodes a NUL-terminated, space-padded name field.
    ///
    /// Only the bytes before the first NUL (or all of `bytes` if there is
    /// none) are decoded, and trailing ASCII spaces are then removed.
    /// Leading spaces are kept.
    pub fn decode_name(self, bytes: &[u8]) -> String {
        let name_len = bytes.iter().take_while(|&&byte| byte != 0).count();
        let mut name = self.decode(&bytes[..name_len]);
        // Trimming yields a prefix of `name`, so its length is a valid
        // char-boundary cut point for truncating in place.
        let kept = name.trim_end_matches(' ').len();
        name.truncate(kept);
        name
    }
}
/// Detects the codepage of `bytes` and decodes it as a name in one step.
///
/// Detection looks at every non-NUL byte of `bytes`, including any that
/// follow the first NUL, whereas the decoded name stops at the first NUL and
/// has its trailing spaces removed.
#[inline]
pub fn decode_autodetect(bytes: &[u8]) -> String {
    let codepage = Codepage::detect(bytes);
    codepage.decode_name(bytes)
}
/// Unicode characters for the upper half (bytes `0x80..=0xFF`) of code page 437.
///
/// Entry `i` holds the character for byte `0x80 + i`, so a high byte `b` is
/// looked up at index `b - 0x80`.
// Laid out 16 entries per row, one row per high nibble; `rustfmt::skip`
// keeps that grid intact.
#[rustfmt::skip]
static CP437_HIGH: [char; 128] = [
// 0x80..=0x8F
'Ç', 'ü', 'é', 'â', 'ä', 'à', 'å', 'ç', 'ê', 'ë', 'è', 'ï', 'î', 'ì', 'Ä', 'Å',
// 0x90..=0x9F
'É', 'æ', 'Æ', 'ô', 'ö', 'ò', 'û', 'ù', 'ÿ', 'Ö', 'Ü', '¢', '£', '¥', '₧', 'ƒ',
// 0xA0..=0xAF
'á', 'í', 'ó', 'ú', 'ñ', 'Ñ', 'ª', 'º', '¿', '⌐', '¬', '½', '¼', '¡', '«', '»',
// 0xB0..=0xBF
'░', '▒', '▓', '│', '┤', '╡', '╢', '╖', '╕', '╣', '║', '╗', '╝', '╜', '╛', '┐',
// 0xC0..=0xCF
'└', '┴', '┬', '├', '─', '┼', '╞', '╟', '╚', '╔', '╩', '╦', '╠', '═', '╬', '╧',
// 0xD0..=0xDF
'╨', '╤', '╥', '╙', '╘', '╒', '╓', '╫', '╪', '┘', '┌', '█', '▄', '▌', '▐', '▀',
// 0xE0..=0xEF
'α', 'ß', 'Γ', 'π', 'Σ', 'σ', 'µ', 'τ', 'Φ', 'Θ', 'Ω', 'δ', '∞', 'φ', 'ε', '∩',
// 0xF0..=0xFF (the last entry is a no-break space)
'≡', '±', '≥', '≤', '⌠', '⌡', '÷', '≈', '°', '∙', '·', '√', 'ⁿ', '²', '■', '\u{A0}',
];
#[cfg(test)]
mod tests {
    use super::*;

    /// Block-art row `▄███▀██▄ ▄██▄ ▄███▀██` as CP437 bytes, without a terminator.
    const GURU_ROW: [u8; 21] = [
        0xdc, 0xdb, 0xdb, 0xdb, 0xdf, 0xdb, 0xdb, 0xdc, 0x20, 0xdc, 0xdb, 0xdb, 0xdc, 0x20,
        0xdc, 0xdb, 0xdb, 0xdb, 0xdf, 0xdb, 0xdb,
    ];

    /// The same row as [`GURU_ROW`], followed by a single NUL byte.
    const GURU_ROW_NUL: [u8; 22] = [
        0xdc, 0xdb, 0xdb, 0xdb, 0xdf, 0xdb, 0xdb, 0xdc, 0x20, 0xdc, 0xdb, 0xdb, 0xdc, 0x20,
        0xdc, 0xdb, 0xdb, 0xdb, 0xdf, 0xdb, 0xdb, 0x00,
    ];

    // Input without any high byte, including empty and NUL-only input, is ASCII.
    #[test]
    fn detect_ascii_only() {
        let samples: [&[u8]; 4] = [
            b"",
            b"Hello, world!",
            &[0u8; 22],
            b"name\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0",
        ];
        for &sample in &samples {
            assert_eq!(Codepage::detect(sample), Codepage::Ascii);
        }
    }

    // Block glyphs, a run of box-drawing bytes, and a byte in 0x80..=0x9F
    // each push detection to CP437.
    #[test]
    fn detect_cp437_block_art() {
        assert_eq!(Codepage::detect(&GURU_ROW_NUL), Codepage::Cp437);

        let horizontal_rule = [0xc4u8; 22];
        assert_eq!(Codepage::detect(&horizontal_rule), Codepage::Cp437);

        let cap_c_cedilla: &[u8] = b"\x80a Project";
        assert_eq!(Codepage::detect(cap_c_cedilla), Codepage::Cp437);
    }

    // A few accented letters inside mostly-ASCII text stay Latin-1.
    #[test]
    fn detect_latin1_accented_name() {
        let names: [&[u8]; 3] = [
            b"Caf\xe9 au lait",
            b"Fran\xe7ois",
            b"\xc9l\xe9onore d'Acquit\xe9",
        ];
        for &name in &names {
            assert_eq!(Codepage::detect(name), Codepage::Latin1);
        }
    }

    // Pooling several fields lets one decisive field settle the codepage for all.
    #[test]
    fn detect_pooled_fields_more_reliable() {
        let title: &[u8] = b"My Great Demo Soundtrack";

        let name1: &[u8] = b"Fran\xe7ois";
        let name2: &[u8] = b"";
        assert_eq!(
            Codepage::detect_from_fields(&[title, name1, name2]),
            Codepage::Latin1
        );

        let block_art: &[u8] = &[0xdbu8; 22];
        assert_eq!(
            Codepage::detect_from_fields(&[title, block_art, block_art]),
            Codepage::Cp437
        );
    }

    // A title followed by block-art rows and plain-text credit lines.
    #[test]
    fn detect_pooled_guru_mod_layout() {
        let title: &[u8] = b"guru";
        let art0: &[u8] = &GURU_ROW;
        let art1: &[u8] = &[0xdbu8; 19];
        let art2: &[u8] = &[
            0xdf, 0xdb, 0xdb, 0xdb, 0xdc, 0xdc, 0xdc, 0x20, 0x20, 0xdb, 0xdb, 0xdb, 0xdb, 0x20,
            0xdb, 0xdb, 0xdb, 0xdb,
        ];
        let line: &[u8] = &[0xc4u8; 21];
        let composed: &[u8] = b"Composed and performe";
        let by: &[u8] = b" by Scorpik of";
        let contact: &[u8] = b"Contact:";
        let address: &[u8] = b" 54-130 Wroclaw 62";

        let pooled: [&[u8]; 9] = [
            title, art0, art1, art2, line, composed, by, contact, address,
        ];
        assert_eq!(Codepage::detect_from_fields(&pooled), Codepage::Cp437);
    }

    // 0xDF and 0xDC sit in the block-glyph range but are also Latin-1 letters;
    // one or two of them must not flip the result to CP437.
    #[test]
    fn detect_does_not_misclassify_german_eszett() {
        let single: &[u8] = b"Gro\xdfmann";
        assert_eq!(Codepage::detect(single), Codepage::Latin1);

        let a: &[u8] = b"Stra\xdfe";
        let b: &[u8] = b"Gr\xf6\xdfe";
        assert_eq!(Codepage::detect_from_fields(&[a, b]), Codepage::Latin1);

        let upper: &[u8] = b"M\xdcLLER";
        assert_eq!(Codepage::detect(upper), Codepage::Latin1);
    }

    // Pure ASCII decodes to itself under every codepage.
    #[test]
    fn decode_ascii_is_identity() {
        assert_eq!(Codepage::Ascii.decode(b"hello"), "hello");
        assert_eq!(Codepage::Latin1.decode(b"hello"), "hello");
        assert_eq!(Codepage::Cp437.decode(b"hello"), "hello");
    }

    #[test]
    fn decode_cp437_box_glyphs() {
        let glyphs: [(u8, &str); 5] = [
            (0xdb, "█"),
            (0xdc, "▄"),
            (0xdf, "▀"),
            (0xc4, "─"),
            (0xcd, "═"),
        ];
        for &(byte, expected) in &glyphs {
            assert_eq!(Codepage::Cp437.decode(&[byte]), expected);
        }

        let s = Codepage::Cp437.decode(&GURU_ROW);
        assert!(s.starts_with('▄'));
        assert!(s.contains('▀'));
        assert!(s.contains('█'));
        assert!(s.contains(' '));
    }

    #[test]
    fn decode_latin1_accents() {
        let cases: [(&[u8], &str); 3] = [
            (b"Caf\xe9", "Café"),
            (b"Fran\xe7ois", "François"),
            (b"\xc4rger", "Ärger"),
        ];
        for &(bytes, expected) in &cases {
            assert_eq!(Codepage::Latin1.decode(bytes), expected);
        }
    }

    #[test]
    fn decode_cp437_high_letters() {
        assert_eq!(Codepage::Cp437.decode(b"Caf\x82"), "Café");
        assert_eq!(Codepage::Cp437.decode(&[0xe1]), "ß");
    }

    // Names end at the first NUL and lose trailing spaces; leading spaces stay.
    #[test]
    fn decode_name_strips_nul_and_trailing_space() {
        let ascii_cases: [(&[u8], &str); 4] = [
            (b" Lead\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", " Lead"),
            (b"Lead ", "Lead"),
            (b"Lead \0\0\0", "Lead"),
            (b" POLAND\0\0\0\0\0\0\0\0\0", " POLAND"),
        ];
        for &(bytes, expected) in &ascii_cases {
            assert_eq!(Codepage::Ascii.decode_name(bytes), expected);
        }

        let bytes: &[u8] = &[0xdb, 0xdb, 0xdb, 0x00, 0x00];
        let s = Codepage::Cp437.decode_name(bytes);
        assert_eq!(s, "███");
    }

    #[test]
    fn autodetect_round_trip_guru() {
        let s = decode_autodetect(&GURU_ROW_NUL);
        assert!(!s.contains('\u{FFFD}'));
        assert!(!s.contains('\0'));
        assert_eq!(s, s.trim());
    }

    // Every table slot must hold a real character, never a placeholder.
    #[test]
    fn cp437_table_has_no_replacement_chars() {
        for (i, &c) in CP437_HIGH.iter().enumerate() {
            assert_ne!(c, '\u{FFFD}', "entry 0x{:02x} is U+FFFD", 0x80 + i);
            assert_ne!(c, '\0', "entry 0x{:02x} is NUL", 0x80 + i);
        }
    }
}