plsfix 0.1.8 - Docs.rs

/*
`plsfix::bad_codecs::sloppy` provides character-map encodings that fill their "holes"
in a messy but common way: by outputting the Unicode codepoints with the same
numbers.

This is incredibly ugly, and it's also in the HTML5 standard.

A single-byte encoding maps each byte to a Unicode character, except that some
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
example, bytes 0x81 and 0x8D, among others, have no meaning.

Rust, wanting to preserve some sense of decorum, will handle these bytes
as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
different from each other. It just hasn't defined what they are in terms of
Unicode.

Software that has to interoperate with Windows-1252 and Unicode -- such as all
the common Web browsers -- will pick some Unicode characters for them to map
to, and the characters they pick are the Unicode characters with the same
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
resulting characters tend to fall into a range of Unicode that's set aside for
obsolete Latin-1 control characters anyway.

These sloppy codecs do the same thing, thus interoperating with
other software that works this way. It defines a sloppy version of many
single-byte encodings with holes. (There is no need for a sloppy version of
an encoding without holes: for example, there is no such thing as
sloppy-iso-8859-2 or sloppy-macroman.)

The following encodings will become defined:

- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
- sloppy-windows-1251 (Cyrillic)
- sloppy-windows-1252 (Western European, based on Latin-1)
- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
- sloppy-windows-1256 (Arabic)
- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
- sloppy-windows-1258 (Vietnamese)
- sloppy-cp874 (Thai, based on ISO-8859-11)
- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
- sloppy-iso-8859-6 (different Arabic)
- sloppy-iso-8859-7 (Greek)
- sloppy-iso-8859-8 (Hebrew)
- sloppy-iso-8859-11 (Thai)

Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
defined.

Five of these encodings (`sloppy-windows-1250` through `sloppy-windows-1254`)
are used within plsfix.
*/
use encoding::Encoding as EncodingTrait;
use encoding::{all::ASCII as ASCIIBase, DecoderTrap};
use encoding_rs::{
    mem::decode_latin1, Encoding, ISO_8859_3, ISO_8859_6, ISO_8859_7, ISO_8859_8, WINDOWS_1250,
    WINDOWS_1251, WINDOWS_1252 as WINDOWS_1252Base, WINDOWS_1253, WINDOWS_1254, WINDOWS_1255,
    WINDOWS_1256, WINDOWS_1257, WINDOWS_1258,
};
use oem_cp::code_table::DECODING_TABLE_CP437;
use oem_cp::code_table::ENCODING_TABLE_CP437;
use oem_cp::decode_string_complete_table;
use oem_cp::encode_string_checked;
use rustc_hash::FxHashMap;

static REPLACEMENT_CHAR: char = '\u{FFFD}';

#[derive(Eq, Hash, PartialEq, Clone, Copy, Debug)]
pub enum CodecType {
    SloppyWindows1250,
    SloppyWindows1251,
    SloppyWindows1252,
    SloppyWindows1253,
    SloppyWindows1254,
    SloppyWindows1255,
    SloppyWindows1256,
    SloppyWindows1257,
    SloppyWindows1258,
    SloppyIso88593,
    SloppyIso88596,
    SloppyIso88597,
    SloppyIso88598,
    Latin1,
    Windows1252,
    Iso88592,
    MacRoman,
    Ascii,
    Utf8,
    Utf8Variant,
    Cp437,
}

pub trait Codec: Sync {
    fn name(&self) -> &'static str;
    fn codec_type(&self) -> CodecType;
    fn decode(&self, bytes: &[u8]) -> String;
    fn encode(&self, string: &str) -> Result<Vec<u8>, &'static str>;
}

pub struct SloppyCodec {
    name: &'static str,
    codec_type: CodecType,
    decoded_chars: Vec<char>,
    encoded_bytes: FxHashMap<char, u8>,
}

impl Codec for SloppyCodec {
    fn name(&self) -> &'static str {
        self.name
    }

    fn codec_type(&self) -> CodecType {
        self.codec_type
    }

    fn decode(&self, bytes: &[u8]) -> String {
        bytes
            .iter()
            .map(|&b| self.decoded_chars[b as usize])
            .collect()
    }

    fn encode(&self, string: &str) -> Result<Vec<u8>, &'static str> {
        Ok(string
            .chars()
            .map(|c| *self.encoded_bytes.get(&c).unwrap_or(&0x1A)) // if there is no such character, we use the replacement char
            .collect())
    }
}

#[derive(Eq, Hash, PartialEq)]
pub struct StandardCodec {
    name: &'static str,
    codec_type: CodecType,
    encoding: &'static Encoding,
}

impl Codec for StandardCodec {
    fn name(&self) -> &'static str {
        self.name
    }

    fn codec_type(&self) -> CodecType {
        self.codec_type
    }

    fn decode(&self, bytes: &[u8]) -> String {
        self.encoding.decode(bytes).0.into_owned()
    }

    fn encode(&self, string: &str) -> Result<Vec<u8>, &'static str> {
        Ok(self.encoding.encode(string).0.into_owned())
    }
}

pub struct AsciiCodec {
    name: &'static str,
    codec_type: CodecType,
}

impl Codec for AsciiCodec {
    fn name(&self) -> &'static str {
        self.name
    }

    fn codec_type(&self) -> CodecType {
        self.codec_type
    }

    fn decode(&self, bytes: &[u8]) -> String {
        ASCIIBase.decode(bytes, DecoderTrap::Ignore).unwrap()
    }

    fn encode(&self, string: &str) -> Result<Vec<u8>, &'static str> {
        ASCIIBase
            .encode(string, encoding::EncoderTrap::Ignore)
            .map_err(|_| "Error encoding ASCII")
    }
}

pub struct Cp437Codec {
    name: &'static str,
    codec_type: CodecType,
}

impl Codec for Cp437Codec {
    fn name(&self) -> &'static str {
        self.name
    }

    fn codec_type(&self) -> CodecType {
        self.codec_type
    }

    fn decode(&self, bytes: &[u8]) -> String {
        // encode_string_lossy(bytes, &ENCODING_TABLE_CP437)
        decode_string_complete_table(bytes, &DECODING_TABLE_CP437)
    }

    fn encode(&self, string: &str) -> Result<Vec<u8>, &'static str> {
        encode_string_checked(string, &ENCODING_TABLE_CP437).ok_or("Character not in CP437")
    }
}

pub struct Latin1Codec {
    name: &'static str,
    codec_type: CodecType,
}

impl Codec for Latin1Codec {
    fn name(&self) -> &'static str {
        self.name
    }

    fn codec_type(&self) -> CodecType {
        self.codec_type
    }

    fn decode(&self, bytes: &[u8]) -> String {
        bytes.iter().map(|&c| c as char).collect()
    }

    fn encode(&self, string: &str) -> Result<Vec<u8>, &'static str> {
        string
            .chars()
            .map(|c| {
                let char_code = c as u32;
                if char_code <= 0xFF {
                    Ok(char_code as u8)
                } else {
                    Err("Character out of latin1 range encountered")
                }
            })
            .collect()
    }
}

fn make_sloppy_codec(
    name: &'static str,
    codec_type: CodecType,
    base_encoding: &'static Encoding,
) -> SloppyCodec {
    /*
    Take a codec name, and return a 'sloppy' version of that codec that can
    encode and decode the unassigned bytes in that encoding.

    Single-byte encodings in the standard library are defined using some
    boilerplate classes surrounding the functions that do the actual work,
    `codecs.charmap_decode` and `charmap_encode`. This function, given an
    encoding name, *defines* those boilerplate classes.
    */
    let all_bytes: Vec<u8> = (0..=255).collect();

    // Get a list of what each byte would decode to in Latin-1.
    let mut sloppy_chars: Vec<char> = decode_latin1(&all_bytes).chars().collect();
    let mut encoded_bytes: FxHashMap<char, u8> = FxHashMap::default();

    // Get a list of what they decode to in the given encoding. Use the
    // replacement character for unassigned bytes.
    for (&byte, decoded_char) in all_bytes.iter().zip(sloppy_chars.iter_mut()) {
        *decoded_char = base_encoding
            .decode(&[byte])
            .0
            .chars()
            .next()
            .unwrap_or(REPLACEMENT_CHAR);
        encoded_bytes.insert(*decoded_char, byte); // here we store the byte value of each char in a HashMap
    }

    // For our own purposes, we're going to allow byte 1A, the "Substitute"
    // control code, to encode the Unicode replacement character U+FFFD.
    sloppy_chars[0x1A] = REPLACEMENT_CHAR;

    SloppyCodec {
        name: &name,
        codec_type,
        decoded_chars: sloppy_chars,
        encoded_bytes,
    }
}

lazy_static! {
    pub static ref SLOPPY_WINDOWS_1250: SloppyCodec = make_sloppy_codec(
        "sloppy-windows-1250",
        CodecType::SloppyWindows1250,
        &WINDOWS_1250
    );
    pub static ref SLOPPY_WINDOWS_1251: SloppyCodec = make_sloppy_codec(
        "sloppy-windows-1251",
        CodecType::SloppyWindows1251,
        &WINDOWS_1251
    );
    pub static ref SLOPPY_WINDOWS_1252: SloppyCodec = make_sloppy_codec(
        "sloppy-windows-1252",
        CodecType::SloppyWindows1252,
        &WINDOWS_1252Base
    );
    pub static ref SLOPPY_WINDOWS_1253: SloppyCodec = make_sloppy_codec(
        "sloppy-windows-1253",
        CodecType::SloppyWindows1253,
        &WINDOWS_1253
    );
    pub static ref SLOPPY_WINDOWS_1254: SloppyCodec = make_sloppy_codec(
        "sloppy-windows-1254",
        CodecType::SloppyWindows1254,
        &WINDOWS_1254
    );
    pub static ref SLOPPY_WINDOWS_1255: SloppyCodec = make_sloppy_codec(
        "sloppy-windows-1255",
        CodecType::SloppyWindows1255,
        &WINDOWS_1255
    );
    pub static ref SLOPPY_WINDOWS_1256: SloppyCodec = make_sloppy_codec(
        "sloppy-windows-1256",
        CodecType::SloppyWindows1256,
        &WINDOWS_1256
    );
    pub static ref SLOPPY_WINDOWS_1257: SloppyCodec = make_sloppy_codec(
        "sloppy-windows-1257",
        CodecType::SloppyWindows1257,
        &WINDOWS_1257
    );
    pub static ref SLOPPY_WINDOWS_1258: SloppyCodec = make_sloppy_codec(
        "sloppy-windows-1258",
        CodecType::SloppyWindows1258,
        &WINDOWS_1258
    );
    pub static ref SLOPPY_ISO_8859_3: SloppyCodec =
        make_sloppy_codec("sloppy-iso-8859-3", CodecType::SloppyIso88593, &ISO_8859_3);
    pub static ref SLOPPY_ISO_8859_6: SloppyCodec =
        make_sloppy_codec("sloppy-iso-8859-6", CodecType::SloppyIso88596, &ISO_8859_6);
    pub static ref SLOPPY_ISO_8859_7: SloppyCodec =
        make_sloppy_codec("sloppy-iso-8859-7", CodecType::SloppyIso88597, &ISO_8859_7);
    pub static ref SLOPPY_ISO_8859_8: SloppyCodec =
        make_sloppy_codec("sloppy-iso-8859-8", CodecType::SloppyIso88598, &ISO_8859_8);
    pub static ref WINDOWS_1252: StandardCodec = StandardCodec {
        name: "windows-1252",
        codec_type: CodecType::Windows1252,
        encoding: &encoding_rs::WINDOWS_1252,
    };
    pub static ref ISO_8859_2: StandardCodec = StandardCodec {
        name: "iso-8859-2",
        codec_type: CodecType::Iso88592,
        encoding: &encoding_rs::ISO_8859_2,
    };
    pub static ref MACROMAN: StandardCodec = StandardCodec {
        name: "macroman",
        codec_type: CodecType::MacRoman,
        encoding: &encoding_rs::MACINTOSH,
    };
    pub static ref ASCII: AsciiCodec = AsciiCodec {
        name: "ascii",
        codec_type: CodecType::Ascii,
    };
    pub static ref LATIN_1: Latin1Codec = Latin1Codec {
        name: "latin-1",
        codec_type: CodecType::Latin1,
    };
    pub static ref CP437: Cp437Codec = Cp437Codec {
        name: "cp437",
        codec_type: CodecType::Cp437,
    };
}