textcode 0.3.1

Text encoding/decoding library. Supports: UTF-8, ISO6937, ISO8859, GB2312
Documentation
use textcode::{
    Iso6937,
    decode,
    decode_to_slice,
    encode,
    encode_to_slice,
};

/// Round-trip check for the `Iso6937` codec.
///
/// Every table row pairs an ISO 6937 byte sequence with the text it stands for.
/// The first pass encodes the text with `encode` and `encode_to_slice`; the
/// second pass decodes the bytes with `decode` and `decode_to_slice`. In both
/// passes the slice-based variant must produce the same bytes as the
/// allocating variant.
#[test]
fn test_iso6937() {
    // (ISO 6937 bytes, expected text). Accented letters are two bytes long:
    // a prefix byte in the 0xc1..=0xcf range followed by the base letter.
    let cases: &[(&[u8], &str)] = &[
        // Plain ASCII passes through unchanged.
        (
            &[0x69, 0x73, 0x6f, 0x36, 0x39, 0x33, 0x37, 0x0a],
            "iso6937\n",
        ),
        // Single-byte symbols from the upper half of the table.
        (&[0xa4, 0xd5], "€♪"),
        // 0xc1 prefix: grave accent.
        (
            &[
                0xc1, 0x41, 0xc1, 0x45, 0xc1, 0x49, 0xc1, 0x4f, 0xc1, 0x55, 0xc1, 0x61, 0xc1, 0x65,
                0xc1, 0x69, 0xc1, 0x6f, 0xc1, 0x75,
            ],
            "ÀÈÌÒÙàèìòù",
        ),
        // 0xc2 prefix: acute accent (note that 0xc2 0x67 maps to 'ģ').
        (
            &[
                0xc2, 0x41, 0xc2, 0x43, 0xc2, 0x45, 0xc2, 0x49, 0xc2, 0x4c, 0xc2, 0x4e, 0xc2, 0x4f,
                0xc2, 0x52, 0xc2, 0x53, 0xc2, 0x55, 0xc2, 0x59, 0xc2, 0x5a, 0xc2, 0x61, 0xc2, 0x63,
                0xc2, 0x65, 0xc2, 0x67, 0xc2, 0x69, 0xc2, 0x6c, 0xc2, 0x6e, 0xc2, 0x6f, 0xc2, 0x72,
                0xc2, 0x73, 0xc2, 0x75, 0xc2, 0x79, 0xc2, 0x7a,
            ],
            "ÁĆÉÍĹŃÓŔŚÚÝŹáćéģíĺńóŕśúýź",
        ),
        // 0xc3 prefix: circumflex.
        (
            &[
                0xc3, 0x41, 0xc3, 0x43, 0xc3, 0x45, 0xc3, 0x47, 0xc3, 0x48, 0xc3, 0x49, 0xc3, 0x4a,
                0xc3, 0x4f, 0xc3, 0x53, 0xc3, 0x55, 0xc3, 0x57, 0xc3, 0x59, 0xc3, 0x61, 0xc3, 0x63,
                0xc3, 0x65, 0xc3, 0x67, 0xc3, 0x68, 0xc3, 0x69, 0xc3, 0x6a, 0xc3, 0x6f, 0xc3, 0x73,
                0xc3, 0x75, 0xc3, 0x77, 0xc3, 0x79,
            ],
            "ÂĈÊĜĤÎĴÔŜÛŴŶâĉêĝĥîĵôŝûŵŷ",
        ),
        // 0xc4 prefix: tilde.
        (
            &[
                0xc4, 0x41, 0xc4, 0x49, 0xc4, 0x4e, 0xc4, 0x4f, 0xc4, 0x55, 0xc4, 0x61, 0xc4, 0x69,
                0xc4, 0x6e, 0xc4, 0x6f, 0xc4, 0x75,
            ],
            "ÃĨÑÕŨãĩñõũ",
        ),
        // 0xc5 prefix: macron.
        (
            &[
                0xc5, 0x41, 0xc5, 0x45, 0xc5, 0x49, 0xc5, 0x4f, 0xc5, 0x55, 0xc5, 0x61, 0xc5, 0x65,
                0xc5, 0x69, 0xc5, 0x6f, 0xc5, 0x75,
            ],
            "ĀĒĪŌŪāēīōū",
        ),
        // 0xc6 prefix: breve.
        (
            &[
                0xc6, 0x41, 0xc6, 0x47, 0xc6, 0x55, 0xc6, 0x61, 0xc6, 0x67, 0xc6, 0x75,
            ],
            "ĂĞŬăğŭ",
        ),
        // 0xc7 prefix: dot above.
        (
            &[
                0xc7, 0x43, 0xc7, 0x45, 0xc7, 0x47, 0xc7, 0x49, 0xc7, 0x5a, 0xc7, 0x63, 0xc7, 0x65,
                0xc7, 0x67, 0xc7, 0x7a,
            ],
            "ĊĖĠİŻċėġż",
        ),
        // 0xc8 prefix: diaeresis.
        (
            &[
                0xc8, 0x41, 0xc8, 0x45, 0xc8, 0x49, 0xc8, 0x4f, 0xc8, 0x55, 0xc8, 0x59, 0xc8, 0x61,
                0xc8, 0x65, 0xc8, 0x69, 0xc8, 0x6f, 0xc8, 0x75, 0xc8, 0x79,
            ],
            "ÄËÏÖÜŸäëïöüÿ",
        ),
        // 0xca prefix: ring above.
        (
            &[0xca, 0x41, 0xca, 0x55, 0xca, 0x61, 0xca, 0x75],
            "ÅŮåů",
        ),
        // 0xcb prefix: cedilla.
        (
            &[
                0xcb, 0x43, 0xcb, 0x47, 0xcb, 0x4b, 0xcb, 0x4c, 0xcb, 0x4e, 0xcb, 0x52, 0xcb, 0x53,
                0xcb, 0x54, 0xcb, 0x63, 0xcb, 0x6b, 0xcb, 0x6c, 0xcb, 0x6e, 0xcb, 0x72, 0xcb, 0x73,
                0xcb, 0x74,
            ],
            "ÇĢĶĻŅŖŞŢçķļņŗşţ",
        ),
        // 0xcd prefix: double acute.
        (
            &[0xcd, 0x4f, 0xcd, 0x55, 0xcd, 0x6f, 0xcd, 0x75],
            "ŐŰőű",
        ),
        // 0xce prefix: ogonek.
        (
            &[
                0xce, 0x41, 0xce, 0x45, 0xce, 0x49, 0xce, 0x55, 0xce, 0x61, 0xce, 0x65, 0xce, 0x69,
                0xce, 0x75,
            ],
            "ĄĘĮŲąęįų",
        ),
        // 0xcf prefix: caron.
        (
            &[
                0xcf, 0x43, 0xcf, 0x44, 0xcf, 0x45, 0xcf, 0x4c, 0xcf, 0x4e, 0xcf, 0x52, 0xcf, 0x53,
                0xcf, 0x54, 0xcf, 0x5a, 0xcf, 0x63, 0xcf, 0x64, 0xcf, 0x65, 0xcf, 0x6c, 0xcf, 0x6e,
                0xcf, 0x72, 0xcf, 0x73, 0xcf, 0x74, 0xcf, 0x7a,
            ],
            "ČĎĚĽŇŘŠŤŽčďěľňřšťž",
        ),
    ];

    // One scratch buffer is shared by every slice-based call below; only the
    // prefix reported by the call is ever inspected.
    let mut scratch = [0u8; 512];

    // Pass 1: text -> ISO 6937 bytes.
    for &(bytes, text) in cases {
        let encoded = encode::<Iso6937>(text);
        assert_eq!(encoded.as_slice(), bytes);

        let written = encode_to_slice::<Iso6937>(text, &mut scratch);
        assert_eq!(encoded, &scratch[..written]);
    }

    // Pass 2: ISO 6937 bytes -> text.
    for &(bytes, text) in cases {
        let decoded = decode::<Iso6937>(bytes);
        assert_eq!(text, decoded.as_str());

        let written = decode_to_slice::<Iso6937>(bytes, &mut scratch);
        assert_eq!(decoded.as_bytes(), &scratch[..written]);
    }
}

/// Checks how the slice-based functions report an output buffer that is too
/// small.
///
/// For both directions the test expects a returned length of 0 when the
/// 6-byte result does not fit into a 4-byte buffer, and a length of 6 when a
/// 16-byte buffer is supplied.
#[test]
fn test_iso6937_buffer_overflow() {
    // Three accented letters: 2 bytes each in ISO 6937 (prefix + base letter)
    // and 2 bytes each in UTF-8, so both directions produce 6 bytes.
    let text = "ÀÈÌ";
    let iso_bytes: &[u8] = &[0xc1, 0x41, 0xc1, 0x45, 0xc1, 0x49];

    // Encoding into 4 bytes: the 6-byte result does not fit.
    let encoded_len_small = encode_to_slice::<Iso6937>(text, &mut [0u8; 4]);
    assert_eq!(encoded_len_small, 0);

    // Encoding into 16 bytes: the whole result is written.
    let encoded_len_big = encode_to_slice::<Iso6937>(text, &mut [0u8; 16]);
    assert_eq!(encoded_len_big, 6);

    // Decoding into 4 bytes: the 6-byte UTF-8 result does not fit.
    let decoded_len_small = decode_to_slice::<Iso6937>(iso_bytes, &mut [0u8; 4]);
    assert_eq!(decoded_len_small, 0);

    // Decoding into 16 bytes: the whole result is written.
    let decoded_len_big = decode_to_slice::<Iso6937>(iso_bytes, &mut [0u8; 16]);
    assert_eq!(decoded_len_big, 6);
}

/// Checks the substitution behaviour for input that cannot be converted.
///
/// When encoding, the test expects every character outside ISO 6937 to come
/// out as a single `?`. When decoding, it expects a diacritical prefix
/// followed by a byte it cannot combine with to come out as one replacement
/// character.
#[test]
fn test_iso6937_fallback() {
    // (input text, expected ISO 6937 bytes)
    let unencodable: [(&str, &[u8]); 2] = [
        // The emoji has no ISO 6937 representation.
        ("Hello 😀", b"Hello ?"),
        // Neither do Cyrillic letters: four characters, four '?' bytes.
        ("Тест", b"????"),
    ];

    for (text, expected) in unencodable {
        let encoded = encode::<Iso6937>(text);
        assert_eq!(&encoded[..], expected);
    }

    // "Hi", then the 0xC1 diacritical prefix followed by '!' (0x21), which is
    // not a valid base letter, then one more '!'. The expected output shows
    // the prefix and the first '!' collapsing into a single '�' (U+FFFD).
    let broken_sequence: &[u8] = &[0x48, 0x69, 0xC1, 0x21, 0x21];
    let decoded = decode::<Iso6937>(broken_sequence);
    assert_eq!(decoded.as_str(), "Hi�!");
}