trivet 3.1.0

The trivet Parser Library
Documentation
// Trivet
// Copyright (c) 2025 by Stacy Prowell.  All rights reserved.
// https://gitlab.com/binary-tools/trivet

//! Tests of the decoder module.

use crate::decoder::Decode;
use crate::decoder::Decoder;

// Every byte fixture below is expected to decode to the same 56 code points.

// Number of code points in `TEXT`; the tests use it as the expected character
// count when decoding any of the byte fixtures.
const LENGTH: usize = 56;
// The fixture text encoded as UTF-8 with no byte order mark.  The first line uses
// multi-byte (two and three octet) characters; the second line is plain ASCII.
const BYTES: &[u8] = &[
    0xe1, 0xb8, 0xbc, 0xc6, 0xa1, 0xe1, 0xb6, 0x89, 0xc3, 0xab, 0xe1, 0xb6, 0x86, 0x20, 0xc8, 0x8b,
    0xe1, 0xb9, 0x95, 0xc5, 0xa1, 0xe1, 0xb6, 0x99, 0xe1, 0xb9, 0x81, 0x20, 0xe1, 0xb8, 0x8d, 0xe1,
    0xbb, 0xa1, 0xe1, 0xb8, 0xbd, 0xc7, 0xad, 0xe1, 0xb5, 0xb3, 0x20, 0xca, 0x82, 0xc7, 0x90, 0xc5,
    0xa5, 0x20, 0xd3, 0x93, 0xe1, 0xb9, 0x81, 0xe1, 0xbb, 0x87, 0xe1, 0xba, 0x97, 0x2e, 0x0a, 0x4c,
    0x6f, 0x72, 0x65, 0x6d, 0x20, 0x69, 0x70, 0x73, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72,
    0x20, 0x73, 0x69, 0x74, 0x20, 0x61, 0x6d, 0x65, 0x74, 0x2e, 0x0a,
];
// The same UTF-8 payload as `BYTES`, preceded by the three-octet UTF-8 byte order
// mark.  The tests expect the mark to be consumed rather than decoded as a character.
const BYTES_BOM: &[u8] = &[
    0xef, 0xbb, 0xbf, // UTF-8 BOM
    0xe1, 0xb8, 0xbc, 0xc6, 0xa1, 0xe1, 0xb6, 0x89, 0xc3, 0xab, 0xe1, 0xb6, 0x86, 0x20, 0xc8, 0x8b,
    0xe1, 0xb9, 0x95, 0xc5, 0xa1, 0xe1, 0xb6, 0x99, 0xe1, 0xb9, 0x81, 0x20, 0xe1, 0xb8, 0x8d, 0xe1,
    0xbb, 0xa1, 0xe1, 0xb8, 0xbd, 0xc7, 0xad, 0xe1, 0xb5, 0xb3, 0x20, 0xca, 0x82, 0xc7, 0x90, 0xc5,
    0xa5, 0x20, 0xd3, 0x93, 0xe1, 0xb9, 0x81, 0xe1, 0xbb, 0x87, 0xe1, 0xba, 0x97, 0x2e, 0x0a, 0x4c,
    0x6f, 0x72, 0x65, 0x6d, 0x20, 0x69, 0x70, 0x73, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72,
    0x20, 0x73, 0x69, 0x74, 0x20, 0x61, 0x6d, 0x65, 0x74, 0x2e, 0x0a,
];
// The fixture text encoded as UTF-16, little endian.  The leading 0xff 0xfe pair is
// the byte order mark; every following pair is one code unit, low byte first.
const BYTES_UTF16LE: &[u8] = &[
    0xff, 0xfe, 0x3c, 0x1e, 0xa1, 0x01, 0x89, 0x1d, 0xeb, 0x00, 0x86, 0x1d, 0x20, 0x00, 0x0b, 0x02,
    0x55, 0x1e, 0x61, 0x01, 0x99, 0x1d, 0x41, 0x1e, 0x20, 0x00, 0x0d, 0x1e, 0xe1, 0x1e, 0x3d, 0x1e,
    0xed, 0x01, 0x73, 0x1d, 0x20, 0x00, 0x82, 0x02, 0xd0, 0x01, 0x65, 0x01, 0x20, 0x00, 0xd3, 0x04,
    0x41, 0x1e, 0xc7, 0x1e, 0x97, 0x1e, 0x2e, 0x00, 0x0a, 0x00, 0x4c, 0x00, 0x6f, 0x00, 0x72, 0x00,
    0x65, 0x00, 0x6d, 0x00, 0x20, 0x00, 0x69, 0x00, 0x70, 0x00, 0x73, 0x00, 0x75, 0x00, 0x6d, 0x00,
    0x20, 0x00, 0x64, 0x00, 0x6f, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x20, 0x00, 0x73, 0x00,
    0x69, 0x00, 0x74, 0x00, 0x20, 0x00, 0x61, 0x00, 0x6d, 0x00, 0x65, 0x00, 0x74, 0x00, 0x2e, 0x00,
    0x0a, 0x00,
];
// The fixture text encoded as UTF-16, big endian.  The leading 0xfe 0xff pair is
// the byte order mark; every following pair is one code unit, high byte first.
const BYTES_UTF16BE: &[u8] = &[
    0xfe, 0xff, 0x1e, 0x3c, 0x01, 0xa1, 0x1d, 0x89, 0x00, 0xeb, 0x1d, 0x86, 0x00, 0x20, 0x02, 0x0b,
    0x1e, 0x55, 0x01, 0x61, 0x1d, 0x99, 0x1e, 0x41, 0x00, 0x20, 0x1e, 0x0d, 0x1e, 0xe1, 0x1e, 0x3d,
    0x01, 0xed, 0x1d, 0x73, 0x00, 0x20, 0x02, 0x82, 0x01, 0xd0, 0x01, 0x65, 0x00, 0x20, 0x04, 0xd3,
    0x1e, 0x41, 0x1e, 0xc7, 0x1e, 0x97, 0x00, 0x2e, 0x00, 0x0a, 0x00, 0x4c, 0x00, 0x6f, 0x00, 0x72,
    0x00, 0x65, 0x00, 0x6d, 0x00, 0x20, 0x00, 0x69, 0x00, 0x70, 0x00, 0x73, 0x00, 0x75, 0x00, 0x6d,
    0x00, 0x20, 0x00, 0x64, 0x00, 0x6f, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x20, 0x00, 0x73,
    0x00, 0x69, 0x00, 0x74, 0x00, 0x20, 0x00, 0x61, 0x00, 0x6d, 0x00, 0x65, 0x00, 0x74, 0x00, 0x2e,
    0x00, 0x0a,
];
// The text every byte fixture is expected to decode to: a line written with accented
// (non-ASCII) letters followed by a plain ASCII line, each terminated by a newline.
const TEXT: &str = "Ḽơᶉëᶆ ȋṕšᶙṁ ḍỡḽǭᵳ ʂǐť ӓṁệẗ.\nLorem ipsum dolor sit amet.\n";

#[test]
fn read_eof() {
    // An empty stream has nothing to decode.  Asking twice checks that the
    // end-of-stream answer is stable and that repeated reads do not fail.
    let mut empty = Decode::new(b"".to_vec());
    for _ in 0..2 {
        assert_eq!(empty.next(), None);
    }
}

#[test]
fn read_eof_2() {
    // Consume all four characters of a short stream, then keep reading past the
    // end.  The extra reads must quietly report end-of-stream.
    let mut stream = Decode::new(b"xxxx".to_vec());
    for _ in 0..4 {
        assert_eq!(stream.next(), Some('x'));
    }
    for _ in 0..2 {
        assert_eq!(stream.next(), None);
    }
}

#[test]
fn check_utf8_no_bom() {
    // Decode the UTF-8 fixture that carries no byte order mark and compare the
    // collected characters with the expected text.
    let decoded = String::from_iter(Decode::new(BYTES.to_vec()));
    assert_eq!(decoded, TEXT);
}

#[test]
fn check_utf8_bom() {
    // Decode the UTF-8 fixture that starts with a byte order mark.  The result
    // must equal the expected text, so the mark itself must not show up in it.
    let decoded = String::from_iter(Decode::new(BYTES_BOM.to_vec()));
    assert_eq!(decoded, TEXT);
}

#[test]
fn check_utf16le() {
    // Decode the little-endian UTF-16 fixture and compare the collected
    // characters with the expected text.
    let decoded = String::from_iter(Decode::new(BYTES_UTF16LE.to_vec()));
    assert_eq!(decoded, TEXT);
}

#[test]
fn check_utf16be() {
    // Decode the big-endian UTF-16 fixture and compare the collected
    // characters with the expected text.
    let decoded = String::from_iter(Decode::new(BYTES_UTF16BE.to_vec()));
    assert_eq!(decoded, TEXT);
}

#[test]
fn check_invalid_utf8() {
    // The sample sequences are borrowed from the PHP documentation:
    // https://www.php.net/manual/en/reference.pcre.pattern.modifiers.php

    // Well-formed encodings of a single code point, using two, three, and four octets.
    let well_formed: [&[u8]; 3] = [b"\xc3\xb1", b"\xe2\x82\xa1", b"\xf0\x90\x8c\xbc"];
    // Malformed sequences, each annotated with what is wrong with it.
    let malformed: [&[u8]; 10] = [
        b"\xc3\x28",                 // bad continuation octet
        b"\xa0\xa1",                 // bad leading octet (sequence identifier)
        b"\xe2\x28\xa1",             // bad second octet
        b"\xe2\x82\x28",             // bad third octet
        b"\xf0\x28\x8c\xbc",         // bad second octet
        b"\xf0\x90\x28\xbc",         // bad third octet
        b"\xf0\x28\x8c\x28",         // bad fourth octet
        b"\xf8\xa1\xa1\xa1\xa1",     // five-octet form: not Unicode
        b"\xfc\xa1\xa1\xa1\xa1\xa1", // six-octet form: not Unicode
        b"\xf0\x90\x8c",             // truncated: final octet missing
    ];

    // Every well-formed sequence must decode to the same character that the
    // standard library produces for it.
    for sequence in well_formed {
        let mut decoder = Decode::new(sequence.to_vec());
        let decoded = match decoder.next() {
            Some(ch) => ch,
            None => panic!("Failed at {:?}", sequence),
        };
        let expected = std::str::from_utf8(sequence).unwrap();
        assert_eq!(expected, decoded.to_string());
    }

    // A malformed sequence may yield the replacement character, or nothing at all,
    // but never an ordinary character.
    for sequence in malformed {
        let mut decoder = Decode::new(sequence.to_vec());
        match decoder.next() {
            Some(c) if c != char::REPLACEMENT_CHARACTER => {
                panic!("Failed at {:?}, got {:?}", sequence, c)
            }
            _ => {}
        }
        // Sanity check on the table itself rather than on Rust: the standard
        // library must reject each of these sequences too.
        assert!(
            std::str::from_utf8(sequence).is_err(),
            "Rust incorrectly decoded invalid string {:?}",
            sequence
        );
    }
}

#[test]
fn check_invalid_utf16() {
    // Every input is a UTF-16 byte order mark, one valid character (a space), and
    // then a high surrogate (0xD800) that is not followed by a low surrogate.  The
    // decoder is expected to yield the space, then a single replacement character
    // for the broken pair, and then report the end of the stream.
    let cases: [&[u8]; 4] = [
        // Little endian; the stream ends right after the high surrogate.
        b"\xff\xfe\x20\x00\x00\xd8",
        // Big endian; the stream ends right after the high surrogate.
        b"\xfe\xff\x00\x20\xd8\x00",
        // Little endian; the high surrogate is followed by an out-of-range unit (0x0000).
        b"\xff\xfe\x20\x00\x00\xd8\x00\x00",
        // Big endian; the high surrogate is followed by an out-of-range unit (0x0000).
        b"\xfe\xff\x00\x20\xd8\x00\x00\x00",
    ];
    for bytes in cases {
        let mut decoder = Decode::new(bytes.to_vec());
        assert_eq!(decoder.next(), Some(' '), "input {:?}", bytes);
        assert_eq!(
            decoder.next(),
            Some(char::REPLACEMENT_CHARACTER),
            "input {:?}",
            bytes
        );
        assert_eq!(decoder.next(), None, "input {:?}", bytes);
    }

    // Repeat the last case, but use fill_n instead.  We should still get two
    // characters, with the second being the replacement character.  Checking the
    // count alone would not notice a wrong character, so the filled prefix of the
    // buffer is compared as well.
    let mut target = ['\0'; 10];
    let mut decoder = Decode::new(cases[3].to_vec());
    assert_eq!(decoder.fill_n(10, &mut target), 2);
    assert_eq!(target[..2], [' ', char::REPLACEMENT_CHARACTER]);
}

#[test]
fn valid_utf16_surrogate() {
    // The surrogate pair 0xD800 0xDC37 encodes U+10037.  Feed it to the decoder in
    // little-endian and then big-endian byte order (each with its byte order mark);
    // both must yield exactly that one character.
    let encodings: [&[u8]; 2] = [
        b"\xff\xfe\x00\xd8\x37\xdc",
        b"\xfe\xff\xd8\x00\xdc\x37",
    ];
    for bytes in encodings {
        let mut decoder = Decode::new(bytes.to_vec());
        assert_eq!(decoder.next(), Some('\u{10037}'));
        assert_eq!(decoder.next(), None);
    }
}

#[test]
fn chunks() {
    // Exercises the bulk-read methods `next_n` and `fill_n` against every encoding
    // of the fixture text.  The same checks are applied to each encoding.

    // Size of the scratch buffer handed to `fill_n`, and the sentinel it starts out
    // filled with so that slots the decoder did not write remain recognizable.
    const CAPACITY: usize = 130;
    const SENTINEL: char = '-';

    // Label (used only in failure messages) and bytes for each encoding.
    let fixtures: [(&str, &[u8]); 4] = [
        ("UTF-8", BYTES),
        ("UTF-8 with BOM", BYTES_BOM),
        ("UTF-16 BE", BYTES_UTF16BE),
        ("UTF-16 LE", BYTES_UTF16LE),
    ];
    let chars: Vec<char> = TEXT.chars().collect();

    // Test retrieving a chunk.  Asking for more characters than the stream holds
    // must return just the characters that exist.
    for (label, bytes) in fixtures {
        let mut decoder = Decode::new(bytes.to_vec());
        assert_eq!(
            decoder.next_n(LENGTH * 2 + 7).len(),
            LENGTH,
            "{}: next_n length",
            label
        );
    }

    // Test filling.
    for (label, bytes) in fixtures {
        // First take only the leading eight characters.
        let mut decoder = Decode::new(bytes.to_vec());
        let mut array = [SENTINEL; CAPACITY];
        assert_eq!(decoder.fill_n(8, &mut array), 8, "{}: first fill count", label);
        assert_eq!(
            String::from_iter(&array[0..8]),
            String::from_iter(&chars[0..8]),
            "{}: first fill contents",
            label
        );
        // Slots 0..8 were written, so slot 8 is the first that must still hold the
        // sentinel.  (Starting this check at 9 would leave slot 8 unverified.)
        assert_eq!(
            array[8..CAPACITY],
            [SENTINEL; CAPACITY - 8],
            "{}: first fill wrote past its count",
            label
        );

        // Then ask for more than remains.  The rest of the stream is written from
        // the start of the buffer again, and the count reflects only what was left.
        assert_eq!(
            decoder.fill_n(LENGTH, &mut array),
            LENGTH - 8,
            "{}: second fill count",
            label
        );
        assert_eq!(
            String::from_iter(&array[0..LENGTH - 8]),
            String::from_iter(&chars[8..LENGTH]),
            "{}: second fill contents",
            label
        );
        assert_eq!(
            array[LENGTH - 8..CAPACITY],
            [SENTINEL; CAPACITY - (LENGTH - 8)],
            "{}: second fill wrote past its count",
            label
        );

        // Finally, with a fresh decoder, request far more than the stream (and the
        // buffer) can hold.  Only the characters that exist may be written.
        let mut decoder = Decode::new(bytes.to_vec());
        assert_eq!(
            decoder.fill_n(LENGTH * 6 + 7, &mut array),
            LENGTH,
            "{}: full fill count",
            label
        );
        assert_eq!(
            String::from_iter(&array[0..LENGTH]),
            String::from_iter(&chars),
            "{}: full fill contents",
            label
        );
        assert_eq!(
            array[LENGTH..CAPACITY],
            [SENTINEL; CAPACITY - LENGTH],
            "{}: full fill wrote past its count",
            label
        );
    }
}

#[test]
fn high_surrogates() {
    // Check a code point above U+FFFF, which needs four octets in UTF-8 and a
    // surrogate pair in UTF-16.  U+1034A was picked because many fonts can display
    // it.  (On Linux it can be typed by holding ctrl+shift, typing u1034a, and
    // releasing.)
    let expected: Vec<char> = "𐍊 7".chars().collect();
    // The same three characters in each supported encoding.
    let encodings: [&[u8]; 3] = [
        // UTF-8, no BOM
        b"\xf0\x90\x8d\x8a\x20\x37",
        // UTF-16, big endian
        b"\xfe\xff\xd8\x00\xdf\x4a\x00\x20\x00\x37",
        // UTF-16, little endian
        b"\xff\xfe\x00\xd8\x4a\xdf\x20\x00\x37\x00",
    ];
    for encoding in encodings {
        let mut decoder = Decode::new(encoding.to_vec());
        // Requesting ten characters must return just the three that exist.
        assert_eq!(decoder.next_n(10), expected);
    }
}