extern crate stackfmt;
extern crate std;
use utf8conv::*;
use rand::SeedableRng;
use rand::rngs::SmallRng;
use rand::RngCore;
use core::str;
/// Checks the `utf8_to_char_with_iter` decoding path against a reference string.
///
/// The parser is reset, then the four buffers are fed to it in order
/// (`b1`, `b2`, `b3`, `b4`); only `b4` is flagged as the last buffer. Each buffer
/// is wrapped with `utf8_ref_iter_to_utf8_iter` before being handed to the
/// parser. Every decoded character is compared, as a `u32` code value, with the
/// next character of `truth`.
///
/// # Panics
///
/// Panics with a diagnostic formatted by `stackfmt::fmt_truncate` into a
/// 12000-byte stack buffer when:
/// * a decoded character differs from the corresponding character of `truth`,
/// * the parser yields more characters than `truth` contains, or
/// * `truth` still has characters left once the last buffer is exhausted.
fn verify_style3(par: &mut FromUtf8, b1: &[u8], b2: &[u8], b3: &[u8], b4: &[u8], truth: &str) {
    let mut panic_buf = [0u8; 12000];
    // Characters decoded so far; this is also the 0-based index of the next
    // character expected from `truth`.
    let mut test_char_len: usize = 0;
    let mut truth_iter = truth.chars();
    par.reset_parser();
    par.set_is_last_buffer(false);
    let buffers: [&[u8]; 4] = [b1, b2, b3, b4];
    for (stage, buffer) in buffers.iter().copied().enumerate() {
        if stage + 1 == buffers.len() {
            // The parser must know that nothing follows the fourth buffer.
            par.set_is_last_buffer(true);
        }
        let mut byte_ref_iter = buffer.iter();
        let mut glue_iter = utf8_ref_iter_to_utf8_iter(&mut byte_ref_iter);
        for test_ch in par.utf8_to_char_with_iter(&mut glue_iter) {
            let test_utf32 = test_ch as u32;
            match truth_iter.next() {
                Some(truth_ch) => {
                    let truth_utf32 = truth_ch as u32;
                    if test_utf32 != truth_utf32 {
                        let formatted: &str = stackfmt::fmt_truncate(&mut panic_buf, format_args!(
                            "The truth string is different than the test vectors.\nTruth string:{}\nTest vector at index {} has code value {:#08x} vs truth {:#08x}",
                            truth, test_char_len, test_utf32, truth_utf32));
                        panic!("\n{}\n", formatted);
                    }
                }
                None => {
                    let formatted: &str = stackfmt::fmt_truncate(&mut panic_buf, format_args!(
                        "The truth string is shorter than the combined test vectors.\nTruth string:{}\nTest vector at index {} has code value {:#08x}",
                        truth, test_char_len, test_utf32));
                    panic!("\n{}\n", formatted);
                }
            }
            test_char_len += 1;
        }
        if par.is_last_buffer() {
            if let Some(truth_ch) = truth_iter.next() {
                // The leftover truth character sits at 0-based index
                // `test_char_len`, matching the indexing of the other messages.
                let formatted: &str = stackfmt::fmt_truncate(&mut panic_buf, format_args!(
                    "The truth string is longer than the test vectors (length {}).\nTruth string:{}\nTruth at index {} has code value {:#08x}",
                    test_char_len, truth, test_char_len, (truth_ch as u32)));
                panic!("\n{}\n", formatted);
            }
        }
    }
}
/// Checks the `utf8_ref_to_char_with_iter` decoding path against a reference string.
///
/// The parser is reset, then the four buffers are fed to it in order
/// (`b1`, `b2`, `b3`, `b4`) as iterators over byte references; only `b4` is
/// flagged as the last buffer. Every decoded character is compared with the
/// next character of `truth`.
///
/// # Panics
///
/// Panics with a diagnostic formatted by `stackfmt::fmt_truncate` into a
/// 12000-byte stack buffer when:
/// * a decoded character differs from the corresponding character of `truth`,
/// * the parser yields more characters than `truth` contains, or
/// * `truth` still has characters left once the last buffer is exhausted.
fn verify_style2(par: &mut FromUtf8, b1: &[u8], b2: &[u8], b3: &[u8], b4: &[u8], truth: &str) {
    let mut panic_buf = [0u8; 12000];
    // Characters decoded so far; this is also the 0-based index of the next
    // character expected from `truth`.
    let mut test_char_len: usize = 0;
    let mut truth_iter = truth.chars();
    par.reset_parser();
    par.set_is_last_buffer(false);
    let buffers: [&[u8]; 4] = [b1, b2, b3, b4];
    for (stage, buffer) in buffers.iter().copied().enumerate() {
        if stage + 1 == buffers.len() {
            // The parser must know that nothing follows the fourth buffer.
            par.set_is_last_buffer(true);
        }
        let mut byte_ref_iter = buffer.iter();
        for test_ch in par.utf8_ref_to_char_with_iter(&mut byte_ref_iter) {
            match truth_iter.next() {
                Some(truth_ch) => {
                    if test_ch != truth_ch {
                        let formatted: &str = stackfmt::fmt_truncate(&mut panic_buf, format_args!(
                            "The truth string is different than the test vectors.\nTruth string:{}\nTest vector at index {} has code value {:#08x} vs truth {:#08x}",
                            truth, test_char_len, (test_ch as u32), (truth_ch as u32)));
                        panic!("\n{}\n", formatted);
                    }
                }
                None => {
                    let formatted: &str = stackfmt::fmt_truncate(&mut panic_buf, format_args!(
                        "The truth string is shorter than the combined test vectors.\nTruth string:{}\nTest vector at index {} has code value {:#08x}",
                        truth, test_char_len, (test_ch as u32)));
                    panic!("\n{}\n", formatted);
                }
            }
            test_char_len += 1;
        }
        if par.is_last_buffer() {
            if let Some(truth_ch) = truth_iter.next() {
                // The leftover truth character sits at 0-based index
                // `test_char_len`, matching the indexing of the other messages.
                let formatted: &str = stackfmt::fmt_truncate(&mut panic_buf, format_args!(
                    "The truth string is longer than the test vectors (length {}).\nTruth string:{}\nTruth at index {} has code value {:#08x}",
                    test_char_len, truth, test_char_len, (truth_ch as u32)));
                panic!("\n{}\n", formatted);
            }
        }
    }
}
/// Checks the slice-based `utf8_to_char` decoding path against a reference string.
///
/// The parser is reset, then the four buffers are processed in order
/// (`b1`, `b2`, `b3`, `b4`); only `b4` is flagged as the last buffer. For each
/// buffer `utf8_to_char` is called repeatedly on the remaining slice, and every
/// decoded character is compared with the next character of `truth`.
///
/// When the parser answers `MoreEnum::More(0)` the function treats the input as
/// finished: it verifies that `truth` is exhausted and returns. Any other
/// `More` value moves on to the next buffer.
/// NOTE(review): the exact meaning of the `More` payload is defined by
/// `utf8conv`; confirm against that crate's documentation.
///
/// # Panics
///
/// Panics with a diagnostic formatted by `stackfmt::fmt_truncate` into a
/// 12000-byte stack buffer when:
/// * a decoded character differs from the corresponding character of `truth`,
/// * the parser yields more characters than `truth` contains, or
/// * `truth` still has characters left when the parser reports `More(0)`.
fn verify_with_string(par: &mut FromUtf8, b1: &[u8], b2: &[u8], b3: &[u8], b4: &[u8], truth: &str) {
    let mut panic_buf = [0u8; 12000];
    // Characters decoded so far; this is also the 0-based index of the next
    // character expected from `truth`.
    let mut test_char_len: usize = 0;
    let mut truth_iter = truth.chars();
    par.reset_parser();
    par.set_is_last_buffer(false);
    let buffers: [&[u8]; 4] = [b1, b2, b3, b4];
    for (stage, buffer) in buffers.iter().copied().enumerate() {
        if stage + 1 == buffers.len() {
            // The parser must know that nothing follows the fourth buffer.
            par.set_is_last_buffer(true);
        }
        let mut the_slice: &[u8] = buffer;
        loop {
            match par.utf8_to_char(the_slice) {
                Result::Ok((slice_pos, test_ch)) => {
                    // Continue from wherever the parser stopped reading.
                    the_slice = slice_pos;
                    match truth_iter.next() {
                        Some(truth_ch) => {
                            if test_ch != truth_ch {
                                let formatted: &str = stackfmt::fmt_truncate(&mut panic_buf, format_args!(
                                    "The truth string is different than the test vectors.\nTruth string:{}\nTest vector at index {} has code value {:#08x} vs truth {:#08x}",
                                    truth, test_char_len, (test_ch as u32), (truth_ch as u32)));
                                panic!("\n{}\n", formatted);
                            }
                        }
                        None => {
                            let formatted: &str = stackfmt::fmt_truncate(&mut panic_buf, format_args!(
                                "The truth string is shorter than the combined test vectors.\nTruth string:{}\nTest vector at index {} has code value {:#08x}",
                                truth, test_char_len, (test_ch as u32)));
                            panic!("\n{}\n", formatted);
                        }
                    }
                    test_char_len += 1;
                }
                Result::Err(MoreEnum::More(needed)) => {
                    if needed != 0 {
                        // This buffer is used up; go on to the next one.
                        break;
                    }
                    if let Some(truth_ch) = truth_iter.next() {
                        // The leftover truth character sits at 0-based index
                        // `test_char_len`, matching the other messages.
                        let formatted: &str = stackfmt::fmt_truncate(&mut panic_buf, format_args!(
                            "The truth string is longer than the test vectors (length {}).\nTruth string:{}\nTruth at index {} has code value {:#08x}",
                            test_char_len, truth, test_char_len, (truth_ch as u32)));
                        panic!("\n{}\n", formatted);
                    }
                    return;
                }
            }
        }
    }
}
/// Hand-written decoding cases for `FromUtf8`, each split over four input buffers.
///
/// Every case prints a label, runs `verify_with_string` with four byte buffers
/// and the expected decoded string, and then asserts whether the parser reports
/// an invalid sequence. All cases share one parser instance, so the order of the
/// statements (including the explicit `reset_parser()` calls) is significant.
#[test]
fn test_utf8parsing_aaa() {
let mut par:FromUtf8 = FromUtf8::new();
// Cases 1-5: well-formed input only, so no invalid sequence may be reported.
println!("case 1: all empty");
let t1 = "";
verify_with_string(&mut par, "".as_bytes(), "".as_bytes(), "".as_bytes(), "".as_bytes(), &t1);
assert!(!par.has_invalid_sequence());
println!("case 2, different length ASCII");
let t1 = "abcdef\x7f\t\r\n";
verify_with_string(&mut par, "a".as_bytes(), "bc".as_bytes(), "def".as_bytes(), "\x7f\t\r\n".as_bytes(), &t1);
assert!(!par.has_invalid_sequence());
println!("case 3: multi-language");
let t1 = "寒い,감기,frío,студен";
verify_with_string(&mut par, "寒い,".as_bytes(), "감기,".as_bytes(), "frío,".as_bytes(), "студен".as_bytes(), &t1);
assert!(!par.has_invalid_sequence());
println!("case 4: emoji and symbols");
let t1 = "😀🐔🐣🇧🇷🇨🇦元∰⇲";
verify_with_string(&mut par, "😀".as_bytes(), "🐔🐣".as_bytes(), "🇧🇷🇨🇦".as_bytes(), "元∰⇲".as_bytes(), &t1);
assert!(!par.has_invalid_sequence());
println!("case 5: long text");
let t1 = "The red fox jumped over the white fence in a stormy morning with seven chasing servants";
verify_with_string(&mut par, "The red fox jumped over the white fence in a stormy morning with seven chasing servants".as_bytes(),
"".as_bytes(), "".as_bytes(), "".as_bytes(), &t1);
assert!(!par.has_invalid_sequence());
par.reset_parser();
// Case 6: two valid multi-byte sequences, each cut at a buffer boundary.
println!("case 6: decode across buffer boundaries: ED/9F-bf, C2 / 80");
let t1 = "\u{D7FF}\u{80}";
verify_with_string(&mut par, & [0xEDu8], & [0x9Fu8, 0xbfu8], & [0xC2u8], & [0x80u8], &t1);
assert!(!par.has_invalid_sequence());
// Cases 7-10: malformed input; U+FFFD is expected in place of the bad bytes.
println!("case 7: long decode error followed by 2 byte decode");
let t1 = "\u{FFFD}\u{FFFD}\u{7FF}";
verify_with_string(&mut par, &[0xF0u8], "".as_bytes(), & [0x85u8], &[0xDFu8, 0xBFu8], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// F4 8F 80 is cut short by the ASCII byte 0x7f, so a single U+FFFD is expected.
println!("case 8: decode error in last byte, then an ASCII");
let t1 = "\u{FFFD}\u{7f}?";
verify_with_string(&mut par, & [0xF4u8], & [0x8Fu8], & [0x80u8, 0x7fu8], & [0x3fu8], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// Expected output has one U+FFFD per input byte of the overlong form.
println!("case 9: overlong encoding of the euro sign");
let t1 = "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}";
verify_with_string(&mut par, & [0xF0u8], & [0x82u8], & [0x82u8], & [0xACu8], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// 245..=255 are the bytes 0xF5..=0xFF; eleven bytes, eleven U+FFFD expected.
println!("case 10: invalid bytes from F5 to FF");
let t1 = "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}";
verify_with_string(&mut par, & [245u8,246u8,247u8,248u8,249u8,250u8,251u8,252u8,253u8,254u8,255u8], & [], & [], & [], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// Case 11: noncharacter code points are expected to pass through unchanged.
println!("case 11: accept some non-characters");
let t1 = "\u{FFFE}\u{FFFF}\u{1FFFF}\u{2FFFE}\u{2FFFF}";
verify_with_string(&mut par, "\u{FFFE}\u{FFFF}\u{1FFFF}\u{2FFFE}\u{2FFFF}".as_bytes(), & [], & [], & [], &t1);
assert!(! par.has_invalid_sequence());
par.reset_parser();
// NOTE(review): `\u{..}` escapes are hexadecimal, so the string below holds
// U+0000, U+0016, U+0032, ... U+0160 rather than the decimal values 0, 16, 32,
// ... suggested by the label - confirm which was intended.
println!("case 12: unicode 0, 16, 32, 48 ...");
let t1 = "\u{0}\u{16}\u{32}\u{48}\u{64}\u{80}\u{96}\u{112}\u{128}\u{144}\u{160}";
verify_with_string(&mut par, "\u{0}\u{16}\u{32}\u{48}\u{64}\u{80}\u{96}\u{112}\u{128}\u{144}\u{160}".as_bytes(), & [], & [], & [], &t1);
assert!(! par.has_invalid_sequence());
par.reset_parser();
// Case 13: two lone lead bytes between ASCII characters; one U+FFFD each.
println!("case 13: < D0 D0 >");
let t1 = "<\u{FFFD}\u{FFFD}>";
verify_with_string(&mut par, "<".as_bytes(), & [0xD0u8], & [0xD0u8], ">".as_bytes(), &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// Case 14: E1 A0 is interrupted by C0; the final byte 0x5c is a backslash.
println!("case 14: E1 A0 C0");
let t1 = "\u{FFFD}\u{FFFD}\\";
verify_with_string(&mut par, & [0xE1u8], & [0xA0u8], & [], & [0xC0, 0x5c], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// Case 15: nine input bytes in total and nine U+FFFD expected.
println!("case 15: over long null characters");
let t1 = "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}";
verify_with_string(&mut par, & [0xE0u8,128u8,128u8], & [0xF0,128u8,128u8,128u8], & [0xC0u8,128u8], & [], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// Case 16: the four bytes F0 90 80 80, one byte per buffer.
println!("case 16: +U10000");
let t1 = "\u{10000}";
verify_with_string(&mut par, & [0b1111_0000u8], & [0b1001_0000u8], & [0b1000_0000u8], & [0b1000_0000u8], &t1);
assert!(! par.has_invalid_sequence());
par.reset_parser();
// Case 17: a lone F0 between two double quotes (byte 34), then a newline.
println!("case 17: double quote, F0, double quote, NL");
let t1 = "\"\u{FFFD}\"\n";
verify_with_string(&mut par, & [34u8], & [0xF0u8], & [34u8], & [10u8], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// Cases 18-19: encoded surrogate code points; three U+FFFD expected for each.
println!("case 18: +UD800");
let t1 = "\u{FFFD}\u{FFFD}\u{FFFD}\n";
verify_with_string(&mut par, & [0xEDu8], & [0xA0u8], & [0x80u8], & [10u8], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
println!("case 19: +UDFFF");
let t1 = "\u{FFFD}\u{FFFD}\u{FFFD}\r";
verify_with_string(&mut par, & [0xEDu8], & [0xbfu8], & [0xbfu8], & [13u8], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// Case 20: a stray continuation byte between 'G' (71) and 'R' (82).
println!("case 20: 0x80");
let t1 = "G\u{FFFD}R\r";
verify_with_string(&mut par, & [71u8], & [0x80u8], & [82u8], & [13u8], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// Case 21: the bytes C1 and C0 each yield one U+FFFD.
println!("case 21: 0xC0, 0xC1");
let t1 = "G\u{FFFD}\u{FFFD}\n";
verify_with_string(&mut par, & [71u8], & [0xC1u8], & [0xC0u8], & [10u8], &t1);
assert!(par.has_invalid_sequence());
par.reset_parser();
// Case 22: F5 80 80 80 would encode a value above U+10FFFF; four U+FFFD expected.
println!("case 22: U+110000");
let t1 = "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\n";
verify_with_string(&mut par, & [0xF5u8], & [0x80u8, 0x80u8], & [0x80u8], & [10u8], &t1);
assert!(par.has_invalid_sequence());
}
/// Cuts `byte_slice` into four consecutive fragments of pseudo-random length.
///
/// The first three lengths are each drawn as `rng.next_u32() % bound`, where
/// `bound` is a quarter of the input length (at least 1), so together they never
/// exceed the input. The fourth fragment is whatever remains. Concatenating the
/// four returned slices reproduces `byte_slice`.
fn four_random_slice<'a>(byte_slice: &'a [u8], rng: &mut SmallRng) -> (&'a [u8], &'a [u8], &'a [u8], &'a [u8]) {
    // A bound of zero would make the modulo below panic, hence the floor of 1.
    let bound: usize = (byte_slice.len() / 4).max(1);
    let mut draw_len = || (rng.next_u32() as usize) % bound;
    // Draw all three lengths up front so the generator is advanced in a fixed order.
    let first_len = draw_len();
    let second_len = draw_len();
    let third_len = draw_len();
    let (first, rest) = byte_slice.split_at(first_len);
    let (second, rest) = rest.split_at(second_len);
    let (third, fourth) = rest.split_at(third_len);
    (first, second, third, fourth)
}
/// Overwrites randomly chosen bytes of `byte_slice` with random values.
///
/// For every byte position one `next_u32()` value is drawn; when that value is
/// divisible by 10 the byte is refilled through `fill_bytes`.
fn spread_noise(byte_slice: &mut [u8], rng: &mut SmallRng) {
    // One-byte chunks give `fill_bytes` exactly the single-byte target it needs.
    for single_byte in byte_slice.chunks_mut(1) {
        if rng.next_u32() % 10 == 0 {
            rng.fill_bytes(single_byte);
        }
    }
}
/// Encodes `char_slice` as UTF-8 into the front of `u8_slice`.
///
/// Returns the prefix of `u8_slice` that was written.
///
/// # Panics
///
/// Panics if `u8_slice` is too small to hold the encoded characters.
fn char_slice_to_u8_slice<'a>(char_slice: &[char], u8_slice: &'a mut [u8]) -> &'a mut [u8] {
    let mut written: usize = 0;
    for ch in char_slice.iter() {
        // Each character is encoded directly behind the previous one.
        written += ch.encode_utf8(&mut u8_slice[written..]).len();
    }
    &mut u8_slice[..written]
}
/// Copies all of `from_slice` into the beginning of `to_slice`.
///
/// Bytes of `to_slice` beyond `from_slice.len()` are left untouched.
///
/// # Panics
///
/// Panics if `to_slice` is shorter than `from_slice`; in that case nothing is
/// written.
fn copy_u8_slice_to_u8_slice(from_slice: &[u8], to_slice: &mut [u8]) {
    // `copy_from_slice` requires equal lengths, so narrow the destination first.
    to_slice[..from_slice.len()].copy_from_slice(from_slice);
}
/// Writes `REPLACE_PART1`, `REPLACE_PART2` and `REPLACE_PART3` into the first
/// three bytes of `to_slice`.
///
/// NOTE(review): these constants are presumably the UTF-8 encoding of U+FFFD;
/// confirm against `utf8conv`.
///
/// # Panics
///
/// Panics if `to_slice` holds fewer than three bytes.
fn copy_replacement_to_u8_slice(to_slice: &mut [u8]) {
    let replacement = [REPLACE_PART1, REPLACE_PART2, REPLACE_PART3];
    for (offset, part) in replacement.iter().enumerate() {
        to_slice[offset] = *part;
    }
}
/// Builds the reference decoding of a possibly malformed UTF-8 byte sequence.
///
/// `u8_slice` is scanned with `str::from_utf8`. Valid runs are copied verbatim
/// into `dest_slice`; every invalid sequence reported by the validator is
/// replaced by the three replacement bytes written by
/// `copy_replacement_to_u8_slice`. An incomplete sequence at the very end of the
/// input is replaced once and ends the scan.
///
/// Returns the written prefix of `dest_slice`.
///
/// # Panics
///
/// Panics if `dest_slice` is too small for the produced output.
fn validify_u8_buffer<'a>(u8_slice: &[u8], dest_slice: &'a mut [u8]) -> &'a mut [u8] {
    let mut pending = u8_slice;
    let mut written: usize = 0;
    while !pending.is_empty() {
        let problem = match str::from_utf8(pending) {
            Ok(valid_tail) => {
                // Everything that is left is valid: copy it and finish.
                let tail_len = valid_tail.len();
                copy_u8_slice_to_u8_slice(&pending[..tail_len], &mut dest_slice[written..]);
                written += tail_len;
                break;
            }
            Err(problem) => problem,
        };
        // Keep the valid bytes that precede the offending sequence.
        let good_len = problem.valid_up_to();
        copy_u8_slice_to_u8_slice(&pending[..good_len], &mut dest_slice[written..]);
        written += good_len;
        // Both kinds of error contribute exactly one replacement.
        copy_replacement_to_u8_slice(&mut dest_slice[written..]);
        written += 3;
        match problem.error_len() {
            // A definite invalid sequence: skip it and keep scanning.
            Some(bad_len) => pending = &pending[good_len + bad_len..],
            // The input ended in the middle of a sequence: nothing more to scan.
            None => break,
        }
    }
    &mut dest_slice[..written]
}
/// Fills every slot of `char_slice` with a pseudo-random character.
///
/// Each slot draws `rng.next_u32() % 0x111000`. Values that `char::from_u32`
/// rejects are stored as `char::REPLACEMENT_CHARACTER` instead.
fn make_random_string(char_slice: &mut [char], rng: &mut SmallRng) {
    for slot in char_slice.iter_mut() {
        let val: u32 = rng.next_u32() % 0x111000u32;
        *slot = match char::from_u32(val) {
            Some(ch) => ch,
            None => {
                // Sanity check: a rejected value must not lie in 0x10000..=0x10FFFF.
                assert!(!((val > 0xffffu32) && (val <= 0x10ffffu32)));
                char::REPLACEMENT_CHARACTER
            }
        };
    }
}
/// Randomised test of the slice-based decoding path (`verify_with_string`).
///
/// Each of the 40000 rounds builds 160 random characters, encodes them, corrupts
/// some bytes, splits the result into four fragments and compares the parser's
/// output with the reference decoding produced by `validify_u8_buffer`.
#[test]
fn test_utf8_monkey1() {
    let mut rng = SmallRng::seed_from_u64(0x17e4bd3a163c10e4u64);
    let mut par: FromUtf8 = FromUtf8::new();
    let mut char_buf = [char::REPLACEMENT_CHARACTER; 160];
    // Up to four bytes per character.
    let mut byte_buf = [0u8; 160 * 4];
    // Worst case: each of those bytes turns into a three-byte replacement.
    let mut byte_buf2 = [0u8; 160 * 12];
    for _round in 0..40000 {
        make_random_string(&mut char_buf, &mut rng);
        let noisy: &mut [u8] = char_slice_to_u8_slice(&char_buf, &mut byte_buf);
        spread_noise(noisy, &mut rng);
        let (part1, part2, part3, part4) = four_random_slice(noisy, &mut rng);
        let expected_bytes = validify_u8_buffer(noisy, &mut byte_buf2);
        if let Ok(expected) = str::from_utf8(expected_bytes) {
            verify_with_string(&mut par, part1, part2, part3, part4, expected);
        } else {
            panic!("Unexpected from_utf8() failure");
        }
    }
}
/// Randomised test of the byte-reference iterator decoding path (`verify_style2`).
///
/// Each of the 40000 rounds builds 160 random characters, encodes them, corrupts
/// some bytes, splits the result into four fragments and compares the parser's
/// output with the reference decoding produced by `validify_u8_buffer`.
#[test]
fn test_utf8_monkey2() {
    let mut rng = SmallRng::seed_from_u64(0x37e47d3a163c62b7u64);
    let mut par: FromUtf8 = FromUtf8::new();
    let mut char_buf = [char::REPLACEMENT_CHARACTER; 160];
    // Up to four bytes per character.
    let mut byte_buf = [0u8; 160 * 4];
    // Worst case: each of those bytes turns into a three-byte replacement.
    let mut byte_buf2 = [0u8; 160 * 12];
    for _round in 0..40000 {
        make_random_string(&mut char_buf, &mut rng);
        let noisy: &mut [u8] = char_slice_to_u8_slice(&char_buf, &mut byte_buf);
        spread_noise(noisy, &mut rng);
        let (part1, part2, part3, part4) = four_random_slice(noisy, &mut rng);
        let expected_bytes = validify_u8_buffer(noisy, &mut byte_buf2);
        if let Ok(expected) = str::from_utf8(expected_bytes) {
            verify_style2(&mut par, part1, part2, part3, part4, expected);
        } else {
            panic!("Unexpected from_utf8() failure");
        }
    }
}
/// Randomised test of the byte-value iterator decoding path (`verify_style3`).
///
/// Each of the 40000 rounds builds 160 random characters, encodes them, corrupts
/// some bytes, splits the result into four fragments and compares the parser's
/// output with the reference decoding produced by `validify_u8_buffer`.
#[test]
fn test_utf8_monkey3() {
    let mut rng = SmallRng::seed_from_u64(0x87e17f3a9c3a1a07u64);
    let mut par: FromUtf8 = FromUtf8::new();
    let mut char_buf = [char::REPLACEMENT_CHARACTER; 160];
    // Up to four bytes per character.
    let mut byte_buf = [0u8; 160 * 4];
    // Worst case: each of those bytes turns into a three-byte replacement.
    let mut byte_buf2 = [0u8; 160 * 12];
    for _round in 0..40000 {
        make_random_string(&mut char_buf, &mut rng);
        let noisy: &mut [u8] = char_slice_to_u8_slice(&char_buf, &mut byte_buf);
        spread_noise(noisy, &mut rng);
        let (part1, part2, part3, part4) = four_random_slice(noisy, &mut rng);
        let expected_bytes = validify_u8_buffer(noisy, &mut byte_buf2);
        if let Ok(expected) = str::from_utf8(expected_bytes) {
            verify_style3(&mut par, part1, part2, part3, part4, expected);
        } else {
            panic!("Unexpected from_utf8() failure");
        }
    }
}