use std::ops::{Range, RangeInclusive};
use crate::lex_iter_err;
use super::lexer_error::LexerIterResult;
/// Cursor that walks the UTF-8 bytes of a source string one character at a time.
///
/// The type is `Copy`, so a value can be saved and later handed back to the
/// iterator to rewind it. `Debug` is derived so the cursor state can be printed
/// in assertions and diagnostics.
#[derive(Clone, Copy, Debug)]
pub struct CharIterator<'c> {
    /// Raw bytes of the text being iterated.
    chars: &'c [u8],
    /// Number of characters consumed so far.
    utf_index: usize,
    /// Byte offset of the next character to decode.
    real_index: usize,
    /// Value produced by the most recent successful advancing read
    /// (`None` before the first read and after the end of input is reached).
    cur_char: Option<char>,
    /// Byte offset at which the most recently consumed character starts.
    cur_real_index: usize,
}
impl<'c> CharIterator<'c> {
    /// Creates an iterator positioned before the first character of `chars`.
    pub fn new(chars: &'c str) -> Self {
        Self {
            chars: chars.as_bytes(),
            utf_index: 0,
            real_index: 0,
            cur_char: None,
            cur_real_index: 0,
        }
    }

    /// Returns the value produced by the most recent successful [`next`](Self::next)
    /// call, or `None` if `next` has not been called yet.
    pub fn current(&self) -> Option<char> {
        self.cur_char
    }

    /// Returns the source text covered by the half-open byte range `r`.
    ///
    /// # Panics
    ///
    /// Panics if `r` is out of bounds or if either end does not fall on a
    /// UTF-8 character boundary.
    pub fn slice_to_string(&self, r: Range<usize>) -> &str {
        // Going through `str` indexing (instead of re-interpreting an arbitrary
        // byte sub-slice) guarantees the result is always valid UTF-8.
        &self.source()[r]
    }

    /// Returns the source text covered by the inclusive byte range `r`.
    ///
    /// # Panics
    ///
    /// Panics if `r` is out of bounds or if either end does not fall on a
    /// UTF-8 character boundary.
    pub fn slice_to_string_inc(&self, r: RangeInclusive<usize>) -> &str {
        &self.source()[r]
    }

    /// Returns a copy of the iterator that can later be passed to
    /// [`restore`](Self::restore).
    pub fn snapshot(&self) -> CharIterator<'c> {
        *self
    }

    /// Rewinds the cursor to the position recorded in `snap`.
    ///
    /// Only the position fields are copied; the underlying byte slice of
    /// `self` is left as it is.
    pub fn restore(&mut self, snap: Self) {
        self.utf_index = snap.utf_index;
        self.real_index = snap.real_index;
        self.cur_char = snap.cur_char;
        self.cur_real_index = snap.cur_real_index;
    }

    /// Decodes the character at the cursor, advances past it and remembers it
    /// as the current character.
    ///
    /// Returns `Ok(None)` once the end of the input has been reached.
    ///
    /// # Errors
    ///
    /// Reports an error through `lex_iter_err!` when the bytes at the cursor
    /// are not a well-formed UTF-8 sequence; the cursor is not moved then.
    pub fn next(&mut self) -> LexerIterResult<Option<char>> {
        let ch = self.extract_char(false)?;
        self.cur_char = ch;
        Ok(ch)
    }

    /// Decodes the character at the cursor without consuming it.
    ///
    /// Returns `Ok(None)` at the end of the input. No field of the iterator is
    /// modified; `&mut self` is kept only so the signature stays unchanged.
    ///
    /// # Errors
    ///
    /// Same conditions as [`next`](Self::next).
    pub fn peek(&mut self) -> LexerIterResult<Option<char>> {
        self.extract_char(true)
    }

    /// Returns the number of characters consumed so far.
    pub fn get_index(&self) -> usize {
        self.utf_index
    }

    /// Returns the byte offset at which the most recently consumed character
    /// starts (`0` before anything has been consumed).
    pub fn get_byte_index(&self) -> usize {
        self.cur_real_index
    }

    /// Returns the byte offset of the next character to be decoded.
    pub fn get_read_index(&self) -> usize {
        self.real_index
    }

    /// Decodes the character at the cursor and, unless `peek` is set, moves
    /// the cursor past it.
    fn extract_char(&mut self, peek: bool) -> LexerIterResult<Option<char>> {
        let (width, ch) = match self.decode_at_cursor()? {
            Some(decoded) => decoded,
            None => return Ok(None),
        };
        if !peek {
            // Remember where the consumed character started before advancing.
            self.cur_real_index = self.real_index;
            self.utf_index += 1;
            self.real_index += width;
        }
        Ok(Some(ch))
    }

    /// Views the whole input as text again.
    fn source(&self) -> &'c str {
        // SAFETY: `chars` is only ever assigned in `new`, from `str::as_bytes`,
        // and is never modified afterwards, so it is valid UTF-8.
        unsafe { std::str::from_utf8_unchecked(self.chars) }
    }

    /// Checks that the byte at `offset` is a UTF-8 continuation byte
    /// (`0b10xx_xxxx`). `label` is the sequence tag used in the error message.
    ///
    /// The caller must have verified that `offset` is in bounds.
    fn expect_continuation(&self, label: &str, offset: usize) -> LexerIterResult<()> {
        let masked = self.chars[offset] & 0b1100_0000;
        if masked != 0b1000_0000 {
            lex_iter_err!(self.real_index, "{} incorrect masked value offset: '{}', mask: 0xC0, \
                res: '{:X}' != 0x80", label, offset, masked);
        }
        Ok(())
    }

    /// Validates and decodes the UTF-8 sequence starting at the cursor.
    ///
    /// Returns the sequence length in bytes together with the decoded
    /// character, or `None` when the cursor is at the end of the input.
    fn decode_at_cursor(&self) -> LexerIterResult<Option<(usize, char)>> {
        let start = self.real_index;
        if start >= self.chars.len() {
            return Ok(None);
        }
        let first = self.chars[start];
        let (width, code_point) = if (first & 0x80) == 0x00 {
            // Single byte: plain ASCII.
            (1usize, first as u32)
        } else if (first & 0xF8) == 0xF0 {
            // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
            if self.chars.len() < start + 4 {
                lex_iter_err!(self.real_index, "UTF(32) expected 4 bytes to decode UTF-8 (0xF8), \
                    self.chars.len: '{}' < start+4 '{}'", self.chars.len(), start + 4);
            }
            let second = self.chars[start + 1];
            // The second byte range depends on the lead byte so that overlong
            // encodings and values above U+10FFFF are rejected.
            match (first, second) {
                (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
                _ => {
                    lex_iter_err!(self.real_index, "UTF(32) incorrect sequence, offset: '{}' offset+1: '{}', \
                        seq: '{:X}' '{:X}'", start, start + 1, first, second);
                }
            }
            self.expect_continuation("UTF(32)", start + 2)?;
            self.expect_continuation("UTF(32)", start + 3)?;
            let value = (((first & 0b0000_0111) as u32) << 18)
                | (((second & 0b0011_1111) as u32) << 12)
                | (((self.chars[start + 2] & 0b0011_1111) as u32) << 6)
                | ((self.chars[start + 3] & 0b0011_1111) as u32);
            (4usize, value)
        } else if (first & 0xF0) == 0xE0 {
            // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
            if self.chars.len() < start + 3 {
                lex_iter_err!(self.real_index, "UTF(24) expected 3 bytes to decode UTF-8 (0xF0), \
                    self.chars.len: '{}' < start+3 '{}'", self.chars.len(), start + 3);
            }
            let second = self.chars[start + 1];
            // Rejects overlong encodings (E0 80..9F) and surrogates (ED A0..BF).
            match (first, second) {
                (0xE0, 0xA0..=0xBF)
                | (0xE1..=0xEC, 0x80..=0xBF)
                | (0xED, 0x80..=0x9F)
                | (0xEE..=0xEF, 0x80..=0xBF) => {}
                _ => {
                    lex_iter_err!(self.real_index, "UTF(24) incorrect sequence, offset: '{}' offset+1: '{}', \
                        seq: '{:X}' '{:X}'", start, start + 1, first, second);
                }
            }
            self.expect_continuation("UTF(24)", start + 2)?;
            let value = (((first & 0b0000_1111) as u32) << 12)
                | (((second & 0b0011_1111) as u32) << 6)
                | ((self.chars[start + 2] & 0b0011_1111) as u32);
            (3usize, value)
        } else if (first & 0xE0) == 0xC0 {
            // Two-byte sequence: 110xxxxx 10xxxxxx.
            if self.chars.len() < start + 2 {
                lex_iter_err!(self.real_index, "UTF(16) expected 2 bytes to decode UTF-8 (0xE0), \
                    self.chars.len: '{}' < start+2 '{}'", self.chars.len(), start + 2);
            }
            // 0xC0 and 0xC1 could only start overlong encodings of ASCII.
            if !(0xC2..=0xDF).contains(&first) {
                lex_iter_err!(self.real_index, "UTF(16) incorrect sequence, offset: '{}', \
                    seq: '{:X}'", start, first);
            }
            self.expect_continuation("UTF(16)", start + 1)?;
            let value = (((first & 0b0001_1111) as u32) << 6)
                | ((self.chars[start + 1] & 0b0011_1111) as u32);
            (2usize, value)
        } else {
            // A stray continuation byte or a lead byte of 0xF8 and above.
            lex_iter_err!(self.real_index, "UTF(?) unknown UTF-8 value ({})", first);
        };
        // SAFETY: every branch above accepts only well-formed UTF-8: overlong
        // forms, the surrogate range U+D800..=U+DFFF and values above U+10FFFF
        // are rejected, so `code_point` is a valid Unicode scalar value.
        let ch = unsafe { char::from_u32_unchecked(code_point) };
        Ok(Some((width, ch)))
    }
}
/// Walks a string mixing two-byte Cyrillic letters with ASCII and checks the
/// decoded character, the read offset and the character count after every step.
#[test]
fn test() {
    fn scenario() -> LexerIterResult<()> {
        let mut it = CharIterator::new("УВ UTF-8");
        // (advance?, expected char, expected read offset, expected char count)
        let steps: &[(bool, char, usize, usize)] = &[
            (false, 'У', 0, 0),
            (false, 'У', 0, 0),
            (true, 'У', 2, 1),
            (true, 'В', 4, 2),
            (false, ' ', 4, 2),
            (true, ' ', 5, 3),
            (true, 'U', 6, 4),
            (true, 'T', 7, 5),
            (true, 'F', 8, 6),
            (true, '-', 9, 7),
            (false, '8', 9, 7),
            (true, '8', 10, 8),
        ];
        for &(advance, expected, read_offset, char_count) in steps {
            let got = if advance { it.next()? } else { it.peek()? };
            assert_eq!(got, Some(expected));
            assert_eq!(it.get_read_index(), read_offset);
            assert_eq!(it.get_index(), char_count);
        }
        // Past the end both operations report exhaustion.
        assert_eq!(it.next()?, None);
        assert_eq!(it.peek()?, None);
        Ok(())
    }

    let outcome = scenario();
    assert_eq!(outcome.is_ok(), true, "error: '{}'", outcome.err().unwrap());
}
/// Walks a string made only of three-byte characters and checks the decoded
/// character, the read offset and the character count after every step.
#[test]
fn test2() {
    fn scenario() -> LexerIterResult<()> {
        let mut it = CharIterator::new("€€€");
        // (advance?, expected char, expected read offset, expected char count)
        let steps: &[(bool, char, usize, usize)] = &[
            (false, '€', 0, 0),
            (false, '€', 0, 0),
            (true, '€', 3, 1),
            (true, '€', 6, 2),
            (true, '€', 9, 3),
        ];
        for &(advance, expected, read_offset, char_count) in steps {
            let got = if advance { it.next()? } else { it.peek()? };
            assert_eq!(got, Some(expected));
            assert_eq!(it.get_read_index(), read_offset);
            assert_eq!(it.get_index(), char_count);
        }
        // Past the end both operations report exhaustion.
        assert_eq!(it.next()?, None);
        assert_eq!(it.peek()?, None);
        Ok(())
    }

    let outcome = scenario();
    assert_eq!(outcome.is_ok(), true, "error: '{}'", outcome.err().unwrap());
}
/// Walks a string of four-byte characters separated by one ASCII space and
/// checks the decoded character, the read offset and the character count
/// after every step.
#[test]
fn test3() {
    fn scenario() -> LexerIterResult<()> {
        let mut it = CharIterator::new("𐍈𐍈𐍈 𐍈");
        // (advance?, expected char, expected read offset, expected char count)
        let steps: &[(bool, char, usize, usize)] = &[
            (false, '𐍈', 0, 0),
            (false, '𐍈', 0, 0),
            (true, '𐍈', 4, 1),
            (true, '𐍈', 8, 2),
            (true, '𐍈', 12, 3),
            (true, ' ', 13, 4),
            (true, '𐍈', 17, 5),
        ];
        for &(advance, expected, read_offset, char_count) in steps {
            let got = if advance { it.next()? } else { it.peek()? };
            assert_eq!(got, Some(expected));
            assert_eq!(it.get_read_index(), read_offset);
            assert_eq!(it.get_index(), char_count);
        }
        // Past the end both operations report exhaustion.
        assert_eq!(it.next()?, None);
        assert_eq!(it.peek()?, None);
        Ok(())
    }

    let outcome = scenario();
    assert_eq!(outcome.is_ok(), true, "error: '{}'", outcome.err().unwrap());
}
/// Walks a string mixing four-, three-, one- and two-byte characters and
/// checks the decoded character, the read offset and the character count
/// after every step.
#[test]
fn test4() {
    fn scenario() -> LexerIterResult<()> {
        let mut it = CharIterator::new("𐍈€ Ю");
        // (advance?, expected char, expected read offset, expected char count)
        let steps: &[(bool, char, usize, usize)] = &[
            (true, '𐍈', 4, 1),
            (false, '€', 4, 1),
            (true, '€', 7, 2),
            (true, ' ', 8, 3),
            (true, 'Ю', 10, 4),
        ];
        for &(advance, expected, read_offset, char_count) in steps {
            let got = if advance { it.next()? } else { it.peek()? };
            assert_eq!(got, Some(expected));
            assert_eq!(it.get_read_index(), read_offset);
            assert_eq!(it.get_index(), char_count);
        }
        // Past the end both operations report exhaustion.
        assert_eq!(it.next()?, None);
        assert_eq!(it.peek()?, None);
        Ok(())
    }

    let outcome = scenario();
    assert_eq!(outcome.is_ok(), true, "error: '{}'", outcome.err().unwrap());
}