/// A lightweight, copyable position marker used by [`StrDrive`] implementations.
///
/// The cursor does not borrow the string it walks over; it only stores a raw
/// pointer, so it is the caller's job to keep the underlying buffer alive
/// while the cursor is in use.
#[derive(Debug, Clone, Copy)]
pub struct StringCursor {
/// Raw pointer to the cursor's current location in the underlying bytes.
/// It is null in the `Default` value, i.e. before the cursor is attached.
pub(crate) ptr: *const u8,
/// Logical index of the cursor: a byte index for the `&[u8]` driver and a
/// code-point index for the `&str` driver.
pub position: usize,
}
impl Default for StringCursor {
fn default() -> Self {
Self {
ptr: std::ptr::null(),
position: 0,
}
}
}
/// Uniform, cursor-based access to a subject string.
///
/// This file implements it for `&[u8]` (one character per byte) and for
/// `&str` (one character per UTF-8 code point). Characters are always
/// reported as `u32` values.
///
/// The functions that take only a cursor (no `self`) work purely on the raw
/// pointer stored in the cursor and perform no bounds checks; the caller must
/// make sure the cursor stays inside the string it was created from.
pub trait StrDrive: Copy {
/// Returns the length of the string in logical characters.
fn count(&self) -> usize;
/// Creates a cursor located at logical index `n`.
fn create_cursor(&self, n: usize) -> StringCursor;
/// Repositions an existing `cursor` so that it is located at logical index `n`.
fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize);
/// Moves `cursor` forward by one character and returns a character value.
/// The `&str` implementation returns the code point that was stepped over.
fn advance(cursor: &mut StringCursor) -> u32;
/// Returns the character at the cursor without moving it.
fn peek(cursor: &StringCursor) -> u32;
/// Moves `cursor` forward by `n` characters.
fn skip(cursor: &mut StringCursor, n: usize);
/// Moves `cursor` backward by one character and returns the character it
/// stepped over.
fn back_advance(cursor: &mut StringCursor) -> u32;
/// Returns the character immediately before the cursor without moving it.
fn back_peek(cursor: &StringCursor) -> u32;
/// Moves `cursor` backward by `n` characters.
fn back_skip(cursor: &mut StringCursor, n: usize);
}
/// [`StrDrive`] for byte strings: every byte is one logical character, so all
/// cursor movement is plain pointer arithmetic.
///
/// The cursor-only functions dereference the raw pointer without bounds
/// checks. Callers must keep the cursor within the slice it was created from
/// and must not read at the end (`peek`/`advance`) or before the start
/// (`back_peek`/`back_advance`).
impl StrDrive for &[u8] {
    /// Number of bytes in the slice.
    #[inline]
    fn count(&self) -> usize {
        self.len()
    }

    /// Creates a cursor at byte index `n`.
    ///
    /// # Panics
    /// Panics if `n > self.len()`.
    #[inline]
    fn create_cursor(&self, n: usize) -> StringCursor {
        StringCursor {
            ptr: self[n..].as_ptr(),
            position: n,
        }
    }

    /// Moves `cursor` to byte index `n`, regardless of where it was before.
    ///
    /// # Panics
    /// Panics if `n > self.len()`.
    #[inline]
    fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) {
        *cursor = self.create_cursor(n);
    }

    /// Consumes the byte under the cursor and returns it.
    ///
    /// The byte is read *before* the pointer is stepped. This matches the
    /// `&str` implementation (which returns the consumed code point) and
    /// avoids reading one past the end when the last byte is consumed.
    #[inline]
    fn advance(cursor: &mut StringCursor) -> u32 {
        // SAFETY: the caller guarantees the cursor is not at the end, so
        // `ptr` addresses a readable byte and `ptr + 1` is at most one past
        // the end of the slice.
        let byte = unsafe { *cursor.ptr };
        cursor.position += 1;
        cursor.ptr = unsafe { cursor.ptr.add(1) };
        u32::from(byte)
    }

    /// Returns the byte under the cursor without moving.
    #[inline]
    fn peek(cursor: &StringCursor) -> u32 {
        // SAFETY: the caller guarantees the cursor is not at the end.
        u32::from(unsafe { *cursor.ptr })
    }

    /// Moves the cursor forward by `n` bytes.
    #[inline]
    fn skip(cursor: &mut StringCursor, n: usize) {
        cursor.position += n;
        // SAFETY: the caller guarantees at least `n` bytes remain.
        cursor.ptr = unsafe { cursor.ptr.add(n) };
    }

    /// Steps back one byte and returns the byte that was stepped over.
    #[inline]
    fn back_advance(cursor: &mut StringCursor) -> u32 {
        cursor.position -= 1;
        // SAFETY: the caller guarantees the cursor is not at the start, so
        // `ptr - 1` addresses a readable byte of the slice.
        cursor.ptr = unsafe { cursor.ptr.sub(1) };
        u32::from(unsafe { *cursor.ptr })
    }

    /// Returns the byte just before the cursor without moving.
    #[inline]
    fn back_peek(cursor: &StringCursor) -> u32 {
        // SAFETY: the caller guarantees the cursor is not at the start.
        u32::from(unsafe { *cursor.ptr.sub(1) })
    }

    /// Moves the cursor backward by `n` bytes.
    #[inline]
    fn back_skip(cursor: &mut StringCursor, n: usize) {
        cursor.position -= n;
        // SAFETY: the caller guarantees at least `n` bytes precede the cursor.
        cursor.ptr = unsafe { cursor.ptr.sub(n) };
    }
}
/// [`StrDrive`] for UTF-8 text: one logical character is one code point, so
/// the cursor's `position` counts code points while its pointer moves by a
/// variable number of bytes.
///
/// The cursor-only functions decode through the raw pointer without bounds
/// checks; callers must keep the cursor inside the string it came from.
impl StrDrive for &str {
    /// Number of code points in the string (walks the whole string).
    #[inline]
    fn count(&self) -> usize {
        self.chars().count()
    }

    /// Creates a cursor at code-point index `n` by decoding `n` code points
    /// from the start of the string.
    #[inline]
    fn create_cursor(&self, n: usize) -> StringCursor {
        let mut from_start = StringCursor {
            ptr: self.as_ptr(),
            position: 0,
        };
        Self::skip(&mut from_start, n);
        from_start
    }

    /// Moves `cursor` to code-point index `n`.
    ///
    /// A detached (null) cursor, or one that is already past `n`, is rebuilt
    /// from the start of the string; otherwise the cursor walks forward.
    #[inline]
    fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) {
        if cursor.ptr.is_null() || n < cursor.position {
            *cursor = self.create_cursor(n);
            return;
        }
        let remaining = n - cursor.position;
        if remaining > 0 {
            Self::skip(cursor, remaining);
        }
    }

    /// Consumes the code point under the cursor and returns it.
    #[inline]
    fn advance(cursor: &mut StringCursor) -> u32 {
        cursor.position += 1;
        let ptr = &mut cursor.ptr;
        unsafe { next_code_point(ptr) }
    }

    /// Decodes the code point under the cursor without moving the cursor.
    #[inline]
    fn peek(cursor: &StringCursor) -> u32 {
        // Decode through a scratch copy so the cursor itself stays put.
        let mut scratch = cursor.ptr;
        unsafe { next_code_point(&mut scratch) }
    }

    /// Moves the cursor forward by `n` code points.
    #[inline]
    fn skip(cursor: &mut StringCursor, n: usize) {
        cursor.position += n;
        let mut remaining = n;
        while remaining > 0 {
            unsafe { next_code_point(&mut cursor.ptr) };
            remaining -= 1;
        }
    }

    /// Steps back one code point and returns the code point stepped over.
    #[inline]
    fn back_advance(cursor: &mut StringCursor) -> u32 {
        cursor.position -= 1;
        let ptr = &mut cursor.ptr;
        unsafe { next_code_point_reverse(ptr) }
    }

    /// Decodes the code point just before the cursor without moving it.
    #[inline]
    fn back_peek(cursor: &StringCursor) -> u32 {
        let mut scratch = cursor.ptr;
        unsafe { next_code_point_reverse(&mut scratch) }
    }

    /// Moves the cursor backward by `n` code points.
    #[inline]
    fn back_skip(cursor: &mut StringCursor, n: usize) {
        cursor.position -= n;
        let mut remaining = n;
        while remaining > 0 {
            unsafe { next_code_point_reverse(&mut cursor.ptr) };
            remaining -= 1;
        }
    }
}
/// Decodes one UTF-8 code point starting at `*ptr` and advances `*ptr` past it.
///
/// The sequence length is chosen from the lead byte alone (`< 0x80` one byte,
/// `< 0xE0` two, `< 0xF0` three, otherwise four); continuation bytes are not
/// validated.
///
/// # Safety
/// `*ptr` must point at the first byte of a code point inside a buffer that
/// contains the complete encoded sequence; no bounds checks are performed.
#[inline]
unsafe fn next_code_point(ptr: &mut *const u8) -> u32 {
    let lead = **ptr;
    *ptr = (*ptr).add(1);
    if lead < 0x80 {
        // Single-byte (ASCII) sequence.
        return lead as u32;
    }

    // Payload bits of the lead byte, assuming a two-byte sequence for now.
    let head = utf8_first_byte(lead, 2);
    let second = **ptr;
    *ptr = (*ptr).add(1);
    if lead < 0xE0 {
        return utf8_acc_cont_byte(head, second);
    }

    let third = **ptr;
    *ptr = (*ptr).add(1);
    let tail = utf8_acc_cont_byte((second & CONT_MASK) as u32, third);
    if lead < 0xF0 {
        return (head << 12) | tail;
    }

    // Four-byte sequence: only the low three bits of the lead byte are payload.
    let fourth = **ptr;
    *ptr = (*ptr).add(1);
    ((head & 7) << 18) | utf8_acc_cont_byte(tail, fourth)
}
/// Decodes the UTF-8 code point that ends immediately before `*ptr` and moves
/// `*ptr` back to that code point's first byte.
///
/// The sequence is read last byte first; each earlier byte is examined only
/// while the byte just read is still a continuation byte.
///
/// # Safety
/// `*ptr` must point just past a complete code point inside a valid buffer;
/// no bounds checks are performed.
#[inline]
unsafe fn next_code_point_reverse(ptr: &mut *const u8) -> u32 {
    *ptr = (*ptr).sub(1);
    let last = **ptr;
    if last < 128 {
        // Single-byte (ASCII) sequence.
        return last as u32;
    }

    *ptr = (*ptr).sub(1);
    let third = **ptr;
    let upper = if utf8_is_cont_byte(third) {
        *ptr = (*ptr).sub(1);
        let second = **ptr;
        let top = if utf8_is_cont_byte(second) {
            // Four-byte sequence: one more byte back is the lead byte.
            *ptr = (*ptr).sub(1);
            let first = **ptr;
            utf8_acc_cont_byte(utf8_first_byte(first, 4), second)
        } else {
            // `second` is the lead byte of a three-byte sequence.
            utf8_first_byte(second, 3)
        };
        utf8_acc_cont_byte(top, third)
    } else {
        // `third` is the lead byte of a two-byte sequence.
        utf8_first_byte(third, 2)
    };
    utf8_acc_cont_byte(upper, last)
}
/// Extracts the payload bits of a UTF-8 lead byte.
///
/// `width` is the number of bytes in the sequence; the mask `0x7F >> width`
/// strips the length-marker bits from the top of `byte`.
#[inline]
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    let payload = byte & payload_mask;
    payload as u32
}
/// Appends the six payload bits of continuation byte `byte` to the
/// accumulator `ch`, shifting the bits already accumulated left by six.
#[inline]
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = (byte & CONT_MASK) as u32;
    let shifted = ch << 6;
    shifted | payload
}
/// Reports whether `byte` is a UTF-8 continuation byte, i.e. has the bit
/// pattern `10xx_xxxx` (`0x80..=0xBF`).
#[inline]
const fn utf8_is_cont_byte(byte: u8) -> bool {
    matches!(byte, 0x80..=0xBF)
}
/// Mask selecting the low six bits of a byte, which are the payload bits of a
/// UTF-8 continuation byte.
const CONT_MASK: u8 = 0b0011_1111;
/// Reports whether `b` is one of the six ASCII whitespace bytes: tab, line
/// feed, vertical tab, form feed, carriage return (the contiguous range
/// `0x09..=0x0D`) or space.
const fn is_py_ascii_whitespace(b: u8) -> bool {
    matches!(b, b'\t'..=b'\r' | b' ')
}
/// ASCII word-character test: `true` for `_` and for ASCII letters and
/// digits, `false` for every other value (including anything above `0xFF`).
#[inline]
pub(crate) fn is_word(ch: u32) -> bool {
    match u8::try_from(ch) {
        Ok(byte) => byte == b'_' || byte.is_ascii_alphanumeric(),
        // Values that do not fit in a byte are never ASCII word characters.
        Err(_) => false,
    }
}
#[inline]
pub(crate) fn is_space(ch: u32) -> bool {
u8::try_from(ch)
.map(is_py_ascii_whitespace)
.unwrap_or(false)
}
/// ASCII digit test: `true` exactly for the values of `'0'..='9'`.
#[inline]
pub(crate) fn is_digit(ch: u32) -> bool {
    matches!(u8::try_from(ch), Ok(b'0'..=b'9'))
}
/// Alphanumeric test used for the "locale" character class. As written it
/// accepts exactly the ASCII letters and digits.
#[inline]
pub(crate) fn is_loc_alnum(ch: u32) -> bool {
    if let Ok(byte) = u8::try_from(ch) {
        byte.is_ascii_alphanumeric()
    } else {
        false
    }
}
/// Word-character test for the "locale" character class: anything accepted by
/// [`is_loc_alnum`], plus the underscore.
#[inline]
pub(crate) fn is_loc_word(ch: u32) -> bool {
    is_loc_alnum(ch) || ch == u32::from('_')
}
/// Line-break test for the non-Unicode classes: only line feed (`0x0A`)
/// counts.
#[inline]
pub(crate) fn is_linebreak(ch: u32) -> bool {
    ch == u32::from(b'\n')
}
/// Lowercases `ch` using ASCII rules only; every value that is not an ASCII
/// uppercase letter is returned unchanged.
#[inline]
pub fn lower_ascii(ch: u32) -> u32 {
    match u8::try_from(ch) {
        Ok(byte) => u32::from(byte.to_ascii_lowercase()),
        // Not representable as a byte: leave untouched.
        Err(_) => ch,
    }
}
/// Lowercasing for the "locale" case-folding mode.
///
/// Currently a thin wrapper that forwards to [`lower_ascii`], so only ASCII
/// uppercase letters are changed.
#[inline]
pub(crate) fn lower_locate(ch: u32) -> u32 {
lower_ascii(ch)
}
/// Uppercasing for the "locale" case-folding mode. As written it applies
/// ASCII rules only; every value that is not an ASCII lowercase letter is
/// returned unchanged.
#[inline]
pub(crate) fn upper_locate(ch: u32) -> u32 {
    if let Ok(byte) = u8::try_from(ch) {
        u32::from(byte.to_ascii_uppercase())
    } else {
        ch
    }
}
/// Digit test for the Unicode character classes.
///
/// NOTE(review): despite the `uni` prefix this accepts only ASCII `'0'..='9'`;
/// non-ASCII decimal digits are rejected. Confirm whether that is intended.
#[inline]
pub(crate) fn is_uni_digit(ch: u32) -> bool {
    matches!(char::try_from(ch), Ok(c) if c.is_ascii_digit())
}
/// Whitespace test for the Unicode character classes.
///
/// Accepts the ASCII whitespace and separator controls (`0x09..=0x0D`,
/// `0x1C..=0x20`), NEL (`0x85`), NBSP (`0xA0`) and the listed space
/// characters from the `0x1680`, `0x2000`-block and `0x3000` ranges. The
/// table is a superset of the ASCII whitespace set, so no separate ASCII
/// check is needed.
#[inline]
pub(crate) fn is_uni_space(ch: u32) -> bool {
    matches!(
        ch,
        0x0009..=0x000D
            | 0x001C..=0x0020
            | 0x0085
            | 0x00A0
            | 0x1680
            | 0x2000..=0x200A
            | 0x2028
            | 0x2029
            | 0x202F
            | 0x205F
            | 0x3000
    )
}
/// Line-break test for the Unicode character classes: LF, VT, FF, CR
/// (`0x0A..=0x0D`), the separators `0x1C..=0x1E`, NEL (`0x85`), and the
/// line/paragraph separators `0x2028` and `0x2029`.
#[inline]
pub(crate) fn is_uni_linebreak(ch: u32) -> bool {
    matches!(
        ch,
        0x000A..=0x000D | 0x001C..=0x001E | 0x0085 | 0x2028 | 0x2029
    )
}
/// Alphanumeric test for the Unicode character classes: `true` when `ch` is a
/// valid Unicode scalar value for which `char::is_alphanumeric` holds.
#[inline]
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
    match char::try_from(ch) {
        Ok(c) => c.is_alphanumeric(),
        // Surrogates and out-of-range values are not characters at all.
        Err(_) => false,
    }
}
/// Word-character test for the Unicode character classes: anything accepted
/// by [`is_uni_alnum`], plus the underscore.
#[inline]
pub(crate) fn is_uni_word(ch: u32) -> bool {
    is_uni_alnum(ch) || ch == u32::from('_')
}
/// Lowercases `ch` using Unicode rules.
///
/// When the lowercase mapping expands to several characters only the first
/// one is returned. Values that are not valid Unicode scalar values are
/// returned unchanged.
#[inline]
pub fn lower_unicode(ch: u32) -> u32 {
    match char::try_from(ch) {
        Ok(c) => {
            // The lowercase mapping always yields at least one character.
            let first = c.to_lowercase().next().unwrap();
            u32::from(first)
        }
        Err(_) => ch,
    }
}
/// Uppercases `ch` using Unicode rules.
///
/// When the uppercase mapping expands to several characters (for example
/// `ß` → `SS`) only the first one is returned. Values that are not valid
/// Unicode scalar values are returned unchanged.
#[inline]
pub fn upper_unicode(ch: u32) -> u32 {
    match char::try_from(ch) {
        Ok(c) => {
            // The uppercase mapping always yields at least one character.
            let first = c.to_uppercase().next().unwrap();
            u32::from(first)
        }
        Err(_) => ch,
    }
}