use std::fmt::{Display, Formatter};
use std::mem::size_of;
use std::num::TryFromIntError;
use bitvec::order::Lsb0;
use bitvec::slice::{BitSlice, IterOnes};
pub const OPCODE_PREFIX: u8 = 0xAA;
pub type NumAlt = u8;
#[derive(Debug, Default, Copy, Clone)]
pub struct SplitId(u16);
impl From<SplitId> for usize {
fn from(value: SplitId) -> Self {
value.0 as Self
}
}
impl Display for SplitId {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl SplitId {
pub const BITS: usize = 13;
pub const MAX: usize = (1 << SplitId::BITS) - 1;
#[inline]
pub fn to_le_bytes(self) -> [u8; size_of::<Self>()] {
self.0.to_le_bytes()
}
#[inline]
pub fn from_le_bytes(bytes: [u8; size_of::<Self>()]) -> Self {
Self(u16::from_le_bytes(bytes))
}
#[inline]
pub fn add(self, amount: u16) -> Option<Self> {
let sum = self.0.checked_add(amount)?;
if sum as usize > Self::MAX {
return None;
}
Some(Self(sum))
}
}
pub struct Offset(i32);
impl From<Offset> for i32 {
#[inline]
fn from(value: Offset) -> Self {
value.0
}
}
impl From<Offset> for isize {
#[inline]
fn from(value: Offset) -> Self {
value.0 as isize
}
}
impl From<usize> for Offset {
#[inline]
fn from(value: usize) -> Self {
Self(i32::try_from(value).unwrap())
}
}
impl TryFrom<isize> for Offset {
type Error = TryFromIntError;
#[inline]
fn try_from(value: isize) -> Result<Self, Self::Error> {
Ok(Self(i32::try_from(value)?))
}
}
impl Offset {
#[inline]
pub fn from_le_bytes(bytes: [u8; size_of::<Self>()]) -> Offset {
Offset(i32::from_le_bytes(bytes))
}
#[inline]
pub fn to_le_bytes(&self) -> [u8; size_of::<Self>()] {
self.0.to_le_bytes()
}
}
pub enum Instr<'a> {
Match,
AnyByte,
Byte(u8),
CaseInsensitiveChar(u8),
MaskedByte { byte: u8, mask: u8 },
ClassBitmap(ClassBitmap<'a>),
ClassRanges(ClassRanges<'a>),
SplitA(SplitId, Offset),
SplitB(SplitId, Offset),
SplitN(SplitN<'a>),
RepeatGreedy { offset: Offset, min: u32, max: u32 },
RepeatNonGreedy { offset: Offset, min: u32, max: u32 },
Jump(Offset),
Start,
End,
LineStart,
LineEnd,
WordBoundary,
WordBoundaryNeg,
WordStart,
WordEnd,
}
impl Instr<'_> {
pub const MATCH: u8 = 0x00;
pub const SPLIT_A: u8 = 0x01;
pub const SPLIT_B: u8 = 0x02;
pub const SPLIT_N: u8 = 0x03;
pub const JUMP: u8 = 0x04;
pub const ANY_BYTE: u8 = 0x05;
pub const MASKED_BYTE: u8 = 0x06;
pub const CASE_INSENSITIVE_CHAR: u8 = 0x07;
pub const CLASS_BITMAP: u8 = 0x08;
pub const CLASS_RANGES: u8 = 0x09;
pub const START: u8 = 0x0A;
pub const END: u8 = 0x0B;
pub const WORD_BOUNDARY: u8 = 0x0C;
pub const WORD_BOUNDARY_NEG: u8 = 0x0D;
pub const WORD_START: u8 = 0x0E;
pub const WORD_END: u8 = 0x0F;
pub const REPEAT_GREEDY: u8 = 0x10;
pub const REPEAT_NON_GREEDY: u8 = 0x11;
pub const LINE_START: u8 = 0x12;
pub const LINE_END: u8 = 0x13;
}
pub(crate) struct InstrParser<'a> {
code: &'a [u8],
addr: usize,
}
impl<'a> InstrParser<'a> {
pub fn new(code: &'a [u8]) -> Self {
Self { code, addr: 0 }
}
#[inline(always)]
pub fn decode_instr(code: &[u8]) -> (Instr<'_>, usize) {
match code[..] {
[OPCODE_PREFIX, Instr::ANY_BYTE, ..] => (Instr::AnyByte, 2),
[OPCODE_PREFIX, Instr::MASKED_BYTE, byte, mask, ..] => {
(Instr::MaskedByte { byte, mask }, 4)
}
[OPCODE_PREFIX, Instr::CASE_INSENSITIVE_CHAR, byte, ..] => {
(Instr::CaseInsensitiveChar(byte), 3)
}
[OPCODE_PREFIX, Instr::JUMP, ..] => {
let offset = Self::decode_offset(&code[2..]);
(Instr::Jump(offset), 2 + size_of::<Offset>())
}
[OPCODE_PREFIX, Instr::SPLIT_A, ..] => {
let id = Self::decode_split_id(&code[2..]);
let offset =
Self::decode_offset(&code[2 + size_of::<SplitId>()..]);
(
Instr::SplitA(id, offset),
2 + size_of::<SplitId>() + size_of::<Offset>(),
)
}
[OPCODE_PREFIX, Instr::SPLIT_B, ..] => {
let id = Self::decode_split_id(&code[2..]);
let offset =
Self::decode_offset(&code[2 + size_of::<SplitId>()..]);
(
Instr::SplitB(id, offset),
2 + size_of::<SplitId>() + size_of::<Offset>(),
)
}
[OPCODE_PREFIX, Instr::SPLIT_N, ..] => {
let id = Self::decode_split_id(&code[2..]);
let n =
Self::decode_num_alt(&code[2 + size_of::<SplitId>()..]);
let offsets =
&code[2 + size_of::<SplitId>() + size_of::<NumAlt>()
..2 + size_of::<SplitId>()
+ size_of::<NumAlt>()
+ size_of::<Offset>() * n as usize];
(
Instr::SplitN(SplitN(id, offsets)),
2 + size_of::<SplitId>()
+ size_of::<NumAlt>()
+ size_of::<Offset>() * n as usize,
)
}
[OPCODE_PREFIX, Instr::REPEAT_GREEDY, ..] => {
let offset = Self::decode_offset(&code[2..]);
let min = Self::decode_u32(&code[2 + size_of::<Offset>()..]);
let max = Self::decode_u32(
&code[2 + size_of::<Offset>() + size_of::<u32>()..],
);
(
Instr::RepeatGreedy { offset, min, max },
2 + size_of::<Offset>()
+ size_of::<u32>()
+ size_of::<u32>(),
)
}
[OPCODE_PREFIX, Instr::REPEAT_NON_GREEDY, ..] => {
let offset = Self::decode_offset(&code[2..]);
let min = Self::decode_u32(&code[2 + size_of::<Offset>()..]);
let max = Self::decode_u32(
&code[2 + size_of::<Offset>() + size_of::<u32>()..],
);
(
Instr::RepeatNonGreedy { offset, min, max },
2 + size_of::<Offset>()
+ size_of::<u32>()
+ size_of::<u32>(),
)
}
[OPCODE_PREFIX, Instr::CLASS_RANGES, ..] => {
let n = *unsafe { code.get_unchecked(2) } as usize;
let ranges =
unsafe { code.get_unchecked(3..3 + size_of::<i16>() * n) };
(
Instr::ClassRanges(ClassRanges(ranges)),
3 + size_of::<i16>() * n,
)
}
[OPCODE_PREFIX, Instr::CLASS_BITMAP, ..] => {
let bitmap = &code[2..2 + 32];
(Instr::ClassBitmap(ClassBitmap(bitmap)), 2 + bitmap.len())
}
[OPCODE_PREFIX, Instr::START, ..] => (Instr::Start, 2),
[OPCODE_PREFIX, Instr::END, ..] => (Instr::End, 2),
[OPCODE_PREFIX, Instr::LINE_START, ..] => (Instr::LineStart, 2),
[OPCODE_PREFIX, Instr::LINE_END, ..] => (Instr::LineEnd, 2),
[OPCODE_PREFIX, Instr::WORD_BOUNDARY, ..] => {
(Instr::WordBoundary, 2)
}
[OPCODE_PREFIX, Instr::WORD_BOUNDARY_NEG, ..] => {
(Instr::WordBoundaryNeg, 2)
}
[OPCODE_PREFIX, Instr::WORD_START, ..] => (Instr::WordStart, 2),
[OPCODE_PREFIX, Instr::WORD_END, ..] => (Instr::WordEnd, 2),
[OPCODE_PREFIX, Instr::MATCH, ..] => (Instr::Match, 2),
[OPCODE_PREFIX, OPCODE_PREFIX, ..] => {
(Instr::Byte(OPCODE_PREFIX), 2)
}
[b, ..] => (Instr::Byte(b), 1),
_ => unreachable!(),
}
}
fn decode_u32(slice: &[u8]) -> u32 {
let bytes: &[u8; size_of::<u32>()] =
unsafe { &*(slice.as_ptr() as *const [u8; size_of::<u32>()]) };
u32::from_le_bytes(*bytes)
}
fn decode_offset(slice: &[u8]) -> Offset {
let bytes: &[u8; size_of::<Offset>()] =
unsafe { &*(slice.as_ptr() as *const [u8; size_of::<Offset>()]) };
Offset::from_le_bytes(*bytes)
}
fn decode_num_alt(slice: &[u8]) -> NumAlt {
let bytes: &[u8; size_of::<NumAlt>()] =
unsafe { &*(slice.as_ptr() as *const [u8; size_of::<NumAlt>()]) };
NumAlt::from_le_bytes(*bytes)
}
fn decode_split_id(slice: &[u8]) -> SplitId {
let bytes: &[u8; size_of::<SplitId>()] =
unsafe { &*(slice.as_ptr() as *const [u8; size_of::<SplitId>()]) };
SplitId::from_le_bytes(*bytes)
}
}
impl<'a> Iterator for InstrParser<'a> {
type Item = (Instr<'a>, usize);
fn next(&mut self) -> Option<Self::Item> {
if self.code.is_empty() {
return None;
}
let (instr, size) = InstrParser::decode_instr(self.code);
let addr = self.addr;
self.addr += size;
self.code = &self.code[size..];
Some((instr, addr))
}
}
pub struct SplitN<'a>(SplitId, &'a [u8]);
impl<'a> SplitN<'a> {
#[inline]
pub fn id(&self) -> SplitId {
self.0
}
#[inline]
pub fn offsets(&self) -> SplitOffsets<'a> {
SplitOffsets(self.1)
}
}
pub struct SplitOffsets<'a>(&'a [u8]);
impl Iterator for SplitOffsets<'_> {
type Item = Offset;
fn next(&mut self) -> Option<Self::Item> {
if self.0.len() < size_of::<Offset>() {
return None;
}
let next = Offset::from_le_bytes(
(&self.0[..size_of::<Offset>()]).try_into().unwrap(),
);
self.0 = &self.0[size_of::<Offset>()..];
Some(next)
}
}
impl DoubleEndedIterator for SplitOffsets<'_> {
fn next_back(&mut self) -> Option<Self::Item> {
let len = self.0.len();
if len < size_of::<Offset>() {
return None;
}
let next = Offset::from_le_bytes(
(&self.0[len - size_of::<Offset>()..len]).try_into().unwrap(),
);
self.0 = &self.0[..len - size_of::<Offset>()];
Some(next)
}
}
pub struct ClassRanges<'a>(&'a [u8]);
impl<'a> ClassRanges<'a> {
pub fn ranges(&self) -> Ranges<'a> {
Ranges(self.0)
}
pub fn contains(&self, byte: u8) -> bool {
for range in self.ranges() {
if (range.0..=range.1).contains(&byte) {
return true;
}
}
false
}
}
pub struct Ranges<'a>(&'a [u8]);
impl Iterator for Ranges<'_> {
type Item = (u8, u8);
fn next(&mut self) -> Option<Self::Item> {
if self.0.len() < 2 {
return None;
}
let start = self.0[0];
let end = self.0[1];
self.0 = &self.0[2..];
Some((start, end))
}
}
pub struct ClassBitmap<'a>(&'a [u8]);
impl<'a> ClassBitmap<'a> {
pub fn bytes(&self) -> IterOnes<'a, u8, Lsb0> {
BitSlice::<_, Lsb0>::from_slice(self.0).iter_ones()
}
pub fn contains(&self, byte: u8) -> bool {
unsafe {
*BitSlice::<_, Lsb0>::from_slice(self.0)
.get_unchecked(byte as usize)
}
}
}
pub fn literal_code_length(literal: &[u8]) -> usize {
let mut length = literal.len();
for b in literal {
if *b == OPCODE_PREFIX {
length += 1;
}
}
length
}