use memchr::{memchr, memchr2, memchr3};
#[derive(Debug, Clone)]
pub struct SimdLexer<'a> {
bytes: &'a [u8],
len: usize,
}
impl<'a> SimdLexer<'a> {
#[inline]
pub fn new(bytes: &'a [u8]) -> Self {
Self {
bytes,
len: bytes.len(),
}
}
#[inline]
pub fn len(&self) -> usize {
self.len
}
#[inline]
pub fn is_empty(&self) -> bool {
self.len == 0
}
#[inline]
pub fn skip_whitespace(&self, mut pos: usize) -> usize {
if self.len - pos < 16 {
return self.skip_whitespace_scalar(pos);
}
while pos < self.len {
match self.bytes[pos] {
b' ' | b'\t' | b'\n' | b'\r' => pos += 1,
_ => break,
}
}
pos
}
#[inline]
fn skip_whitespace_scalar(&self, mut pos: usize) -> usize {
while pos < self.len {
match self.bytes[pos] {
b' ' | b'\t' | b'\n' | b'\r' => pos += 1,
_ => break,
}
}
pos
}
#[inline]
pub fn skip_whitespace_and_comments(&self, mut pos: usize) -> usize {
loop {
pos = self.skip_whitespace(pos);
if pos < self.len && self.bytes[pos] == b'#' {
pos = self.find_line_end(pos).unwrap_or(self.len);
} else {
break;
}
}
pos
}
#[inline]
pub fn find_byte(&self, byte: u8, pos: usize) -> Option<usize> {
if pos >= self.len {
return None;
}
memchr(byte, &self.bytes[pos..]).map(|offset| pos + offset)
}
#[inline]
pub fn find_either_byte(&self, byte1: u8, byte2: u8, pos: usize) -> Option<usize> {
if pos >= self.len {
return None;
}
memchr2(byte1, byte2, &self.bytes[pos..]).map(|offset| pos + offset)
}
#[inline]
pub fn find_any_of_three(&self, byte1: u8, byte2: u8, byte3: u8, pos: usize) -> Option<usize> {
if pos >= self.len {
return None;
}
memchr3(byte1, byte2, byte3, &self.bytes[pos..]).map(|offset| pos + offset)
}
#[inline]
pub fn find_line_end(&self, pos: usize) -> Option<usize> {
self.find_either_byte(b'\n', b'\r', pos)
}
#[inline]
pub fn scan_until_whitespace(&self, mut pos: usize) -> usize {
while pos < self.len {
match self.bytes[pos] {
b' ' | b'\t' | b'\n' | b'\r' => break,
_ => pos += 1,
}
}
pos
}
pub fn scan_until_delimiter(&self, mut pos: usize) -> usize {
while pos < self.len {
let byte = self.bytes[pos];
if byte.is_ascii_alphanumeric() || byte == b'_' || byte == b':' || byte == b'-' {
pos += 1;
continue;
}
match byte {
b' ' | b'\t' | b'\n' | b'\r' | b'<' | b'>' | b'@' | b';' | b',' | b'.' | b'['
| b']' | b'(' | b')' | b'{' | b'}' | b'#' | b'"' | b'\'' => break,
_ => pos += 1,
}
}
pos
}
pub fn scan_iri_ref(&self, start: usize) -> Option<usize> {
if start >= self.len || self.bytes[start] != b'<' {
return None;
}
let mut pos = start + 1;
while pos < self.len {
match self.bytes[pos] {
b'>' => return Some(pos + 1),
b'\\' => pos += 2, b'\n' | b'\r' => return None, _ => pos += 1,
}
}
None }
pub fn scan_string_literal(&self, start: usize, quote: u8) -> Option<usize> {
if start >= self.len || self.bytes[start] != quote {
return None;
}
let mut pos = start + 1;
while pos < self.len {
match self.bytes[pos] {
b'\\' => pos += 2, byte if byte == quote => return Some(pos + 1),
_ => pos += 1,
}
}
None }
pub fn scan_long_string_literal(&self, start: usize, quote: u8) -> Option<usize> {
if start + 2 >= self.len {
return None;
}
if self.bytes[start] != quote
|| self.bytes[start + 1] != quote
|| self.bytes[start + 2] != quote
{
return None;
}
let mut pos = start + 3;
while pos + 2 < self.len {
if self.bytes[pos] == quote
&& self.bytes[pos + 1] == quote
&& self.bytes[pos + 2] == quote
{
return Some(pos + 3);
}
if self.bytes[pos] == b'\\' {
pos += 2; } else {
pos += 1;
}
}
None }
#[inline]
pub fn is_long_string_start(&self, pos: usize, quote: u8) -> bool {
pos + 2 < self.len
&& self.bytes[pos] == quote
&& self.bytes[pos + 1] == quote
&& self.bytes[pos + 2] == quote
}
#[inline]
pub fn byte_at(&self, pos: usize) -> Option<u8> {
if pos < self.len {
Some(self.bytes[pos])
} else {
None
}
}
#[inline]
pub fn slice(&self, start: usize, end: usize) -> &'a [u8] {
&self.bytes[start..end.min(self.len)]
}
pub fn count_lines(&self, pos: usize) -> usize {
let slice = &self.bytes[..pos.min(self.len)];
let mut count = 1;
let mut search_pos = 0;
while let Some(newline_pos) = memchr(b'\n', &slice[search_pos..]) {
count += 1;
search_pos += newline_pos + 1;
}
count
}
pub fn count_byte(&self, byte: u8, start: usize, end: usize) -> usize {
let slice = &self.bytes[start..end.min(self.len)];
let mut count = 0;
let mut pos = 0;
while let Some(offset) = memchr(byte, &slice[pos..]) {
count += 1;
pos += offset + 1;
}
count
}
#[inline]
pub fn validate_utf8(&self, start: usize, end: usize) -> bool {
std::str::from_utf8(&self.bytes[start..end.min(self.len)]).is_ok()
}
}
#[derive(Debug, Clone, Default)]
pub struct SimdStats {
pub simd_ops: usize,
pub scalar_ops: usize,
pub bytes_processed: usize,
}
impl SimdStats {
pub fn simd_percentage(&self) -> f64 {
let total = self.simd_ops + self.scalar_ops;
if total == 0 {
0.0
} else {
(self.simd_ops as f64 / total as f64) * 100.0
}
}
pub fn report(&self) -> String {
format!(
"SIMD Lexer Statistics:\n\
- SIMD operations: {}\n\
- Scalar operations: {}\n\
- SIMD usage: {:.1}%\n\
- Bytes processed: {}",
self.simd_ops,
self.scalar_ops,
self.simd_percentage(),
self.bytes_processed
)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_skip_whitespace() {
let input = " hello";
let lexer = SimdLexer::new(input.as_bytes());
let pos = lexer.skip_whitespace(0);
assert_eq!(pos, 3);
assert_eq!(lexer.byte_at(pos), Some(b'h'));
}
#[test]
fn test_skip_whitespace_long() {
let input = format!("{}hello", " ".repeat(100));
let lexer = SimdLexer::new(input.as_bytes());
let pos = lexer.skip_whitespace(0);
assert_eq!(pos, 100);
}
#[test]
fn test_find_byte_simd() {
let input = "hello world, this is a test";
let lexer = SimdLexer::new(input.as_bytes());
assert_eq!(lexer.find_byte(b'w', 0), Some(6));
assert_eq!(lexer.find_byte(b',', 0), Some(11));
assert_eq!(lexer.find_byte(b'z', 0), None);
}
#[test]
fn test_find_either_byte() {
let input = "hello world";
let lexer = SimdLexer::new(input.as_bytes());
assert_eq!(lexer.find_either_byte(b'w', b'o', 0), Some(4));
}
#[test]
fn test_scan_until_delimiter() {
let input = "prefix:local <iri>";
let lexer = SimdLexer::new(input.as_bytes());
let pos = lexer.scan_until_delimiter(0);
assert_eq!(pos, 12); assert_eq!(
std::str::from_utf8(lexer.slice(0, pos)).expect("valid UTF-8"),
"prefix:local"
);
}
#[test]
fn test_scan_iri_ref() {
let input = "<http://example.org/> more";
let lexer = SimdLexer::new(input.as_bytes());
let end = lexer.scan_iri_ref(0);
assert_eq!(end, Some(21));
}
#[test]
fn test_scan_string_literal() {
let input = r#""hello world" more"#;
let lexer = SimdLexer::new(input.as_bytes());
let end = lexer.scan_string_literal(0, b'"');
assert_eq!(end, Some(13));
}
#[test]
fn test_scan_long_string() {
let input = r#""""hello
world""" more"#;
let lexer = SimdLexer::new(input.as_bytes());
let end = lexer.scan_long_string_literal(0, b'"');
assert_eq!(end, Some(17));
}
#[test]
fn test_count_lines_simd() {
let input = "line1\nline2\nline3\nline4";
let lexer = SimdLexer::new(input.as_bytes());
assert_eq!(lexer.count_lines(0), 1);
assert_eq!(lexer.count_lines(6), 2);
assert_eq!(lexer.count_lines(12), 3);
assert_eq!(lexer.count_lines(input.len()), 4);
}
#[test]
fn test_count_byte() {
let input = "a,b,c,d,e";
let lexer = SimdLexer::new(input.as_bytes());
assert_eq!(lexer.count_byte(b',', 0, input.len()), 4);
assert_eq!(lexer.count_byte(b'a', 0, input.len()), 1);
assert_eq!(lexer.count_byte(b'z', 0, input.len()), 0);
}
#[test]
fn test_validate_utf8() {
let valid = "hello 世界";
let lexer = SimdLexer::new(valid.as_bytes());
assert!(lexer.validate_utf8(0, valid.len()));
let invalid = &[0xFF, 0xFE, 0xFD];
let lexer = SimdLexer::new(invalid);
assert!(!lexer.validate_utf8(0, invalid.len()));
}
#[test]
fn test_skip_whitespace_and_comments() {
let input = " # comment\n hello";
let lexer = SimdLexer::new(input.as_bytes());
let pos = lexer.skip_whitespace_and_comments(0);
assert_eq!(pos, 14); assert_eq!(lexer.byte_at(pos), Some(b'h'));
}
#[test]
fn test_is_long_string_start() {
let lexer = SimdLexer::new(br#"""""#);
assert!(lexer.is_long_string_start(0, b'"'));
let lexer2 = SimdLexer::new(br#""hello"#);
assert!(!lexer2.is_long_string_start(0, b'"'));
}
#[test]
fn test_simd_stats() {
let stats = SimdStats {
simd_ops: 80,
scalar_ops: 20,
bytes_processed: 10000,
};
assert_eq!(stats.simd_percentage(), 80.0);
let report = stats.report();
assert!(report.contains("80.0%"));
assert!(report.contains("10000"));
}
}