use memchr::{memchr, memchr2, memchr3};
/// Byte-level scanner over a borrowed input slice.
///
/// Positions passed to and returned from its methods are byte indices into
/// that slice.
#[derive(Debug, Clone)]
pub struct FastScanner<'a> {
/// The input being scanned; borrowed for `'a`, never copied.
bytes: &'a [u8],
/// Length of `bytes`, stored once by `new` and used for all bounds checks.
len: usize,
}
impl<'a> FastScanner<'a> {
    /// Creates a scanner over `bytes`, caching the slice length.
    pub fn new(bytes: &'a [u8]) -> Self {
        Self {
            bytes,
            len: bytes.len(),
        }
    }

    /// Returns the number of bytes in the input.
    #[inline]
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` when the input contains no bytes.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Skips spaces, tabs, line feeds and carriage returns starting at `pos`.
    ///
    /// Returns the index of the first non-whitespace byte, or the input length
    /// when only whitespace remains. A `pos` at or past the end is returned
    /// unchanged.
    #[inline]
    pub fn skip_whitespace(&self, mut pos: usize) -> usize {
        while pos < self.len {
            match self.bytes[pos] {
                b' ' | b'\t' | b'\n' | b'\r' => pos += 1,
                _ => break,
            }
        }
        pos
    }

    /// Skips whitespace and `#` comments starting at `pos`.
    ///
    /// A comment runs from `#` up to (not including) the next `\n` or `\r`, or
    /// to the end of input when no line break follows. Returns the index of
    /// the first byte that is neither whitespace nor inside a comment.
    #[inline]
    pub fn skip_whitespace_and_comments(&self, mut pos: usize) -> usize {
        loop {
            pos = self.skip_whitespace(pos);
            if pos < self.len && self.bytes[pos] == b'#' {
                // Jump to the line break; the next iteration consumes it as whitespace.
                pos = self.find_line_end(pos).unwrap_or(self.len);
            } else {
                break;
            }
        }
        pos
    }

    /// Returns the index of the first occurrence of `byte` at or after `pos`,
    /// or `None` if there is none (including when `pos` is past the end).
    #[inline]
    pub fn find_byte(&self, byte: u8, pos: usize) -> Option<usize> {
        if pos >= self.len {
            return None;
        }
        memchr(byte, &self.bytes[pos..]).map(|offset| pos + offset)
    }

    /// Returns the index of the first occurrence of `byte1` or `byte2` at or
    /// after `pos`, or `None` if neither occurs.
    #[inline]
    pub fn find_either_byte(&self, byte1: u8, byte2: u8, pos: usize) -> Option<usize> {
        if pos >= self.len {
            return None;
        }
        memchr2(byte1, byte2, &self.bytes[pos..]).map(|offset| pos + offset)
    }

    /// Returns the index of the first occurrence of `byte1`, `byte2` or
    /// `byte3` at or after `pos`, or `None` if none of them occurs.
    #[inline]
    pub fn find_any_of_three(&self, byte1: u8, byte2: u8, byte3: u8, pos: usize) -> Option<usize> {
        if pos >= self.len {
            return None;
        }
        memchr3(byte1, byte2, byte3, &self.bytes[pos..]).map(|offset| pos + offset)
    }

    /// Returns the index of the first `\n` or `\r` at or after `pos`, or
    /// `None` when the rest of the input contains no line break.
    #[inline]
    pub fn find_line_end(&self, pos: usize) -> Option<usize> {
        self.find_either_byte(b'\n', b'\r', pos)
    }

    /// Advances from `pos` until a delimiter byte or the end of input.
    ///
    /// Delimiters are whitespace and any of `< > @ ; , . [ ] ( ) { } # " '`.
    /// Returns the index of the delimiter (or the input length); the
    /// delimiter itself is not consumed.
    pub fn scan_until_delimiter(&self, mut pos: usize) -> usize {
        while pos < self.len {
            match self.bytes[pos] {
                b' ' | b'\t' | b'\n' | b'\r' | b'<' | b'>' | b'@' | b';' | b',' | b'.' | b'['
                | b']' | b'(' | b')' | b'{' | b'}' | b'#' | b'"' | b'\'' => break,
                _ => pos += 1,
            }
        }
        pos
    }

    /// Scans a single-quoted-style string literal whose opening quote is at
    /// `start` (the byte at `start` itself is not inspected).
    ///
    /// A backslash skips the byte that follows it, so an escaped `quote` does
    /// not end the literal. Returns the index just past the closing `quote`,
    /// or `None` if the input ends first.
    pub fn scan_string_literal(&self, start: usize, quote: u8) -> Option<usize> {
        let mut pos = start + 1;
        while pos < self.len {
            match self.bytes[pos] {
                b'\\' => pos += 2,
                byte if byte == quote => return Some(pos + 1),
                _ => pos += 1,
            }
        }
        None
    }

    /// Scans a triple-quoted string literal whose three opening quotes begin
    /// at `start` (those three bytes are skipped without being inspected).
    ///
    /// A backslash skips the byte that follows it. Returns the index just past
    /// the closing run of three `quote` bytes, or `None` if no such run is
    /// found before the end of input.
    pub fn scan_long_string_literal(&self, start: usize, quote: u8) -> Option<usize> {
        let mut pos = start + 3;
        // `pos + 2 < len` guarantees all three look-ahead bytes are in bounds.
        while pos + 2 < self.len {
            if self.bytes[pos] == quote
                && self.bytes[pos + 1] == quote
                && self.bytes[pos + 2] == quote
            {
                return Some(pos + 3);
            }
            pos += if self.bytes[pos] == b'\\' { 2 } else { 1 };
        }
        None
    }

    /// Returns `true` when three consecutive `quote` bytes start at `pos`.
    #[inline]
    pub fn is_long_string_start(&self, pos: usize, quote: u8) -> bool {
        pos + 2 < self.len
            && self.bytes[pos] == quote
            && self.bytes[pos + 1] == quote
            && self.bytes[pos + 2] == quote
    }

    /// Scans an IRI reference whose opening `<` is at `start` (the byte at
    /// `start` itself is not inspected).
    ///
    /// A backslash skips the byte that follows it. Returns the index just past
    /// the closing `>`, or `None` if a `\n`, `\r` or the end of input is
    /// reached first.
    pub fn scan_iri_ref(&self, start: usize) -> Option<usize> {
        let mut pos = start + 1;
        while pos < self.len {
            match self.bytes[pos] {
                b'>' => return Some(pos + 1),
                b'\\' => pos += 2,
                b'\n' | b'\r' => return None,
                _ => pos += 1,
            }
        }
        None
    }

    /// Returns `true` for bytes accepted as the first byte of a name: ASCII
    /// letters and the byte values `0xC0..=0xD6`, `0xD8..=0xF6`, `0xF8..=0xFF`.
    ///
    /// NOTE(review): the non-ASCII ranges look like Latin-1 code-point ranges
    /// rather than UTF-8 byte ranges. UTF-8 continuation bytes (`0x80..=0xBF`)
    /// are rejected by [`Self::is_pn_chars`], so a multi-byte character would
    /// end a name after its lead byte. Confirm the intended input encoding.
    #[inline]
    pub fn is_pn_chars_base(byte: u8) -> bool {
        matches!(byte,
            b'A'..=b'Z' | b'a'..=b'z' |
            0xC0..=0xD6 | 0xD8..=0xF6 | 0xF8..=0xFF
        )
    }

    /// Returns `true` for bytes accepted inside a name: everything accepted by
    /// [`Self::is_pn_chars_base`] plus ASCII digits, `-`, `_` and `.`.
    ///
    /// `.` is accepted here so that names may contain interior dots;
    /// [`Self::scan_prefixed_name`] strips trailing dots itself.
    #[inline]
    pub fn is_pn_chars(byte: u8) -> bool {
        matches!(byte,
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' |
            b'-' | b'_' | b'.' |
            0xC0..=0xD6 | 0xD8..=0xF6 | 0xF8..=0xFF
        )
    }

    /// Advances from `pos` while bytes satisfy [`Self::is_pn_chars`] and
    /// returns the index where the run stops.
    #[inline]
    fn scan_pn_chars(&self, mut pos: usize) -> usize {
        while pos < self.len && Self::is_pn_chars(self.bytes[pos]) {
            pos += 1;
        }
        pos
    }

    /// Moves `end` back over trailing `.` bytes, never going below `floor`.
    #[inline]
    fn trim_trailing_dots(&self, floor: usize, mut end: usize) -> usize {
        while end > floor && self.bytes[end - 1] == b'.' {
            end -= 1;
        }
        end
    }

    /// Scans a `prefix:local` name starting at `start`.
    ///
    /// Returns `Some((colon, end))` where `colon` is the index of the `:`
    /// (equivalently, the exclusive end of the prefix) and `end` is the
    /// exclusive end of the local part. Both the prefix and the local part may
    /// be empty. Returns `None` when no `:` directly follows the prefix.
    ///
    /// A `.` may appear inside either part but never at its end, so the
    /// statement terminator in input such as `ex:c.` is left unconsumed.
    pub fn scan_prefixed_name(&self, start: usize) -> Option<(usize, usize)> {
        let prefix_end = match self.byte_at(start) {
            Some(first) if Self::is_pn_chars_base(first) => {
                let tail_start = start + 1;
                self.trim_trailing_dots(tail_start, self.scan_pn_chars(tail_start))
            }
            _ => start,
        };

        if self.byte_at(prefix_end) != Some(b':') {
            return None;
        }

        let local_start = prefix_end + 1;
        let local_end = match self.byte_at(local_start) {
            Some(first) if Self::is_pn_chars_base(first) || first == b'_' => {
                let tail_start = local_start + 1;
                self.trim_trailing_dots(tail_start, self.scan_pn_chars(tail_start))
            }
            _ => local_start,
        };

        Some((prefix_end, local_end))
    }

    /// Returns the bytes in `start..end`, clamped to the input.
    ///
    /// `end` is clamped to the input length and `start` is clamped to the
    /// (clamped) `end`, so an out-of-range or inverted request yields an empty
    /// slice instead of panicking.
    #[inline]
    pub fn slice(&self, start: usize, end: usize) -> &'a [u8] {
        let end = end.min(self.len);
        let start = start.min(end);
        &self.bytes[start..end]
    }

    /// Returns the byte at `pos`, or `None` when `pos` is past the end.
    #[inline]
    pub fn byte_at(&self, pos: usize) -> Option<u8> {
        self.bytes.get(pos).copied()
    }

    /// Returns the 1-based line number of `pos`: one plus the number of `\n`
    /// bytes before `pos` (with `pos` clamped to the input length).
    pub fn count_lines(&self, pos: usize) -> usize {
        self.bytes[..pos.min(self.len)]
            .iter()
            .filter(|&&b| b == b'\n')
            .count()
            + 1
    }
}
/// Unit tests for `FastScanner`.
///
/// Inputs whose expected offsets depend on an exact number of whitespace
/// bytes carry a comment spelling out the count, so the literal and the
/// asserted offset can be checked against each other.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_skip_whitespace() {
        // Three leading spaces, so the first non-whitespace byte is at index 3.
        let input = "   hello";
        let scanner = FastScanner::new(input.as_bytes());
        let pos = scanner.skip_whitespace(0);
        assert_eq!(pos, 3);
        assert_eq!(scanner.byte_at(pos), Some(b'h'));
    }

    #[test]
    fn test_skip_whitespace_mixed() {
        // 2 spaces + tab + newline + 2 spaces = 6 whitespace bytes.
        let input = "  \t\n  hello";
        let scanner = FastScanner::new(input.as_bytes());
        let pos = scanner.skip_whitespace(0);
        assert_eq!(pos, 6);
    }

    #[test]
    fn test_find_byte() {
        let input = "hello world";
        let scanner = FastScanner::new(input.as_bytes());
        assert_eq!(scanner.find_byte(b'w', 0), Some(6));
        assert_eq!(scanner.find_byte(b'o', 0), Some(4));
        assert_eq!(scanner.find_byte(b'z', 0), None);
    }

    #[test]
    fn test_find_either_byte() {
        let input = "hello world";
        let scanner = FastScanner::new(input.as_bytes());
        // The first 'o' (index 4) comes before 'w' (index 6).
        assert_eq!(scanner.find_either_byte(b'w', b'o', 0), Some(4));
        assert_eq!(scanner.find_either_byte(b'x', b'y', 0), None);
    }

    #[test]
    fn test_scan_until_delimiter() {
        let input = "prefix:local <iri>";
        let scanner = FastScanner::new(input.as_bytes());
        let pos = scanner.scan_until_delimiter(0);
        // ':' is not a delimiter; the scan stops at the space.
        assert_eq!(pos, 12);
        assert_eq!(
            std::str::from_utf8(scanner.slice(0, pos)).expect("valid UTF-8"),
            "prefix:local"
        );
    }

    #[test]
    fn test_scan_string_literal() {
        let input = r#""hello world" more"#;
        let scanner = FastScanner::new(input.as_bytes());
        let end = scanner.scan_string_literal(0, b'"');
        // Index just past the closing quote.
        assert_eq!(end, Some(13));
    }

    #[test]
    fn test_scan_string_with_escape() {
        let input = r#""hello \"world\"" more"#;
        let scanner = FastScanner::new(input.as_bytes());
        let end = scanner.scan_string_literal(0, b'"');
        assert_eq!(end, Some(17));
    }

    #[test]
    fn test_scan_long_string() {
        // Written with escapes so the embedded newline does not depend on
        // how this source file is laid out.
        let input = "\"\"\"hello\nworld\"\"\" more";
        let scanner = FastScanner::new(input.as_bytes());
        let end = scanner.scan_long_string_literal(0, b'"');
        // Index just past the closing triple quote.
        assert_eq!(end, Some(17));
    }

    #[test]
    fn test_scan_iri_ref() {
        let input = "<http://example.org/> more";
        let scanner = FastScanner::new(input.as_bytes());
        let end = scanner.scan_iri_ref(0);
        assert_eq!(end, Some(21));
    }

    #[test]
    fn test_scan_iri_ref_unterminated() {
        let input = "<http://example.org/";
        let scanner = FastScanner::new(input.as_bytes());
        let end = scanner.scan_iri_ref(0);
        assert_eq!(end, None);
    }

    #[test]
    fn test_scan_prefixed_name() {
        let input = "prefix:local ";
        let scanner = FastScanner::new(input.as_bytes());
        let result = scanner.scan_prefixed_name(0);
        // (index of ':', exclusive end of the local part)
        assert_eq!(result, Some((6, 12)));
    }

    #[test]
    fn test_scan_prefixed_name_no_local() {
        let input = "prefix: ";
        let scanner = FastScanner::new(input.as_bytes());
        let result = scanner.scan_prefixed_name(0);
        // Empty local part: it ends right after the ':'.
        assert_eq!(result, Some((6, 7)));
    }

    #[test]
    fn test_skip_whitespace_and_comments() {
        // 2 spaces + "# comment" (9 bytes) + newline + 2 spaces = 14 bytes.
        let input = "  # comment\n  hello";
        let scanner = FastScanner::new(input.as_bytes());
        let pos = scanner.skip_whitespace_and_comments(0);
        assert_eq!(pos, 14);
        assert_eq!(scanner.byte_at(pos), Some(b'h'));
    }

    #[test]
    fn test_is_long_string_start() {
        let scanner = FastScanner::new(br#"""""#);
        assert!(scanner.is_long_string_start(0, b'"'));
        let scanner2 = FastScanner::new(br#""hello"#);
        assert!(!scanner2.is_long_string_start(0, b'"'));
    }

    #[test]
    fn test_count_lines() {
        let input = "line1\nline2\nline3";
        let scanner = FastScanner::new(input.as_bytes());
        assert_eq!(scanner.count_lines(0), 1);
        // Index 6 is just past the first newline.
        assert_eq!(scanner.count_lines(6), 2);
        // Index 12 is just past the second newline.
        assert_eq!(scanner.count_lines(12), 3);
    }

    #[test]
    fn test_find_line_end() {
        let input = "hello world\nmore text";
        let scanner = FastScanner::new(input.as_bytes());
        assert_eq!(scanner.find_line_end(0), Some(11));
    }

    #[test]
    fn test_empty_scanner() {
        let scanner = FastScanner::new(&[]);
        assert!(scanner.is_empty());
        assert_eq!(scanner.len(), 0);
        assert_eq!(scanner.skip_whitespace(0), 0);
        assert_eq!(scanner.find_byte(b'x', 0), None);
    }
}