#![cfg_attr(feature = "strict_docs", allow(missing_docs))]
use std::collections::HashMap;
use std::ffi::c_void;
pub const MAX_EXTERNAL_SCANNER_STATE_LENGTH: usize = 32;
pub trait ExternalScanner: Send + Sync {
fn scan(&mut self, lexer: &mut Lexer, valid_symbols: &[bool]) -> bool;
fn serialize(&self, _buffer: &mut [u8]) -> usize {
0 }
fn deserialize(&mut self, _buffer: &[u8]) {
}
}
pub struct Lexer<'a> {
input: &'a [u8],
position: usize,
token_end: usize,
result_symbol: u16,
}
impl<'a> Lexer<'a> {
pub fn new(input: &'a [u8], position: usize) -> Self {
Lexer {
input,
position,
token_end: position,
result_symbol: 0,
}
}
pub fn advance(&mut self, skip: bool) -> Option<u8> {
if self.position < self.input.len() {
let ch = self.input[self.position];
self.position += 1;
if !skip {
self.token_end = self.position;
}
Some(ch)
} else {
None
}
}
pub fn skip_whitespace(&mut self) {
while self.position < self.input.len() {
match self.input[self.position] {
b' ' | b'\t' | b'\r' | b'\n' => {
self.position += 1;
}
_ => break,
}
}
self.token_end = self.position;
}
pub fn mark_end(&mut self) {
self.token_end = self.position;
}
pub fn get_column(&self) -> usize {
let mut column = 0;
for i in (0..self.position).rev() {
if self.input[i] == b'\n' {
break;
}
column += 1;
}
column
}
pub fn eof(&self) -> bool {
self.position >= self.input.len()
}
pub fn lookahead(&self) -> Option<u8> {
if self.position < self.input.len() {
Some(self.input[self.position])
} else {
None
}
}
pub fn result(&mut self, symbol: u16) {
self.result_symbol = symbol;
}
pub fn token_length(&self) -> usize {
self.token_end - (self.position - self.token_end)
}
}
pub struct ExternalScannerRegistry {
scanners: HashMap<String, Box<dyn ExternalScanner>>,
}
impl Default for ExternalScannerRegistry {
fn default() -> Self {
Self::new()
}
}
impl ExternalScannerRegistry {
pub fn new() -> Self {
ExternalScannerRegistry {
scanners: HashMap::new(),
}
}
pub fn register(&mut self, name: String, scanner: Box<dyn ExternalScanner>) {
self.scanners.insert(name, scanner);
}
pub fn get(&self, name: &str) -> Option<&dyn ExternalScanner> {
self.scanners.get(name).map(|s| s.as_ref())
}
pub fn get_mut(&mut self, name: &str) -> Option<&mut (dyn ExternalScanner + 'static)> {
self.scanners.get_mut(name).map(|s| s.as_mut())
}
}
pub mod ffi {
use super::*;
use std::slice;
pub unsafe extern "C" fn external_scanner_create() -> *mut c_void {
let registry = Box::new(ExternalScannerRegistry::new());
Box::into_raw(registry) as *mut c_void
}
pub unsafe extern "C" fn external_scanner_destroy(scanner: *mut c_void) {
if !scanner.is_null() {
unsafe {
let _ = Box::from_raw(scanner as *mut ExternalScannerRegistry);
}
}
}
pub unsafe extern "C" fn external_scanner_scan(
scanner: *mut c_void,
lexer: *mut c_void,
valid_symbols: *const bool,
valid_symbol_count: u32,
) -> bool {
if scanner.is_null() || lexer.is_null() || valid_symbols.is_null() {
return false;
}
let _registry = unsafe { &mut *(scanner as *mut ExternalScannerRegistry) };
let _valid_symbols =
unsafe { slice::from_raw_parts(valid_symbols, valid_symbol_count as usize) };
false
}
pub unsafe extern "C" fn external_scanner_serialize(
scanner: *mut c_void,
buffer: *mut u8,
buffer_size: u32,
) -> u32 {
if scanner.is_null() || buffer.is_null() {
return 0;
}
let _registry = unsafe { &*(scanner as *mut ExternalScannerRegistry) };
let _buffer = unsafe { slice::from_raw_parts_mut(buffer, buffer_size as usize) };
0
}
pub unsafe extern "C" fn external_scanner_deserialize(
scanner: *mut c_void,
buffer: *const u8,
length: u32,
) {
if scanner.is_null() || buffer.is_null() {
return;
}
let _registry = unsafe { &mut *(scanner as *mut ExternalScannerRegistry) };
let _buffer = unsafe { slice::from_raw_parts(buffer, length as usize) };
}
}
pub struct StringScanner {
delimiter: Option<String>,
}
impl Default for StringScanner {
fn default() -> Self {
Self::new()
}
}
impl StringScanner {
pub fn new() -> Self {
StringScanner { delimiter: None }
}
}
impl ExternalScanner for StringScanner {
fn scan(&mut self, lexer: &mut Lexer, valid_symbols: &[bool]) -> bool {
const STRING_START: usize = 0;
const STRING_CONTENT: usize = 1;
const STRING_END: usize = 2;
if valid_symbols.get(STRING_START).copied().unwrap_or(false) {
if lexer.lookahead() == Some(b'"') {
lexer.advance(false);
if lexer.lookahead() == Some(b'"') {
lexer.advance(false);
if lexer.lookahead() == Some(b'"') {
lexer.advance(false);
lexer.mark_end();
lexer.result(STRING_START as u16);
self.delimiter = Some("\"\"\"".to_string());
return true;
}
}
}
}
if valid_symbols.get(STRING_END).copied().unwrap_or(false)
&& let Some(delim) = &self.delimiter
{
let delim_bytes = delim.as_bytes();
let mut matched = true;
for &b in delim_bytes {
if lexer.lookahead() != Some(b) {
matched = false;
break;
}
lexer.advance(false);
}
if matched {
lexer.mark_end();
lexer.result(STRING_END as u16);
self.delimiter = None;
return true;
}
}
if valid_symbols.get(STRING_CONTENT).copied().unwrap_or(false) && self.delimiter.is_some() {
while !lexer.eof() {
if lexer.lookahead() == Some(b'"') {
break;
}
lexer.advance(false);
}
if lexer.token_length() > 0 {
lexer.result(STRING_CONTENT as u16);
return true;
}
}
false
}
fn serialize(&self, buffer: &mut [u8]) -> usize {
if let Some(delim) = &self.delimiter {
let bytes = delim.as_bytes();
let len = bytes.len().min(buffer.len());
buffer[..len].copy_from_slice(&bytes[..len]);
len
} else {
0
}
}
fn deserialize(&mut self, buffer: &[u8]) {
if !buffer.is_empty() {
self.delimiter = Some(String::from_utf8_lossy(buffer).to_string());
} else {
self.delimiter = None;
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_lexer_advance() {
let input = b"hello world";
let mut lexer = Lexer::new(input, 0);
assert_eq!(lexer.advance(false), Some(b'h'));
assert_eq!(lexer.advance(false), Some(b'e'));
assert_eq!(lexer.position, 2);
assert_eq!(lexer.token_end, 2);
}
#[test]
fn test_lexer_skip() {
let input = b" \t\nhello";
let mut lexer = Lexer::new(input, 0);
lexer.skip_whitespace();
assert_eq!(lexer.position, 4);
assert_eq!(lexer.lookahead(), Some(b'h'));
}
#[test]
fn test_string_scanner() {
let mut scanner = StringScanner::new();
let input = b"\"\"\"hello world\"\"\"";
let mut lexer = Lexer::new(input, 0);
let valid_symbols = vec![true, false, false]; assert!(scanner.scan(&mut lexer, &valid_symbols));
assert_eq!(lexer.result_symbol, 0);
let mut buffer = vec![0u8; 32];
let len = scanner.serialize(&mut buffer);
assert_eq!(&buffer[..len], b"\"\"\"");
}
}