use std::sync::Arc;
use log::{debug, trace};
use crate::{Match, Result, ScannerMode, ScannerModeSwitcher};
use super::{
compiled_scanner_mode::CompiledScannerMode, CharClassID, CharacterClassRegistry, TerminalIDBase,
};
/// The scanner implementation: a set of compiled scanner modes plus the index of the mode that
/// is currently active.
///
/// The character class registry and the character class matcher are held behind `Arc`s, so
/// cloning a `ScannerImpl` shares them instead of copying them.
#[derive(Clone)]
pub(crate) struct ScannerImpl {
/// Registry of the character classes used by the compiled scanner modes.
pub(crate) character_classes: Arc<CharacterClassRegistry>,
/// The compiled scanner modes; `current_mode` is an index into this vector.
pub(crate) scanner_modes: Vec<CompiledScannerMode>,
/// Predicate deciding whether a character belongs to the character class with the given id.
/// It is handed to the DFA of the current mode when searching for a match.
pub(crate) match_char_class: Arc<dyn (Fn(CharClassID, char) -> bool) + 'static + Send + Sync>,
/// Index of the currently active scanner mode. Starts at 0 and is changed by mode
/// transitions, `set_mode` and `reset`.
current_mode: usize,
}
impl ScannerImpl {
/// Switches the active scanner mode if the current mode defines a transition for the token
/// type of `current_match`; otherwise the mode is left untouched.
#[inline]
fn execute_possible_mode_switch(&mut self, current_match: &Match) {
    // Look up the transition first; the result is a plain index, so no borrow of
    // `self.scanner_modes` is held while `self.current_mode` is updated.
    let transition =
        self.scanner_modes[self.current_mode].has_transition(current_match.token_type());
    if let Some(target_mode) = transition {
        trace!(
            "Switching from mode {} to mode {}",
            self.current_mode,
            target_mode
        );
        self.current_mode = target_mode;
    }
}
/// Creates a fresh, boxed character class matcher from the character class registry.
///
/// The returned closure takes a character class id and a character and returns a `bool`.
/// Any error reported by the registry is passed through unchanged.
// Only the unit tests in this file call this method, hence the `dead_code` allowance.
#[allow(dead_code)]
pub(crate) fn create_match_char_class(
&self,
) -> Result<Box<dyn (Fn(CharClassID, char) -> bool) + 'static + Send + Sync>> {
self.character_classes.create_match_char_class()
}
/// Resets the scanner to mode 0, the mode a freshly constructed scanner starts in.
pub(crate) fn reset(&mut self) {
self.current_mode = 0;
}
/// Searches for the next match in the current scanner mode and, if one is found, performs the
/// mode switch associated with the matched token type (if any).
///
/// `input` and `char_indices` are forwarded unchanged to [`Self::peek_from`].
/// Returns `None` when no match is found; the mode is not changed in that case.
pub(crate) fn find_from(
    &mut self,
    input: &str,
    char_indices: std::str::CharIndices,
) -> Option<crate::Match> {
    let found = self.peek_from(input, char_indices)?;
    self.execute_possible_mode_switch(&found);
    Some(found)
}
/// Searches for the next match using the DFA of the current scanner mode, without switching
/// the scanner mode.
///
/// Returns `None` when the DFA reports no match.
///
/// # Panics
///
/// In builds with debug assertions enabled, panics if the match is empty, because an empty
/// token would make the caller loop forever.
pub(crate) fn peek_from(
    &mut self,
    input: &str,
    char_indices: std::str::CharIndices,
) -> Option<crate::Match> {
    let active_dfa = &mut self.scanner_modes[self.current_mode].dfa;
    let candidate = active_dfa.find_from(input, char_indices, &*self.match_char_class)?;
    debug_assert!(
        !candidate.is_empty(),
        r#"
An empty token was matched. This leads to an infinite loop.
It is therefore necessary to avoid regexes that can match empty tokens.
Please, check regex '{}' for token type {} in scanner mode {}"#,
        active_dfa
            .pattern((candidate.token_type() as TerminalIDBase).into())
            .escape_default(),
        candidate.token_type(),
        self.current_mode
    );
    Some(candidate)
}
/// Asks the current scanner mode whether it defines a transition for `token_type`.
///
/// Returns the index of the target scanner mode, or `None` if there is no transition.
/// Indexing panics if `current_mode` is not a valid index into `scanner_modes`.
pub(crate) fn has_transition(&self, token_type: usize) -> Option<usize> {
self.scanner_modes[self.current_mode].has_transition(token_type)
}
/// Logs the compiled DFA of every scanner mode in dot format at debug level.
///
/// # Errors
///
/// Returns an error if the rendered output cannot be read back as a UTF-8 string.
pub(crate) fn log_compiled_automata_as_dot(&self) -> crate::Result<()> {
    use std::io::Read;
    for (mode_index, mode) in self.scanner_modes.iter().enumerate() {
        // The dot text is produced inside the macro argument, exactly where it is consumed.
        debug!("Compiled DFA: Mode {} \n{}", mode_index, {
            let graph_title = format!("Compiled DFA {}", mode.name);
            let mut buffer = std::io::Cursor::new(Vec::new());
            super::dot::compiled_dfa_render(
                &mode.dfa,
                &graph_title,
                &self.character_classes,
                &mut buffer,
            );
            // Rewind so that the bytes just written can be read back from the start.
            buffer.set_position(0);
            let mut rendered = String::new();
            buffer.read_to_string(&mut rendered)?;
            rendered
        });
    }
    Ok(())
}
/// Writes the compiled DFA of every scanner mode as a dot file into `target_folder`.
///
/// One file named `<prefix>_<mode name>.dot` is created per scanner mode. Existing files with
/// the same name are overwritten.
///
/// The file path is built with [`std::path::Path::join`], so `target_folder` does not have to
/// be valid UTF-8 and the platform's path separator is used.
///
/// # Errors
///
/// Returns an error if a file cannot be created or if flushing the written data fails.
pub(crate) fn generate_compiled_automata_as_dot(
    &self,
    prefix: &str,
    target_folder: &std::path::Path,
) -> crate::Result<()> {
    use std::fs::File;
    use std::io::{BufWriter, Write};
    for scanner_mode in self.scanner_modes.iter() {
        let title = format!("Compiled DFA {}", scanner_mode.name);
        let file_path = target_folder.join(format!("{}_{}.dot", prefix, scanner_mode.name));
        // Buffer the output: the renderer may issue many small writes.
        let mut writer = BufWriter::new(File::create(file_path)?);
        super::dot::compiled_dfa_render(
            &scanner_mode.dfa,
            &title,
            &self.character_classes,
            &mut writer,
        );
        // Flush explicitly; dropping a `BufWriter` would silently discard a flush error.
        writer.flush()?;
    }
    Ok(())
}
}
/// Gives callers read and write access to the scanner's current mode.
impl ScannerModeSwitcher for ScannerImpl {
    /// Returns the name of the scanner mode at `index`, or `None` if `index` is out of range.
    fn mode_name(&self, index: usize) -> Option<&str> {
        let mode = self.scanner_modes.get(index)?;
        Some(mode.name.as_str())
    }

    /// Returns the index of the currently active scanner mode.
    #[inline]
    fn current_mode(&self) -> usize {
        self.current_mode
    }

    /// Makes `mode` the active scanner mode.
    ///
    /// The index is stored as given, without a range check; an out-of-range value makes the
    /// next indexed access to the scanner modes panic.
    #[inline]
    fn set_mode(&mut self, mode: usize) {
        self.current_mode = mode;
    }
}
/// Hand-written `Debug`: the `match_char_class` closure cannot be formatted, so only the
/// character classes and the scanner modes are printed.
impl std::fmt::Debug for ScannerImpl {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("ScannerImpl");
        builder.field("character_classes", &self.character_classes);
        builder.field("scanner_modes", &self.scanner_modes);
        builder.finish()
    }
}
impl TryFrom<Vec<ScannerMode>> for ScannerImpl {
type Error = crate::ScnrError;
fn try_from(scanner_modes: Vec<ScannerMode>) -> Result<Self> {
let mut character_class_registry = CharacterClassRegistry::new();
let mut compiled_scanner_modes = Vec::with_capacity(scanner_modes.len());
for scanner_mode in scanner_modes {
let compiled_scanner_mode = CompiledScannerMode::try_from_scanner_mode(
scanner_mode,
&mut character_class_registry,
)?;
compiled_scanner_modes.push(compiled_scanner_mode);
}
let match_char_class = Arc::new(character_class_registry.create_match_char_class()?);
Ok(Self {
character_classes: Arc::new(character_class_registry),
scanner_modes: compiled_scanner_modes,
match_char_class,
current_mode: 0,
})
}
}
/// Builds a scanner from a borrowed slice of scanner modes.
impl TryFrom<&[ScannerMode]> for ScannerImpl {
    type Error = crate::ScnrError;

    /// Clones the scanner modes and delegates to the `TryFrom<Vec<ScannerMode>>` conversion,
    /// so that both conversions share a single construction path.
    ///
    /// # Errors
    ///
    /// Returns whatever error the `Vec` based conversion reports.
    fn try_from(scanner_modes: &[ScannerMode]) -> Result<Self> {
        <Self as TryFrom<Vec<ScannerMode>>>::try_from(scanner_modes.to_vec())
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{Pattern, ScannerMode};
use std::{convert::TryInto, fs, path::Path, sync::Once};
// Guards the one-time test setup performed by `init`.
static INIT: Once = Once::new();
// Output folder for the files generated by the tests in this module.
const TARGET_FOLDER: &str = concat!(
env!("CARGO_MANIFEST_DIR"),
"/target/testout/scanner_nfa_impl_test"
);
/// One-time test setup: initializes the logger and recreates an empty `TARGET_FOLDER`.
///
/// Every test calls this; the `Once` guard makes sure the setup runs only once.
fn init() {
INIT.call_once(|| {
// Results are ignored on purpose: the logger may already be initialized and the
// folder may not exist yet.
let _ = env_logger::builder().is_test(true).try_init();
let _ = fs::remove_dir_all(TARGET_FOLDER);
fs::create_dir_all(TARGET_FOLDER).unwrap();
});
}
/// A scanner built from two scanner modes contains two compiled scanner modes.
#[test]
fn test_try_from() {
    init();
    let modes = vec![
        ScannerMode::new("mode1", vec![Pattern::new("a".to_string(), 0)], vec![]),
        ScannerMode::new("mode2", vec![Pattern::new("b".to_string(), 1)], vec![]),
    ];
    let scanner = ScannerImpl::try_from(modes).unwrap();
    assert_eq!(scanner.scanner_modes.len(), 2);
}
/// The character class matcher accepts exactly the character of each class: class 0 matches
/// only 'a' and class 1 matches only 'b'.
#[test]
fn test_match_char_class() {
    init();
    let modes = vec![
        ScannerMode::new("mode1", vec![Pattern::new("a".to_string(), 0)], vec![]),
        ScannerMode::new("mode2", vec![Pattern::new("b".to_string(), 1)], vec![]),
    ];
    let scanner: ScannerImpl = modes.try_into().unwrap();
    let match_char_class = scanner.create_match_char_class().unwrap();
    // (character class id, character, expected result)
    let expectations = [
        (0, 'a', true),
        (0, 'b', false),
        (0, 'c', false),
        (1, 'a', false),
        (1, 'b', true),
        (1, 'c', false),
    ];
    for (class_id, symbol, expected) in expectations {
        assert_eq!(match_char_class(class_id.into(), symbol), expected);
    }
}
/// Generating dot files for the scanner modes read from `tests/data/string.json` yields
/// exactly two `.dot` files in `TARGET_FOLDER`, one for the INITIAL mode and one for the
/// STRING mode.
#[test]
fn test_generate_dot_files() {
    init();
    let path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/string.json");
    let file = fs::File::open(path).unwrap();
    let scanner_modes: Vec<ScannerMode> = serde_json::from_reader(file)
        .unwrap_or_else(|e| panic!("**** Failed to read json file {path}: {e}"));
    // The modes are not needed afterwards, so they are moved instead of cloned.
    let scanner_impl: ScannerImpl = scanner_modes.try_into().unwrap();
    scanner_impl
        .generate_compiled_automata_as_dot("String", Path::new(TARGET_FOLDER))
        .unwrap();

    let dot_files: Vec<_> = fs::read_dir(TARGET_FOLDER)
        .unwrap()
        .map(|entry| entry.unwrap().path())
        .collect();
    assert_eq!(dot_files.len(), 2);
    assert!(dot_files.iter().all(|p| p.extension().unwrap() == "dot"));

    // Exactly one file per scanner mode name.
    for mode_name in ["INITIAL", "STRING"] {
        let hits = dot_files
            .iter()
            .filter(|p| p.file_stem().unwrap().to_str().unwrap().contains(mode_name))
            .count();
        assert_eq!(hits, 1, "expected exactly one dot file for mode {mode_name}");
    }
}
}