use std::collections::HashMap;
use regex_automata::dfa::dense;
use regex_automata::dfa::sparse::DFA;
use plotnik_bytecode::StringId;
use super::EmitError;
/// One compiled regex stored by `RegexTableBuilder`.
#[derive(Debug)]
struct RegexEntry {
// The `StringId` that was passed to `intern` together with the pattern
// (presumably the pattern's id in the string table; confirm against callers).
string_id: StringId,
// The sparse DFA for the pattern, serialized in little-endian byte order.
dfa_bytes: Vec<u8>,
}
/// Collects compiled regexes and hands out a `u16` ID for each one.
///
/// Slot 0 of `entries` is a `None` placeholder, so IDs returned for real
/// patterns start at 1. Both `new` and `Default::default` establish that
/// placeholder.
#[derive(Debug)]
pub struct RegexTableBuilder {
    /// Maps a `StringId` to the regex ID already assigned to it, so the same
    /// id is never compiled twice.
    lookup: HashMap<StringId, u16>,
    /// Entries indexed by regex ID; `None` marks the placeholder slot.
    entries: Vec<Option<RegexEntry>>,
}

impl Default for RegexTableBuilder {
    /// Builds the same state as `new`.
    ///
    /// A derived `Default` would leave `entries` empty, which would let the
    /// first interned pattern take ID 0 and make `is_empty` report `true`
    /// for a builder that already holds one regex.
    fn default() -> Self {
        Self {
            lookup: HashMap::new(),
            entries: vec![None],
        }
    }
}
impl RegexTableBuilder {
    /// Creates a builder with slot 0 pre-filled by a `None` placeholder.
    ///
    /// Because that slot is taken up front, the first pattern interned into
    /// a builder created here receives ID 1.
    pub fn new() -> Self {
        Self {
            lookup: HashMap::new(),
            entries: vec![None],
        }
    }

    /// Returns the regex ID for `string_id`, compiling `pattern` on first use.
    ///
    /// Deduplication is keyed on `string_id` only: if the id has been seen
    /// before, the previously assigned ID is returned and `pattern` is not
    /// looked at.
    ///
    /// # Errors
    ///
    /// * `EmitError::RegexCompile` if the dense DFA cannot be built from
    ///   `pattern` or cannot be converted to a sparse DFA.
    /// * `EmitError::TooManyRegexes` if the next ID would be `u16::MAX` or
    ///   would not fit in a `u16`.
    pub fn intern(&mut self, pattern: &str, string_id: StringId) -> Result<u16, EmitError> {
        if let Some(&id) = self.lookup.get(&string_id) {
            return Ok(id);
        }

        // Build an unanchored, minimized dense DFA first; the sparse form is
        // derived from it and is what gets serialized.
        let dense = dense::DFA::builder()
            .configure(
                dense::DFA::config()
                    .start_kind(regex_automata::dfa::StartKind::Unanchored)
                    .minimize(true),
            )
            .build(pattern)
            .map_err(|e| EmitError::RegexCompile(pattern.to_string(), e.to_string()))?;
        let sparse = dense
            .to_sparse()
            .map_err(|e| EmitError::RegexCompile(pattern.to_string(), e.to_string()))?;
        let dfa_bytes = sparse.to_bytes_little_endian();

        // Checked conversion instead of a truncating `as` cast. `u16::MAX`
        // itself is refused, matching the limit enforced by `validate`.
        let id = match u16::try_from(self.entries.len()) {
            Ok(id) if id != u16::MAX => id,
            _ => return Err(EmitError::TooManyRegexes(self.entries.len())),
        };

        self.entries.push(Some(RegexEntry {
            string_id,
            dfa_bytes,
        }));
        self.lookup.insert(string_id, id);
        Ok(id)
    }

    /// Number of slots in the table, including any `None` placeholder slot.
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Returns `true` when the table holds at most one slot, i.e. nothing
    /// beyond the placeholder that `new` installs.
    pub fn is_empty(&self) -> bool {
        self.entries.len() <= 1
    }

    /// Checks that the number of slots does not exceed `u16::MAX`.
    ///
    /// # Errors
    ///
    /// Returns `EmitError::TooManyRegexes` carrying the slot count otherwise.
    pub fn validate(&self) -> Result<(), EmitError> {
        if self.entries.len() > usize::from(u16::MAX) {
            return Err(EmitError::TooManyRegexes(self.entries.len()));
        }
        Ok(())
    }

    /// Looks up the regex ID previously assigned to `string_id`, if any.
    pub fn get(&self, string_id: StringId) -> Option<u16> {
        self.lookup.get(&string_id).copied()
    }

    /// Serializes the builder into `(blob, table)`.
    ///
    /// `blob` is the concatenation of every entry's DFA bytes, with zero
    /// padding inserted so that each DFA starts at a multiple of 4.
    ///
    /// `table` holds one 8-byte little-endian record per slot, in slot order:
    /// a `u16` string id, a zero `u16`, and the `u32` offset of the DFA within
    /// `blob`. A `None` slot is written as all zeros. One extra record follows
    /// the last slot; its two `u16` fields are zero and its `u32` is the final
    /// length of `blob`.
    ///
    /// Offsets and the final length are narrowed to `u32` with `as`, so a blob
    /// larger than `u32::MAX` bytes would be recorded with wrapped values.
    pub fn emit(&self) -> (Vec<u8>, Vec<u8>) {
        /// Size in bytes of one table record: u16 + u16 + u32.
        const RECORD_SIZE: usize = 8;

        let mut blob = Vec::new();
        // One record per slot plus the trailing record.
        let mut table = Vec::with_capacity((self.entries.len() + 1) * RECORD_SIZE);

        for entry in &self.entries {
            // Pad so the next DFA (if any) begins on a 4-byte boundary.
            let rem = blob.len() % 4;
            if rem != 0 {
                blob.resize(blob.len() + (4 - rem), 0);
            }

            let (string_id, offset) = match entry {
                Some(e) => {
                    // Record where this DFA starts before appending it.
                    let start = blob.len() as u32;
                    blob.extend_from_slice(&e.dfa_bytes);
                    (e.string_id.get(), start)
                }
                None => (0, 0),
            };

            table.extend_from_slice(&string_id.to_le_bytes());
            table.extend_from_slice(&0u16.to_le_bytes());
            table.extend_from_slice(&offset.to_le_bytes());
        }

        // Trailing record: zero id fields and the total blob length.
        table.extend_from_slice(&0u16.to_le_bytes());
        table.extend_from_slice(&0u16.to_le_bytes());
        table.extend_from_slice(&(blob.len() as u32).to_le_bytes());

        (blob, table)
    }
}
/// Reconstructs a sparse DFA that borrows from `bytes`.
///
/// Only the DFA itself is returned; the second value produced by
/// `DFA::from_bytes` is discarded.
///
/// # Errors
///
/// Returns the deserialization error rendered as a `String` when
/// `DFA::from_bytes` rejects the input.
pub fn deserialize_dfa(bytes: &[u8]) -> Result<DFA<&[u8]>, String> {
    match DFA::from_bytes(bytes) {
        Ok((dfa, _rest)) => Ok(dfa),
        Err(err) => Err(err.to_string()),
    }
}