plotnik_compiler/emit/
regex_table.rs

1//! Regex table builder for bytecode emission.
2//!
3//! Compiles regex patterns to sparse DFAs and builds the regex blob/table sections.
4//! Each entry stores both the pattern's StringId (for display) and DFA offset (for matching).
5
6use std::collections::HashMap;
7
8use regex_automata::dfa::dense;
9use regex_automata::dfa::sparse::DFA;
10
11use plotnik_bytecode::StringId;
12
13use super::EmitError;
14
15/// Compiled regex entry with pattern reference and serialized DFA.
16#[derive(Debug)]
17struct RegexEntry {
18    /// StringId of the pattern (for display in dump/trace).
19    string_id: StringId,
20    /// Serialized sparse DFA bytes.
21    dfa_bytes: Vec<u8>,
22}
23
24/// Builds the regex table, compiling patterns to sparse DFAs.
25///
26/// Each regex is compiled to a sparse DFA and serialized. The table stores
27/// both StringId (for pattern display) and offset into the blob (for DFA access).
28///
29/// Index 0 is unused (regex_id 0 means "no regex").
30#[derive(Debug, Default)]
31pub struct RegexTableBuilder {
32    /// Map from StringId to regex ID (for deduplication).
33    lookup: HashMap<StringId, u16>,
34    /// Compiled regex entries (index 0 is unused).
35    entries: Vec<Option<RegexEntry>>,
36}
37
38impl RegexTableBuilder {
39    pub fn new() -> Self {
40        Self {
41            lookup: HashMap::new(),
42            entries: vec![None], // index 0 reserved
43        }
44    }
45
46    /// Intern a regex pattern, compiling it to a DFA.
47    ///
48    /// Takes the pattern string and its StringId. Returns the regex ID on success.
49    pub fn intern(&mut self, pattern: &str, string_id: StringId) -> Result<u16, EmitError> {
50        if let Some(&id) = self.lookup.get(&string_id) {
51            return Ok(id);
52        }
53
54        // Compile to dense DFA first, then convert to sparse
55        let dense = dense::DFA::builder()
56            .configure(
57                dense::DFA::config()
58                    .start_kind(regex_automata::dfa::StartKind::Unanchored)
59                    .minimize(true),
60            )
61            .build(pattern)
62            .map_err(|e| EmitError::RegexCompile(pattern.to_string(), e.to_string()))?;
63
64        let sparse = dense
65            .to_sparse()
66            .map_err(|e| EmitError::RegexCompile(pattern.to_string(), e.to_string()))?;
67
68        let dfa_bytes = sparse.to_bytes_little_endian();
69
70        let id = self.entries.len() as u16;
71        if id == u16::MAX {
72            return Err(EmitError::TooManyRegexes(self.entries.len()));
73        }
74
75        self.entries.push(Some(RegexEntry {
76            string_id,
77            dfa_bytes,
78        }));
79        self.lookup.insert(string_id, id);
80        Ok(id)
81    }
82
83    /// Number of regexes (including reserved index 0).
84    pub fn len(&self) -> usize {
85        self.entries.len()
86    }
87
88    /// Whether the builder has any regexes (beyond reserved index 0).
89    pub fn is_empty(&self) -> bool {
90        self.entries.len() <= 1
91    }
92
93    /// Validate that the regex count fits in u16.
94    pub fn validate(&self) -> Result<(), EmitError> {
95        if self.entries.len() > 65535 {
96            return Err(EmitError::TooManyRegexes(self.entries.len()));
97        }
98        Ok(())
99    }
100
101    /// Lookup a regex ID by StringId.
102    pub fn get(&self, string_id: StringId) -> Option<u16> {
103        self.lookup.get(&string_id).copied()
104    }
105
106    /// Emit the regex blob and table.
107    ///
108    /// Returns (blob_bytes, table_bytes).
109    ///
110    /// Table format per entry: `string_id (u16) | reserved (u16) | offset (u32)`
111    /// This allows looking up both the pattern string (via StringTable) and DFA bytes.
112    pub fn emit(&self) -> (Vec<u8>, Vec<u8>) {
113        let mut blob = Vec::new();
114        let mut table = Vec::with_capacity(self.entries.len() * 8 + 4);
115
116        for entry in &self.entries {
117            // Pad blob to 4-byte alignment before each DFA
118            let rem = blob.len() % 4;
119            if rem != 0 {
120                blob.resize(blob.len() + (4 - rem), 0);
121            }
122
123            let (string_id, offset) = if let Some(e) = entry {
124                blob.extend_from_slice(&e.dfa_bytes);
125                (e.string_id.get(), (blob.len() - e.dfa_bytes.len()) as u32)
126            } else {
127                (0, 0)
128            };
129
130            // Emit table entry: string_id (u16) | reserved (u16) | offset (u32)
131            table.extend_from_slice(&string_id.to_le_bytes());
132            table.extend_from_slice(&0u16.to_le_bytes()); // reserved
133            table.extend_from_slice(&offset.to_le_bytes());
134        }
135
136        // Sentinel entry with blob end offset
137        table.extend_from_slice(&0u16.to_le_bytes()); // string_id = 0
138        table.extend_from_slice(&0u16.to_le_bytes()); // reserved
139        table.extend_from_slice(&(blob.len() as u32).to_le_bytes());
140
141        (blob, table)
142    }
143}
144
145/// Deserialize a sparse DFA from bytecode.
146///
147/// # Safety
148/// The bytes must have been produced by `DFA::to_bytes_little_endian()`.
149pub fn deserialize_dfa(bytes: &[u8]) -> Result<DFA<&[u8]>, String> {
150    // SAFETY: We only serialize DFAs we built, and the format is stable
151    // within the same regex-automata version.
152    DFA::from_bytes(bytes)
153        .map(|(dfa, _)| dfa)
154        .map_err(|e| e.to_string())
155}