lexigram_lib/lexergen/
mod.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3pub(crate) mod tests;
4
5use std::collections::{BTreeMap, BTreeSet, HashMap};
6use std::fmt::Display;
7use std::fs::File;
8use std::io::{BufWriter, Read, Write};
9use iter_index::IndexerIterator;
10use lexigram_core::CollectJoin;
11#[cfg(test)]
12use crate::dfa::print_graph;
13use crate::{indent_source, Normalized, SymbolTable, TokenId};
14use crate::char_reader::escape_char;
15use crate::lexer::{ActionOption, Lexer, StateId, Terminal};
16use lexigram_core::log::{BufLog, LogReader, LogStatus, Logger};
17use crate::build::{BuildError, BuildErrorSource, BuildFrom, HasBuildErrorSource, TryBuildFrom};
18use crate::segments::Segments;
19use crate::segmap::{char_to_group, GroupId, Seg, SegMap};
20use super::dfa::*;
21
22// ---------------------------------------------------------------------------------------------
23
24/// Tables and parameters used to create a Lexer. This type is used as a return object from the lexer generator,
25/// when the Lexer must be created dynamically; for example, in tests or in situations where the lexicon isn't
26/// known in advance. In those situations, the LexerTables object must live as long as the lexer.
27///
28/// The Lexer itself only uses references to tables whenever possible because, in most situations, the tables are
29/// static in generated source files. Only the dictionaries must be created dynamically from (possibly) static
30/// tables because they don't exist in static form (yet).
pub struct LexerTables {
    // parameters
    nbr_groups: u32,            // number of symbol groups; the error group id equals nbr_groups
    initial_state: StateId,
    first_end_state: StateId,   // accepting when state >= first_end_state
    nbr_states: StateId,        // error if state >= nbr_states
    // tables
    ascii_to_group: Vec<GroupId>,           // direct group lookup for code points < 128
    utf8_to_group: HashMap<char, GroupId>,  // dictionary of individual non-ASCII chars
    seg_to_group: SegMap<GroupId>,          // remaining non-ASCII code-point ranges
    state_table: Vec<StateId>,              // next = state_table[state * nbr_groups + group]
    terminal_table: Vec<Terminal>,  // token(state) = token_table[state - first_end_state]
}
44
45impl LexerTables {
46    pub fn new(
47        // parameters
48        nbr_groups: u32,
49        initial_state: StateId,
50        first_end_state: StateId,   // accepting when state >= first_end_state
51        nbr_states: StateId,        // error if state >= nbr_states
52        // tables
53        ascii_to_group: Vec<GroupId>,
54        utf8_to_group: HashMap<char, GroupId>,
55        seg_to_group: SegMap<GroupId>,
56        state_table: Vec<StateId>,
57        terminal_table: Vec<Terminal>,  // token(state) = token_table[state - first_end_state]
58    ) -> Self {
59        LexerTables {
60            nbr_groups,
61            initial_state,
62            first_end_state,
63            nbr_states,
64            ascii_to_group,
65            utf8_to_group,
66            seg_to_group,
67            state_table,
68            terminal_table,
69        }
70    }
71
72    pub fn make_lexer<R: Read>(&self) -> Lexer<'_, R> {
73        Lexer::new(
74            self.nbr_groups,
75            self.initial_state,
76            self.first_end_state,
77            self.nbr_states,
78            self.ascii_to_group.as_slice(),
79            self.utf8_to_group.clone(),
80            self.seg_to_group.clone(),
81            self.state_table.as_slice(),
82            self.terminal_table.as_slice(),
83        )
84    }
85}
86
87impl BuildFrom<LexerGen> for LexerTables {
88    fn build_from(lexer_gen: LexerGen) -> LexerTables {
89        assert!(!lexer_gen.state_table.is_empty(), "tables are not built");
90        LexerTables::new(
91            lexer_gen.nbr_groups,
92            lexer_gen.initial_state,
93            lexer_gen.first_end_state,
94            lexer_gen.nbr_states,
95            lexer_gen.ascii_to_group,
96            lexer_gen.utf8_to_group,
97            lexer_gen.seg_to_group,
98            lexer_gen.state_table,
99            lexer_gen.terminal_table,
100        )
101    }
102}
103
104// not generated automatically since LexerTables isn't LogReader
105impl TryBuildFrom<LexerGen> for LexerTables {
106    type Error = BuildError;
107
108    fn try_build_from(source: LexerGen) -> Result<Self, Self::Error> {
109        if source.get_log().has_no_errors() {
110            Ok(LexerTables::build_from(source))
111        } else {
112            Err(BuildError::new(source.give_log(), BuildErrorSource::LexerGen))
113        }
114    }
115}
116
117// ---------------------------------------------------------------------------------------------
118
/// Name of the crate used in the generated code
#[derive(Clone, Default, PartialEq, Debug)]
pub enum LexigramCrate {
    /// [LexigramCrate] is [lexigram_core] (default)
    #[default]
    Core,
    /// [LexigramCrate] is [lexigram_lib](crate)
    Full,
    /// [LexigramCrate] is a custom name; e.g. if the user renames it in `use lexigram_core as core`.
    Custom(String),
}
130
131impl Display for LexigramCrate {
132    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
133        write!(f, "{}", match self {
134            LexigramCrate::Core => "lexigram_core",
135            LexigramCrate::Full => "lexigram_lib",
136            LexigramCrate::Custom(s) => s.as_str(),
137        })
138    }
139}
140
/// Lexer generator: converts a normalized DFA into the lookup tables of a [Lexer](crate::lexer::Lexer),
/// and can emit those tables as Rust source code.
pub struct LexerGen {
    // parameters:
    pub max_utf8_chars: u32,        // budget of individual non-ASCII entries stored in `utf8_to_group`
    pub nbr_groups: u32,
    pub initial_state: StateId,
    pub first_end_state: StateId,   // accepting when state >= first_end_state
    pub nbr_states: StateId,        // error if state >= nbr_states
    // tables:
    pub ascii_to_group: Vec<GroupId>,
    pub utf8_to_group: HashMap<char, GroupId>,
    pub seg_to_group: SegMap<GroupId>,
    pub state_table: Vec<StateId>,
    pub terminal_table: Vec<Terminal>,  // token(state) = token_table[state - first_end_state]
    pub symbol_table: Option<SymbolTable>,
    // internal
    log: BufLog,                 // build log, transferred from the DFA and extended here
    group_partition: Segments,   // for optimization
    headers: Vec<String>,        // extra lines prepended to the generated source
    lib_crate: LexigramCrate,    // crate name used in the generated `use` statements
}
161
162impl LexerGen {
163    pub const DEFAULT_UTF8_TABLE_SIZE: u32 = 128;
164
165    fn new() -> Self {
166        LexerGen {
167            max_utf8_chars: Self::DEFAULT_UTF8_TABLE_SIZE,
168            nbr_groups: 0,
169            initial_state: 0,
170            first_end_state: 0,
171            nbr_states: 0,
172            ascii_to_group: vec![GroupId::MAX; 128],
173            utf8_to_group: HashMap::default(),
174            seg_to_group: SegMap::new(),
175            state_table: Vec::new(),
176            terminal_table: Vec::new(),
177            symbol_table: None,
178            log: BufLog::new(),
179            group_partition: Segments::empty(),
180            headers: Vec::new(),
181            lib_crate: LexigramCrate::Core,
182        }
183    }
184
    /// Appends one header line to be emitted at the top of the generated source.
    #[inline]
    pub fn add_header<T: Into<String>>(&mut self, header: T) {
        self.headers.push(header.into());
    }
189
190    #[inline]
191    pub fn extend_headers<I: IntoIterator<Item=T>, T: Into<String>>(&mut self, headers: I) {
192        self.headers.extend(headers.into_iter().map(|s| s.into()));
193    }
194
    /// Selects [LexigramCrate::Full] when `use_full_lib` is true, [LexigramCrate::Core] otherwise,
    /// as the crate name used in the generated `use` statements.
    #[inline]
    pub fn use_full_lib(&mut self, use_full_lib: bool) {
        self.lib_crate = if use_full_lib { LexigramCrate::Full } else { LexigramCrate::Core };
    }
199
    /// Sets the crate name used in the generated `use` statements.
    #[inline]
    pub fn set_crate(&mut self, lcrate: LexigramCrate) {
        self.lib_crate = lcrate;
    }
204
205    pub fn build_from_dfa(dfa: Dfa<Normalized>, max_utf8_chars: u32) -> Self {
206        let mut lexergen = Self::new();
207        lexergen.max_utf8_chars = max_utf8_chars;
208        lexergen.make_from_dfa(dfa);
209        lexergen
210    }
211
212    fn make_from_dfa(&mut self, mut dfa: Dfa<Normalized>) {
213        self.log.extend(std::mem::replace(&mut dfa.log, BufLog::new()));
214        self.log.add_note("creating lexer from DFA...");
215        self.create_input_tables(&dfa);
216        self.create_state_tables(&dfa);
217    }
218
    /// Builds the input-classification tables, mapping every input character to a group id:
    /// `ascii_to_group` for code points < 128, `utf8_to_group` for individual non-ASCII chars
    /// (up to `max_utf8_chars` entries), and `seg_to_group` for the remaining non-ASCII ranges.
    /// Characters covered by no table map to the error group id, which equals `nbr_groups`.
    fn create_input_tables(&mut self, dfa: &Dfa<Normalized>) {
        /// Max continuous segment size translated to individual UTF8 entries. This prevents large segments
        /// like DOT from cluttering the dictionary.
        const MAX_UTF8_SEG_RANGE: u32 = 16;
        const VERBOSE: bool = false;
        // partition the alphabet so that all chars of a group behave identically in the DFA:
        let symbol_part = partition_symbols(dfa.get_state_graph());
        // invert the partition: map each segment to the id of the group it belongs to
        let symbol_to_group = SegMap::from_iter(
            symbol_part.iter().index().flat_map(|(id, i)| i.iter().map(move |ab| (*ab, id)))
        );
        self.group_partition = Segments::from_iter(symbol_to_group.keys().cloned());

        if VERBOSE {
            println!("symbol partition:{}\ntables:", symbol_to_group.iter()
                .map(|(seg, g)| format!("\n- {seg} -> {g}")).collect::<String>());
        }
        self.nbr_groups = symbol_part.len() as GroupId;
        // one past the last valid group id:
        let error_id = self.nbr_groups as GroupId;
        self.ascii_to_group.fill(error_id);
        self.utf8_to_group.clear();
        self.seg_to_group.clear();
        // remaining budget of individual UTF8 dictionary entries:
        let mut left = self.max_utf8_chars;
        for (seg, group_id) in symbol_to_group {
            if VERBOSE { println!("Seg: {}-{}", seg.0, seg.1); }
            // ASCII part of the segment goes into the direct lookup table:
            if seg.0 < 128 {
                if VERBOSE {
                    println!("- ASCII: {}-{} ({}-{}) => {group_id}",
                             escape_char(char::from_u32(seg.0).unwrap()), escape_char(char::from_u32(seg.1.min(127)).unwrap()),
                             seg.0, seg.1.min(127));
                }
                for b in seg.0..=seg.1.min(127) {
                    self.ascii_to_group[b as usize] = group_id;
                }
            }
            // non-ASCII part: short segments become individual dictionary entries while the
            // budget allows; everything else falls through to a segment-map entry
            if seg.1 >= 128 {
                let mut low = 128.max(seg.0);
                // last code point that still fits in the remaining budget:
                let high = seg.1.min(low + left - 1);
                if seg.1 - low < MAX_UTF8_SEG_RANGE {
                    if left > 0 {
                        for u in low..=high {
                            if VERBOSE { println!("- UTF8: {} ({u}) => {group_id}", escape_char(char::from_u32(u).unwrap())); }
                            self.utf8_to_group.insert(char::from_u32(u).unwrap(), group_id);
                        }
                        // consumed high - low + 1 dictionary entries
                        left -= 1 + high - low;
                    }
                    low = high + 1;
                }
                // whatever was not stored individually is covered by one segment entry:
                if low <= seg.1 {
                    if VERBOSE {
                        // NOTE(review): this debug print shows the char of `high + 1` but the code
                        // point of `low`; the two differ when the dictionary branch above was skipped
                        println!("- SEG: {}-{} ({}-{}) => {group_id}",
                             escape_char(char::from_u32(high + 1).unwrap()), escape_char(char::from_u32(seg.1).unwrap()),
                             low, seg.1);
                    }
                    self.seg_to_group.insert(Seg(low, seg.1), group_id);
                }
            }
        }
        self.log.add_note(format!(
            "- creating input tables: ASCII {} entries, UTF8 {} entries, segments {} entries",
            self.ascii_to_group.len(), self.utf8_to_group.len(), self.seg_to_group.len()));
    }
279
    /// Builds the flat transition table `state_table` (indexed by `state * nbr_groups + group`)
    /// and the `terminal_table` (indexed by `state - first_end_state`), then verifies that the
    /// lexer returns at least one token and that the reserved token id is not used.
    fn create_state_tables(&mut self, dfa: &Dfa<Normalized>) {
        const VERBOSE: bool = false;
        self.initial_state = dfa.get_initial_state().unwrap();
        self.first_end_state = dfa.get_first_end_state().unwrap();
        self.nbr_states = dfa.get_state_graph().len();
        let nbr_states = dfa.get_state_graph().len();
        // we add one extra table index to allow for the 'error group', which equals nbr_group:
        // state_table[nbr_state * nbr_group + nbr_group] must exist; the content will be ignored.
        // every entry defaults to nbr_states, i.e. the error state:
        let mut state_table = vec!(self.nbr_states; self.nbr_groups as usize * nbr_states + 1);
        for (state_from, trans) in dfa.get_state_graph() {
            if VERBOSE { println!("state {state_from}"); }
            for (segments, state_to) in trans {
                if VERBOSE { println!("- {segments} -> state {state_to}"); }
                // slice the transition's segments along the group partition, so that each
                // resulting slice lies entirely within a single group:
                let mut segments_part = segments.clone();
                segments_part.slice_partitions(&self.group_partition);
                for seg in segments_part.iter() {
                    // the first char of a slice identifies the group of the whole slice
                    let symbol = char::from_u32(seg.0).unwrap();
                    let symbol_group = char_to_group(&self.ascii_to_group, &self.utf8_to_group, &self.seg_to_group, symbol).unwrap_or(self.nbr_groups);
                    state_table[self.nbr_groups as usize * state_from + symbol_group as usize] = *state_to;
                }
            }
        }
        self.state_table = state_table;
        // terminals of the accepting states, in state order:
        let terminal_table = dfa.get_end_states().iter()
            .filter_map(|(&st, t)| if st >= self.first_end_state { Some(t.clone()) } else { None })
            .to_vec();
        self.terminal_table = terminal_table;
        // highest token id returned by any terminal, or None when no terminal returns a token:
        let max_token_maybe = self.terminal_table.iter().fold(None, { |acc, t|
            if let Terminal { action: ActionOption::Token(tok), .. } = t {
                Some(acc.unwrap_or(0).max(*tok))
            } else {
                acc
            }
        });
        self.log.add_note(format!(
            "- creating state tables: state table {} entries, terminal table {} entries",
            self.state_table.len(), self.terminal_table.len()));
        match max_token_maybe {
            Some(max_token) => {
                if max_token == TokenId::MAX {
                    self.log.add_error(format!("  the token {} is taken, but it's reserved for illegal characters", TokenId::MAX));
                }
            }
            None => {
                self.log.add_error("  the lexer returns no tokens");
            }
        }
    }
328    
329    pub fn write_source_code(&self, file: Option<File>, indent: usize) -> Result<(), std::io::Error> {
330        let mut out: BufWriter<Box<dyn Write>> = match file {
331            Some(file) => BufWriter::new(Box::new(file)),
332            None => BufWriter::new(Box::new(std::io::stdout().lock()))
333        };
334        let source = self.gen_source_code(indent);
335        out.write(source.as_bytes())?;
336        // write!(out, "{source}");
337        Ok(())
338    }
339
    /// Generates the complete lexer source code as a single string, indented by `indent`.
    pub fn gen_source_code(&self, indent: usize) -> String {
        indent_source(vec![self.lexer_source_code()], indent)
    }
343
344    pub fn try_gen_source_code(self, indent: usize) -> Result<(BufLog, String), BuildError> {
345        let src = self.gen_source_code(indent);
346        if self.log.has_no_errors() {
347            Ok((self.give_log(), src))
348        } else {
349            Err(BuildError::new(self.give_log(), BuildErrorSource::LexerGen))
350        }
351    }
352
353    fn lexer_source_code(&self) -> Vec<String> {
354        let mut source = self.headers.clone();
355
356        if !self.headers.is_empty() {
357            source.push(String::new());
358        }
359
360        // Create source code:
361        source.push(format!("use std::collections::HashMap;"));
362        source.push(format!("use std::io::Read;"));
363        source.push(format!("use {}::lexer::{{ActionOption, Lexer, ModeOption, StateId, Terminal}};", self.lib_crate));
364        source.push(format!("use {}::segmap::{{GroupId, Seg, SegMap}};", self.lib_crate));
365        source.push(String::new());
366        source.push(format!("const NBR_GROUPS: u32 = {};", self.nbr_groups));
367        source.push(format!("const INITIAL_STATE: StateId = {};", self.initial_state));
368        source.push(format!("const FIRST_END_STATE: StateId = {};", self.first_end_state));
369        source.push(format!("const NBR_STATES: StateId = {};", self.nbr_states));
370        let mut groups = vec![BTreeSet::new(); self.nbr_groups as usize];
371        source.push(format!("static ASCII_TO_GROUP: [GroupId; 128] = ["));
372        for i in 0..8_usize {
373            let mut s = "    ".to_string();
374            for j in 0..16_usize {
375                let ascii = i * 16 + j;
376                let group = self.ascii_to_group[i * 16 + j];
377                s.push_str(&format!("{:3}, ", group));
378                if group < self.nbr_groups {
379                    groups[group as usize].insert(char::from(ascii as u8));
380                }
381            }
382            source.push(format!("{s}  // {}-{}", i*16, i*16 + 15));
383        }
384        source.push(format!("];"));
385        source.push(format!("static UTF8_TO_GROUP: [(char, GroupId); {}] = [", self.utf8_to_group.len()));
386        let mut hashmap_content = self.utf8_to_group.iter().map(|(c, g)| format!("    ('{}', {}),", escape_char(*c), g)).to_vec();
387        hashmap_content.sort(); // the sources must be identical from one run to the next
388        source.extend(hashmap_content);
389        source.push(format!("];"));
390        /*
391        for (c, g) in lexergen.utf8_to_group.iter() {
392            groups[*g as usize].insert(*c);
393        }
394        for (g, chars) in groups.iter().enumerate() {
395            if !chars.is_empty() {
396                let set = chars.iter().map(|c| escape_char(*c)).collect::<String>();
397                source.push(format!("// group[{g:3}] = [{set}]"));
398            };
399        }
400        */
401        source.push(format!("static SEG_TO_GROUP: [(Seg, GroupId); {}] = [", self.seg_to_group.len()));
402        for (s, g) in &self.seg_to_group {
403            source.push(format!("    (Seg({}, {}), {}),", s.0, s.1, g));
404        }
405        source.push(format!("];"));
406        source.push(format!("static TERMINAL_TABLE: [Terminal;{}] = [", self.terminal_table.len()));
407        for t in &self.terminal_table {
408            // Terminal { action: TermAction::Skip, channel: 0, mode: ModeOption::None, mode_state: None, pop: false },
409            source.push(format!("    Terminal {{ action: ActionOption::{:?}, channel: {}, mode: ModeOption::{:?}, mode_state: {:?}, pop: {} }},",
410                                t.action, t.channel, t.mode, t.mode_state, t.pop
411            ));
412        }
413        source.push(format!("];"));
414        source.push(format!("static STATE_TABLE: [StateId; {}] = [", self.state_table.len()));
415        for i in 0..self.nbr_states as usize {
416            source.push(format!("    {}, // state {}{}",
417                (0..self.nbr_groups as usize).map(|j| format!("{:3}", self.state_table[i * self.nbr_groups as usize + j])).join(", "),
418                i,
419                if i >= self.first_end_state { format!(" {}", self.terminal_table[i - self.first_end_state] ) } else { "".to_string() }
420            ));
421        }
422        source.push(format!("    {:3} // error group in [nbr_state * nbr_group + nbr_group]", self.state_table[self.state_table.len() - 1]));
423        source.push(format!("];"));
424        source.push(String::new());
425        source.push(format!("pub fn build_lexer<R: Read>() -> Lexer<'static, R> {{"));
426        source.push(format!("    Lexer::new("));
427        source.push(format!("        // parameters"));
428        source.push(format!("        NBR_GROUPS,"));
429        source.push(format!("        INITIAL_STATE,"));
430        source.push(format!("        FIRST_END_STATE,"));
431        source.push(format!("        NBR_STATES,"));
432        source.push(format!("        // tables"));
433        source.push(format!("        &ASCII_TO_GROUP,"));
434        source.push(format!("        HashMap::<char, GroupId>::from(UTF8_TO_GROUP),"));
435        source.push(format!("        SegMap::<GroupId>::from(SEG_TO_GROUP),"));
436        source.push(format!("        &STATE_TABLE,"));
437        source.push(format!("        &TERMINAL_TABLE,"));
438        source.push(format!("    )"));
439        source.push(format!("}}"));
440        source
441    }
442}
443
impl LogReader for LexerGen {
    type Item = BufLog;

    /// Returns a reference to the internal build log.
    fn get_log(&self) -> &Self::Item {
        &self.log
    }

    /// Consumes the generator and returns its build log.
    fn give_log(self) -> Self::Item {
        self.log
    }
}
455
// identifies LexerGen as the originator of the build errors it reports
impl HasBuildErrorSource for LexerGen {
    const SOURCE: BuildErrorSource = BuildErrorSource::LexerGen;
}
459
460impl BuildFrom<Dfa<Normalized>> for LexerGen {
461    fn build_from(dfa: Dfa<Normalized>) -> Self {
462        let mut lexgen = LexerGen::new();
463        lexgen.make_from_dfa(dfa);
464        lexgen
465    }
466}
467
468// ---------------------------------------------------------------------------------------------
469// Supporting functions
470
471// todo: option to split ASCII range?
/// Computes a partition of the input alphabet such that, for every transition of the state
/// graph `g`, all symbols of a partition class lead to the same destination state. Each
/// returned [Segments] set becomes one input group of the lexer.
///
/// Works by successive refinement: for each state, the symbol sets leading to each distinct
/// destination are intersected against the groups accumulated so far, splitting them as needed.
fn partition_symbols(g: &BTreeMap<StateId, BTreeMap<Segments, StateId>>) -> Vec<Segments> {
    const VERBOSE: bool = false;
    let mut groups = Vec::new();
    #[cfg(test)] if VERBOSE { print_graph(g, None, 4); }
    for (_state, branches) in g {
        // branches from a given state: merge the symbol sets per destination state
        let mut map = BTreeMap::<StateId, Segments>::new();
        for (segments, destination) in branches {
            if let Some(i) = map.get_mut(destination) {
                i.extend(&mut segments.iter());
            } else {
                map.insert(*destination, segments.clone());
            }
        }
        // optimizes the segments, in case it's not already done
        for segments in map.values_mut() {
            segments.normalize();
        }
        #[cfg(test)] if VERBOSE { println!("{_state} => {}", map.values().map(|i| format!("{i:X}")).join(", ")); }
        // refine the accumulated groups with each symbol set of this state:
        let mut state_sub = map.into_values().collect::<BTreeSet<Segments>>();
        while let Some(mut sub) = state_sub.pop_first() {
            if VERBOSE { println!("- sub = {sub}"); }
            for i in 0..groups.len() {
                if VERBOSE { println!("  - groups[{i}] = {}", groups[i]); }
                let cmp = sub.intersect(&groups[i]);
                if cmp.common.is_empty() {
                    if VERBOSE { println!("    (disjoints)"); }
                } else {
                    // groups[i] is split { cmp.common, cmp.external }
                    groups[i] = cmp.common;
                    if !cmp.external.is_empty() {
                        if VERBOSE { println!("    -> push {}", cmp.external); }
                        groups.push(cmp.external);
                    }
                    // sub is split { cmp.common, cmp.internal }, and we discard cmp.common since already in groups
                    if VERBOSE { println!("    -> sub = {}", cmp.internal); }
                    sub = cmp.internal;
                    // nothing left of this symbol set to distribute: stop scanning groups
                    if sub.is_empty() {
                        break;
                    }
                }
            }
            // the remainder of `sub` intersects no existing group: it becomes a new group
            if !sub.is_empty() {
                groups.push(sub);
            }
        }
    }
    #[cfg(test)] if VERBOSE { println!("=> {}", groups.iter().map(|i| format!("{i:X}")).join(", ")); }
    groups
}