Skip to main content

lexigram_lib/lexergen/
mod.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3pub(crate) mod tests;
4
5use std::collections::{BTreeMap, BTreeSet, HashMap};
6use std::fmt::Display;
7use std::fs::File;
8use std::io::{BufWriter, Read, Write};
9use iter_index::IndexerIterator;
10use lexigram_core::CollectJoin;
11#[cfg(test)]
12use crate::dfa::print_graph;
13use crate::{indent_source, Normalized, SymbolTable, TokenId};
14use crate::char_reader::escape_char;
15use crate::lexer::{ActionOption, Lexer, StateId, Terminal};
16use lexigram_core::log::{BufLog, LogReader, LogStatus, Logger};
17use crate::build::{BuildError, BuildErrorSource, BuildFrom, HasBuildErrorSource, TryBuildFrom};
18use crate::segments::Segments;
19use crate::segmap::{char_to_group, GroupId, Seg, SegMap};
20use super::dfa::*;
21
22// ---------------------------------------------------------------------------------------------
23
/// Tables and parameters used to create a Lexer. This type is used as a return object from the lexer generator,
/// when the Lexer must be created dynamically; for example, in tests or in situations where the lexicon isn't
/// known in advance. In those situations, the LexerTables object must live as long as the lexer.
///
/// The Lexer itself only uses references to tables whenever possible because, in most situations, the tables are
/// static in generated source files. Only the dictionaries must be created dynamically from (possibly) static
/// tables because they don't exist in static form (yet).
pub struct LexerTables {
    // parameters
    nbr_groups: u32,            // number of symbol equivalence groups (error group = nbr_groups)
    initial_state: StateId,
    first_end_state: StateId,   // accepting when state >= first_end_state
    nbr_states: StateId,        // error if state >= nbr_states
    // tables
    ascii_to_group: Vec<GroupId>,       // direct lookup for code points < 128
    utf8_to_group: HashMap<char, GroupId>,  // dictionary for frequent/small non-ASCII ranges
    seg_to_group: SegMap<GroupId>,      // segment map for remaining non-ASCII ranges
    state_table: Vec<StateId>,          // next = state_table[nbr_groups * state + group]
    terminal_table: Vec<Terminal>,  // token(state) = token_table[state - first_end_state]
}
44
45impl LexerTables {
46    pub fn new(
47        // parameters
48        nbr_groups: u32,
49        initial_state: StateId,
50        first_end_state: StateId,   // accepting when state >= first_end_state
51        nbr_states: StateId,        // error if state >= nbr_states
52        // tables
53        ascii_to_group: Vec<GroupId>,
54        utf8_to_group: HashMap<char, GroupId>,
55        seg_to_group: SegMap<GroupId>,
56        state_table: Vec<StateId>,
57        terminal_table: Vec<Terminal>,  // token(state) = token_table[state - first_end_state]
58    ) -> Self {
59        LexerTables {
60            nbr_groups,
61            initial_state,
62            first_end_state,
63            nbr_states,
64            ascii_to_group,
65            utf8_to_group,
66            seg_to_group,
67            state_table,
68            terminal_table,
69        }
70    }
71
72    pub fn make_lexer<R: Read>(&self) -> Lexer<'_, R> {
73        Lexer::new(
74            self.nbr_groups,
75            self.initial_state,
76            self.first_end_state,
77            self.nbr_states,
78            self.ascii_to_group.as_slice(),
79            self.utf8_to_group.clone(),
80            self.seg_to_group.clone(),
81            self.state_table.as_slice(),
82            self.terminal_table.as_slice(),
83        )
84    }
85}
86
87impl BuildFrom<LexerGen> for LexerTables {
88    fn build_from(lexer_gen: LexerGen) -> LexerTables {
89        assert!(!lexer_gen.state_table.is_empty(), "tables are not built");
90        LexerTables::new(
91            lexer_gen.nbr_groups,
92            lexer_gen.initial_state,
93            lexer_gen.first_end_state,
94            lexer_gen.nbr_states,
95            lexer_gen.ascii_to_group,
96            lexer_gen.utf8_to_group,
97            lexer_gen.seg_to_group,
98            lexer_gen.state_table,
99            lexer_gen.terminal_table,
100        )
101    }
102}
103
104// not generated automatically since LexerTables isn't LogReader
105impl TryBuildFrom<LexerGen> for LexerTables {
106    type Error = BuildError;
107
108    fn try_build_from(source: LexerGen) -> Result<Self, Self::Error> {
109        if source.get_log().has_no_errors() {
110            Ok(LexerTables::build_from(source))
111        } else {
112            Err(BuildError::new(source.give_log(), BuildErrorSource::LexerGen))
113        }
114    }
115}
116
117// ---------------------------------------------------------------------------------------------
118
/// Name of the crate used in the generated code
#[derive(Clone, Default, PartialEq, Debug)]
pub enum LexigramCrate {
    /// [LexigramCrate] is [lexigram_core] (default)
    #[default]
    Core,
    /// [LexigramCrate] is [lexigram_lib](crate)
    Full,
    /// [LexigramCrate] is a custom name; e.g. if the user renames it in `use lexigram_core as core`.
    Custom(String),
}
130
131impl Display for LexigramCrate {
132    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
133        write!(f, "{}", match self {
134            LexigramCrate::Core => "lexigram_core",
135            LexigramCrate::Full => "lexigram_lib",
136            LexigramCrate::Custom(s) => s.as_str(),
137        })
138    }
139}
140
/// Options for the lexer source-code generator.
#[derive(Clone, Debug)]
pub struct LexerGenOptions {
    /// Lines written verbatim at the top of the generated source, before the `use` statements.
    pub headers: Vec<String>,
    /// Name under which the lexigram crate is referenced in the generated `use` statements.
    pub lib_crate: LexigramCrate,
}
146
/// Lexer generator: builds the lexer tables from a normalized DFA, and can hand them over
/// as a [LexerTables] object or emit them as generated Rust source code.
pub struct LexerGen {
    // parameters:
    /// Source-generation options (headers, crate name).
    pub options: LexerGenOptions,
    /// Maximum number of characters stored as individual entries in the UTF-8 dictionary.
    pub max_utf8_chars: u32,
    pub nbr_groups: u32,
    pub initial_state: StateId,
    pub first_end_state: StateId,   // accepting when state >= first_end_state
    pub nbr_states: StateId,        // error if state >= nbr_states
    // tables:
    pub ascii_to_group: Vec<GroupId>,
    pub utf8_to_group: HashMap<char, GroupId>,
    pub seg_to_group: SegMap<GroupId>,
    pub state_table: Vec<StateId>,
    pub terminal_table: Vec<Terminal>,  // token(state) = token_table[state - first_end_state]
    pub symbol_table: Option<SymbolTable>,
    // internal
    log: BufLog,                 // build log, taken over from the DFA and extended here
    group_partition: Segments,   // for optimization
}
166
impl LexerGen {
    /// Default maximum number of characters stored individually in the UTF-8 dictionary table.
    pub const DEFAULT_UTF8_TABLE_SIZE: u32 = 128;

    /// Creates an empty generator with default options; the tables are filled later by
    /// [Self::make_from_dfa].
    fn new() -> Self {
        LexerGen {
            options: LexerGenOptions::default(),
            max_utf8_chars: Self::DEFAULT_UTF8_TABLE_SIZE,
            nbr_groups: 0,
            initial_state: 0,
            first_end_state: 0,
            nbr_states: 0,
            // GroupId::MAX marks "no group" until create_input_tables fills the table
            ascii_to_group: vec![GroupId::MAX; 128],
            utf8_to_group: HashMap::default(),
            seg_to_group: SegMap::new(),
            state_table: Vec::new(),
            terminal_table: Vec::new(),
            symbol_table: None,
            log: BufLog::new(),
            group_partition: Segments::empty(),
        }
    }

    /// Replaces the source-generation options.
    pub fn set_options(&mut self, options: LexerGenOptions) {
        self.options = options;
    }

    /// Appends one header line to the generated source code.
    #[inline]
    pub fn add_header<T: Into<String>>(&mut self, header: T) {
        self.options.headers.push(header.into());
    }

    /// Appends several header lines to the generated source code.
    #[inline]
    pub fn extend_headers<I: IntoIterator<Item=T>, T: Into<String>>(&mut self, headers: I) {
        self.options.headers.extend(headers.into_iter().map(|s| s.into()));
    }

    /// Sets the crate name used in the generated `use` statements.
    #[inline]
    pub fn set_lib_crate(&mut self, lcrate: LexigramCrate) {
        self.options.lib_crate = lcrate;
    }

    /// Builds a generator from a normalized DFA, limiting the UTF-8 dictionary to
    /// `max_utf8_chars` individual entries.
    pub fn build_from_dfa(dfa: Dfa<Normalized>, max_utf8_chars: u32) -> Self {
        let mut lexergen = Self::new();
        lexergen.max_utf8_chars = max_utf8_chars;
        lexergen.make_from_dfa(dfa);
        lexergen
    }

    /// Takes over the DFA log, then builds the input tables and the state tables from the DFA.
    fn make_from_dfa(&mut self, mut dfa: Dfa<Normalized>) {
        self.log.extend(std::mem::replace(&mut dfa.log, BufLog::new()));
        self.log.add_note("creating lexer from DFA...");
        self.create_input_tables(&dfa);
        self.create_state_tables(&dfa);
    }

    /// Builds the symbol-to-group tables (`ascii_to_group`, `utf8_to_group`, `seg_to_group`)
    /// from the partition of the DFA input symbols into equivalence groups. ASCII symbols go
    /// into the direct-lookup table; non-ASCII symbols go into the UTF-8 dictionary (small
    /// segments, within the `max_utf8_chars` budget) or into the segment map (the rest).
    fn create_input_tables(&mut self, dfa: &Dfa<Normalized>) {
        /// Max continuous segment size translated to individual UTF8 entries. This prevents large segments
        /// like DOT from cluttering the dictionary.
        const MAX_UTF8_SEG_RANGE: u32 = 16;
        const VERBOSE: bool = false;
        let symbol_part = partition_symbols(dfa.get_state_graph());
        // map each segment of each partition group to its group id
        let symbol_to_group = SegMap::from_iter(
            symbol_part.iter().index().flat_map(|(id, i)| i.iter().map(move |ab| (*ab, id)))
        );
        self.group_partition = Segments::from_iter(symbol_to_group.keys().cloned());

        if VERBOSE {
            println!("symbol partition:{}\ntables:", symbol_to_group.iter()
                .map(|(seg, g)| format!("\n- {seg} -> {g}")).collect::<String>());
        }
        self.nbr_groups = symbol_part.len() as GroupId;
        // symbols belonging to no group map to the 'error group', which equals nbr_groups
        let error_id = self.nbr_groups as GroupId;
        self.ascii_to_group.fill(error_id);
        self.utf8_to_group.clear();
        self.seg_to_group.clear();
        // remaining budget of individual UTF-8 dictionary entries
        let mut left = self.max_utf8_chars;
        for (seg, group_id) in symbol_to_group {
            if VERBOSE { println!("Seg: {}-{}", seg.0, seg.1); }
            // ASCII part of the segment (code points < 128): direct lookup table
            if seg.0 < 128 {
                if VERBOSE {
                    println!("- ASCII: {}-{} ({}-{}) => {group_id}",
                             escape_char(char::from_u32(seg.0).unwrap()), escape_char(char::from_u32(seg.1.min(127)).unwrap()),
                             seg.0, seg.1.min(127));
                }
                for b in seg.0..=seg.1.min(127) {
                    self.ascii_to_group[b as usize] = group_id;
                }
            }
            // non-ASCII part of the segment (code points >= 128)
            if seg.1 >= 128 {
                let mut low = 128.max(seg.0);
                let high = seg.1.min(low + left - 1);
                if seg.1 - low < MAX_UTF8_SEG_RANGE {
                    // small segment: store individual characters in the dictionary while budget lasts
                    if left > 0 {
                        for u in low..=high {
                            if VERBOSE { println!("- UTF8: {} ({u}) => {group_id}", escape_char(char::from_u32(u).unwrap())); }
                            self.utf8_to_group.insert(char::from_u32(u).unwrap(), group_id);
                        }
                        left -= 1 + high - low;
                    }
                    low = high + 1;
                }
                // whatever wasn't stored individually goes into the segment map
                if low <= seg.1 {
                    if VERBOSE {
                        println!("- SEG: {}-{} ({}-{}) => {group_id}",
                             escape_char(char::from_u32(high + 1).unwrap()), escape_char(char::from_u32(seg.1).unwrap()),
                             low, seg.1);
                    }
                    self.seg_to_group.insert(Seg(low, seg.1), group_id);
                }
            }
        }
        self.log.add_note(format!(
            "- creating input tables: ASCII {} entries, UTF8 {} entries, segments {} entries",
            self.ascii_to_group.len(), self.utf8_to_group.len(), self.seg_to_group.len()));
    }

    /// Builds the transition table (`state_table`) and the terminal table from the DFA state
    /// graph. Also validates that the lexer produces at least one token and that the reserved
    /// token id ([TokenId::MAX], used for illegal characters) isn't taken.
    fn create_state_tables(&mut self, dfa: &Dfa<Normalized>) {
        const VERBOSE: bool = false;
        self.initial_state = dfa.get_initial_state().unwrap();
        self.first_end_state = dfa.get_first_end_state().unwrap();
        self.nbr_states = dfa.get_state_graph().len();
        let nbr_states = dfa.get_state_graph().len();
        // we add one extra table index to allow for the 'error group', which equals nbr_group:
        // state_table[nbr_state * nbr_group + nbr_group] must exist; the content will be ignored.
        let mut state_table = vec!(self.nbr_states; self.nbr_groups as usize * nbr_states + 1);
        for (state_from, trans) in dfa.get_state_graph() {
            if VERBOSE { println!("state {state_from}"); }
            for (segments, state_to) in trans {
                if VERBOSE { println!("- {segments} -> state {state_to}"); }
                // slice the transition segments along the group partition so that each slice
                // falls entirely inside one group
                let mut segments_part = segments.clone();
                segments_part.slice_partitions(&self.group_partition);
                for seg in segments_part.iter() {
                    // the first character of the slice is enough to identify its group
                    let symbol = char::from_u32(seg.0).unwrap();
                    let symbol_group = char_to_group(&self.ascii_to_group, &self.utf8_to_group, &self.seg_to_group, symbol).unwrap_or(self.nbr_groups);
                    state_table[self.nbr_groups as usize * state_from + symbol_group as usize] = *state_to;
                }
            }
        }
        self.state_table = state_table;
        // terminal_table is indexed by (state - first_end_state); relies on get_end_states()
        // iterating the accepting states in ascending order
        let terminal_table = dfa.get_end_states().iter()
            .filter_map(|(&st, t)| if st >= self.first_end_state { Some(t.clone()) } else { None })
            .to_vec();
        self.terminal_table = terminal_table;
        // highest token id produced by the lexer, or None if no terminal yields a token
        let max_token_maybe = self.terminal_table.iter().fold(None, { |acc, t|
            if let Terminal { action: ActionOption::Token(tok), .. } = t {
                Some(acc.unwrap_or(0).max(*tok))
            } else {
                acc
            }
        });
        self.log.add_note(format!(
            "- creating state tables: state table {} entries, terminal table {} entries",
            self.state_table.len(), self.terminal_table.len()));
        match max_token_maybe {
            Some(max_token) => {
                if max_token == TokenId::MAX {
                    self.log.add_error(format!("  the token {} is taken, but it's reserved for illegal characters", TokenId::MAX));
                }
            }
            None => {
                self.log.add_error("  the lexer returns no tokens");
            }
        }
    }

    /// Writes the generated lexer source code to `file`, or to stdout when `file` is [None].
    /// `indent` is the base indentation applied to the generated source.
    pub fn write_source_code(&self, file: Option<File>, indent: usize) -> Result<(), std::io::Error> {
        let mut out: BufWriter<Box<dyn Write>> = match file {
            Some(file) => BufWriter::new(Box::new(file)),
            None => BufWriter::new(Box::new(std::io::stdout().lock()))
        };
        let source = self.gen_source_code(indent);
        out.write_all(source.as_bytes())?;
        // write!(out, "{source}");
        Ok(())
    }

    /// Returns the generated lexer source code as a single string, indented by `indent`.
    pub fn gen_source_code(&self, indent: usize) -> String {
        indent_source(vec![self.lexer_source_code()], indent)
    }

    /// Generates the source code and returns it with the build log; fails with a [BuildError]
    /// if the log contains errors.
    pub fn try_gen_source_code(self, indent: usize) -> Result<(BufLog, String), BuildError> {
        let src = self.gen_source_code(indent);
        if self.log.has_no_errors() {
            Ok((self.give_log(), src))
        } else {
            Err(BuildError::new(self.give_log(), BuildErrorSource::LexerGen))
        }
    }

    /// Produces the generated lexer source, one line per element: optional headers, `use`
    /// statements, the static tables, and a `build_lexer` function tying them together.
    fn lexer_source_code(&self) -> Vec<String> {
        let mut source = self.options.headers.clone();

        if !self.options.headers.is_empty() {
            source.push(String::new());
        }

        // Create source code:
        source.push("use std::collections::HashMap;".to_string());
        source.push("use std::io::Read;".to_string());
        source.push(format!("use {}::lexer::{{ActionOption, Lexer, ModeOption, StateId, Terminal}};", self.options.lib_crate));
        source.push(format!("use {}::segmap::{{GroupId, Seg, SegMap}};", self.options.lib_crate));
        source.push(String::new());
        source.push(format!("const NBR_GROUPS: u32 = {};", self.nbr_groups));
        source.push(format!("const INITIAL_STATE: StateId = {};", self.initial_state));
        source.push(format!("const FIRST_END_STATE: StateId = {};", self.first_end_state));
        source.push(format!("const NBR_STATES: StateId = {};", self.nbr_states));
        let mut groups = vec![BTreeSet::new(); self.nbr_groups as usize];
        // ASCII table, emitted 16 entries per line with a code-point range comment
        source.push("static ASCII_TO_GROUP: [GroupId; 128] = [".to_string());
        for i in 0..8_usize {
            let mut s = "    ".to_string();
            for j in 0..16_usize {
                let ascii = i * 16 + j;
                let group = self.ascii_to_group[i * 16 + j];
                s.push_str(&format!("{:3}, ", group));
                if group < self.nbr_groups {
                    groups[group as usize].insert(char::from(ascii as u8));
                }
            }
            source.push(format!("{s}  // {}-{}", i*16, i*16 + 15));
        }
        source.push("];".to_string());
        source.push(format!("static UTF8_TO_GROUP: [(char, GroupId); {}] = [", self.utf8_to_group.len()));
        let mut hashmap_content = self.utf8_to_group.iter().map(|(c, g)| format!("    ('{}', {}),", escape_char(*c), g)).to_vec();
        hashmap_content.sort(); // the sources must be identical from one run to the next
        source.extend(hashmap_content);
        source.push("];".to_string());
        /*
        for (c, g) in lexergen.utf8_to_group.iter() {
            groups[*g as usize].insert(*c);
        }
        for (g, chars) in groups.iter().enumerate() {
            if !chars.is_empty() {
                let set = chars.iter().map(|c| escape_char(*c)).collect::<String>();
                source.push(format!("// group[{g:3}] = [{set}]"));
            };
        }
        */
        source.push(format!("static SEG_TO_GROUP: [(Seg, GroupId); {}] = [", self.seg_to_group.len()));
        for (s, g) in &self.seg_to_group {
            source.push(format!("    (Seg({}, {}), {}),", s.0, s.1, g));
        }
        source.push("];".to_string());
        source.push(format!("static TERMINAL_TABLE: [Terminal;{}] = [", self.terminal_table.len()));
        for t in &self.terminal_table {
            // Terminal { action: TermAction::Skip, channel: 0, mode: ModeOption::None, mode_state: None, pop: false },
            source.push(format!("    Terminal {{ action: ActionOption::{:?}, channel: {}, mode: ModeOption::{:?}, mode_state: {:?}, pop: {} }},",
                                t.action, t.channel, t.mode, t.mode_state, t.pop
            ));
        }
        source.push("];".to_string());
        // state table: one line per state, with the terminal appended for accepting states
        source.push(format!("static STATE_TABLE: [StateId; {}] = [", self.state_table.len()));
        for i in 0..self.nbr_states {
            source.push(format!("    {}, // state {}{}",
                (0..self.nbr_groups as usize).map(|j| format!("{:3}", self.state_table[i * self.nbr_groups as usize + j])).join(", "),
                i,
                if i >= self.first_end_state { format!(" {}", self.terminal_table[i - self.first_end_state] ) } else { "".to_string() }
            ));
        }
        source.push(format!("    {:3} // error group in [nbr_state * nbr_group + nbr_group]", self.state_table[self.state_table.len() - 1]));
        source.push("];".to_string());
        source.push(String::new());
        source.push("pub fn build_lexer<R: Read>() -> Lexer<'static, R> {".to_string());
        source.push("    Lexer::new(".to_string());
        source.push("        // parameters".to_string());
        source.push("        NBR_GROUPS,".to_string());
        source.push("        INITIAL_STATE,".to_string());
        source.push("        FIRST_END_STATE,".to_string());
        source.push("        NBR_STATES,".to_string());
        source.push("        // tables".to_string());
        source.push("        &ASCII_TO_GROUP,".to_string());
        source.push("        HashMap::<char, GroupId>::from(UTF8_TO_GROUP),".to_string());
        source.push("        SegMap::<GroupId>::from(SEG_TO_GROUP),".to_string());
        source.push("        &STATE_TABLE,".to_string());
        source.push("        &TERMINAL_TABLE,".to_string());
        source.push("    )".to_string());
        source.push("}".to_string());
        source
    }
}
446
impl LogReader for LexerGen {
    type Item = BufLog;

    /// Returns a reference to the build log.
    fn get_log(&self) -> &Self::Item {
        &self.log
    }

    /// Consumes the generator and returns its build log.
    fn give_log(self) -> Self::Item {
        self.log
    }
}
458
// Identifies the lexer generator as the origin of the build errors it reports.
impl HasBuildErrorSource for LexerGen {
    const SOURCE: BuildErrorSource = BuildErrorSource::LexerGen;
}
462
463impl BuildFrom<Dfa<Normalized>> for LexerGen {
464    fn build_from(dfa: Dfa<Normalized>) -> Self {
465        let mut lexgen = LexerGen::new();
466        lexgen.make_from_dfa(dfa);
467        lexgen
468    }
469}
470
471impl Default for LexerGenOptions {
472    fn default() -> Self {
473        LexerGenOptions {
474            headers: vec![],
475            lib_crate: LexigramCrate::Core,
476        }
477    }
478}
479
480// ---------------------------------------------------------------------------------------------
481// Supporting functions
482
// todo: option to split ASCII range?
/// Partitions the input symbols of the state graph `g` into equivalence groups: the running
/// partition `groups` is refined with each state's per-destination symbol sets, so symbols
/// end up in the same group only when no state's transitions distinguish them. The returned
/// [Segments] are pairwise disjoint.
fn partition_symbols(g: &BTreeMap<StateId, BTreeMap<Segments, StateId>>) -> Vec<Segments> {
    const VERBOSE: bool = false;
    let mut groups = Vec::new();
    #[cfg(test)] if VERBOSE { print_graph(g, None, 4); }
    #[allow(clippy::for_kv_map)]
    for (_state, branches) in g {
        // branches from a given state
        // merge all segments leading to the same destination into a single symbol set
        let mut map = BTreeMap::<StateId, Segments>::new();
        for (segments, destination) in branches {
            if let Some(i) = map.get_mut(destination) {
                i.extend(&mut segments.iter());
            } else {
                map.insert(*destination, segments.clone());
            }
        }
        // optimizes the segments, in case it's not already done
        for segments in map.values_mut() {
            segments.normalize();
        }
        #[cfg(test)] if VERBOSE { println!("{_state} => {}", map.values().map(|i| format!("{i:X}")).join(", ")); }
        // refine the running partition with each of this state's symbol sets
        let mut state_sub = map.into_values().collect::<BTreeSet<Segments>>();
        while let Some(mut sub) = state_sub.pop_first() {
            if VERBOSE { println!("- sub = {sub}"); }
            for i in 0..groups.len() {
                if VERBOSE { println!("  - groups[{i}] = {}", groups[i]); }
                let cmp = sub.intersect(&groups[i]);
                if cmp.common.is_empty() {
                    if VERBOSE { println!("    (disjoints)"); }
                } else {
                    // groups[i] is split { cmp.common, cmp.external }
                    groups[i] = cmp.common;
                    if !cmp.external.is_empty() {
                        if VERBOSE { println!("    -> push {}", cmp.external); }
                        groups.push(cmp.external);
                    }
                    // sub is split { cmp.common, cmp.internal }, and we discard cmp.common since already in groups
                    if VERBOSE { println!("    -> sub = {}", cmp.internal); }
                    sub = cmp.internal;
                    if sub.is_empty() {
                        break;
                    }
                }
            }
            // whatever part of sub intersected no existing group becomes a new group
            if !sub.is_empty() {
                groups.push(sub);
            }
        }
    }
    #[cfg(test)] if VERBOSE { println!("=> {}", groups.iter().map(|i| format!("{i:X}")).join(", ")); }
    groups
}