1pub(crate) mod tests;
4
5use std::collections::{BTreeMap, BTreeSet, HashMap};
6use std::fmt::Display;
7use std::fs::File;
8use std::io::{BufWriter, Read, Write};
9use iter_index::IndexerIterator;
10use lexigram_core::CollectJoin;
11#[cfg(test)]
12use crate::dfa::print_graph;
13use crate::{indent_source, Normalized, SymbolTable, TokenId};
14use crate::char_reader::escape_char;
15use crate::lexer::{ActionOption, Lexer, StateId, Terminal};
16use lexigram_core::log::{BufLog, LogReader, LogStatus, Logger};
17use crate::build::{BuildError, BuildErrorSource, BuildFrom, HasBuildErrorSource, TryBuildFrom};
18use crate::segments::Segments;
19use crate::segmap::{char_to_group, GroupId, Seg, SegMap};
20use super::dfa::*;
21
/// Immutable, owned lexer tables from which [`Lexer`] instances are created.
///
/// Built from a finished [`LexerGen`] via the `BuildFrom`/`TryBuildFrom`
/// impls below; the fields mirror the parameters of `Lexer::new`.
pub struct LexerTables {
    // number of character equivalence groups (columns of state_table)
    nbr_groups: u32,
    // state the lexer starts in
    initial_state: StateId,
    // states >= this id are accepting and index terminal_table
    first_end_state: StateId,
    // total state count (also used as the "no transition" sentinel in state_table)
    nbr_states: StateId,
    // direct group lookup for code points < 128
    ascii_to_group: Vec<GroupId>,
    // exact group lookup for selected non-ASCII characters
    utf8_to_group: HashMap<char, GroupId>,
    // range-based group lookup for the remaining characters
    seg_to_group: SegMap<GroupId>,
    // flattened transition table: nbr_groups entries per state, plus one error entry
    state_table: Vec<StateId>,
    // terminal actions for accepting states
    terminal_table: Vec<Terminal>,
}
44
45impl LexerTables {
46 pub fn new(
47 nbr_groups: u32,
49 initial_state: StateId,
50 first_end_state: StateId, nbr_states: StateId, ascii_to_group: Vec<GroupId>,
54 utf8_to_group: HashMap<char, GroupId>,
55 seg_to_group: SegMap<GroupId>,
56 state_table: Vec<StateId>,
57 terminal_table: Vec<Terminal>, ) -> Self {
59 LexerTables {
60 nbr_groups,
61 initial_state,
62 first_end_state,
63 nbr_states,
64 ascii_to_group,
65 utf8_to_group,
66 seg_to_group,
67 state_table,
68 terminal_table,
69 }
70 }
71
72 pub fn make_lexer<R: Read>(&self) -> Lexer<'_, R> {
73 Lexer::new(
74 self.nbr_groups,
75 self.initial_state,
76 self.first_end_state,
77 self.nbr_states,
78 self.ascii_to_group.as_slice(),
79 self.utf8_to_group.clone(),
80 self.seg_to_group.clone(),
81 self.state_table.as_slice(),
82 self.terminal_table.as_slice(),
83 )
84 }
85}
86
87impl BuildFrom<LexerGen> for LexerTables {
88 fn build_from(lexer_gen: LexerGen) -> LexerTables {
89 assert!(!lexer_gen.state_table.is_empty(), "tables are not built");
90 LexerTables::new(
91 lexer_gen.nbr_groups,
92 lexer_gen.initial_state,
93 lexer_gen.first_end_state,
94 lexer_gen.nbr_states,
95 lexer_gen.ascii_to_group,
96 lexer_gen.utf8_to_group,
97 lexer_gen.seg_to_group,
98 lexer_gen.state_table,
99 lexer_gen.terminal_table,
100 )
101 }
102}
103
104impl TryBuildFrom<LexerGen> for LexerTables {
106 type Error = BuildError;
107
108 fn try_build_from(source: LexerGen) -> Result<Self, Self::Error> {
109 if source.get_log().has_no_errors() {
110 Ok(LexerTables::build_from(source))
111 } else {
112 Err(BuildError::new(source.give_log(), BuildErrorSource::LexerGen))
113 }
114 }
115}
116
/// Crate whose paths the generated lexer source imports `lexer`/`segmap` from.
#[derive(Clone, Default, PartialEq, Debug)]
pub enum LexigramCrate {
    /// `lexigram_core` (the default).
    #[default]
    Core,
    /// `lexigram_lib`.
    Full,
    /// A caller-supplied crate path, used verbatim.
    Custom(String),
}
130
131impl Display for LexigramCrate {
132 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
133 write!(f, "{}", match self {
134 LexigramCrate::Core => "lexigram_core",
135 LexigramCrate::Full => "lexigram_lib",
136 LexigramCrate::Custom(s) => s.as_str(),
137 })
138 }
139}
140
/// Options controlling the generated lexer source code.
#[derive(Clone, Debug)]
pub struct LexerGenOptions {
    /// Verbatim lines emitted at the very top of the generated file.
    pub headers: Vec<String>,
    /// Crate path the generated code imports library items from.
    pub lib_crate: LexigramCrate,
}
146
/// Builds lexer tables from a DFA and can render them as Rust source code.
pub struct LexerGen {
    /// Code-generation options (headers, library crate path).
    pub options: LexerGenOptions,
    /// Budget of exact entries allowed in `utf8_to_group`; overflow goes to
    /// `seg_to_group` instead.
    pub max_utf8_chars: u32,
    /// Number of character equivalence groups found in the DFA.
    pub nbr_groups: u32,
    /// Start state of the DFA.
    pub initial_state: StateId,
    /// States >= this id are accepting and index `terminal_table`.
    pub first_end_state: StateId,
    /// Total state count (doubles as the "no transition" table sentinel).
    pub nbr_states: StateId,
    /// Direct group lookup for code points < 128.
    pub ascii_to_group: Vec<GroupId>,
    /// Exact group lookup for selected non-ASCII characters.
    pub utf8_to_group: HashMap<char, GroupId>,
    /// Range-based group lookup for the remaining characters.
    pub seg_to_group: SegMap<GroupId>,
    /// Flattened transition table: `nbr_groups` entries per state plus one
    /// trailing error entry.
    pub state_table: Vec<StateId>,
    /// Terminal actions for accepting states.
    pub terminal_table: Vec<Terminal>,
    /// Optional symbol table; initialized to `None` and never written in this
    /// file — presumably filled by callers. TODO confirm.
    pub symbol_table: Option<SymbolTable>,
    // notes and errors accumulated from the DFA and from table construction
    log: BufLog,
    // partition of the symbol space into group segments, kept so transitions
    // can be sliced along group boundaries in create_state_tables
    group_partition: Segments,
}
166
167impl LexerGen {
168 pub const DEFAULT_UTF8_TABLE_SIZE: u32 = 128;
169
170 fn new() -> Self {
171 LexerGen {
172 options: LexerGenOptions::default(),
173 max_utf8_chars: Self::DEFAULT_UTF8_TABLE_SIZE,
174 nbr_groups: 0,
175 initial_state: 0,
176 first_end_state: 0,
177 nbr_states: 0,
178 ascii_to_group: vec![GroupId::MAX; 128],
179 utf8_to_group: HashMap::default(),
180 seg_to_group: SegMap::new(),
181 state_table: Vec::new(),
182 terminal_table: Vec::new(),
183 symbol_table: None,
184 log: BufLog::new(),
185 group_partition: Segments::empty(),
186 }
187 }
188
189 pub fn set_options(&mut self, options: LexerGenOptions) {
190 self.options = options;
191 }
192
193 #[inline]
194 pub fn add_header<T: Into<String>>(&mut self, header: T) {
195 self.options.headers.push(header.into());
196 }
197
198 #[inline]
199 pub fn extend_headers<I: IntoIterator<Item=T>, T: Into<String>>(&mut self, headers: I) {
200 self.options.headers.extend(headers.into_iter().map(|s| s.into()));
201 }
202
203 #[inline]
204 pub fn set_lib_crate(&mut self, lcrate: LexigramCrate) {
205 self.options.lib_crate = lcrate;
206 }
207
208 pub fn build_from_dfa(dfa: Dfa<Normalized>, max_utf8_chars: u32) -> Self {
209 let mut lexergen = Self::new();
210 lexergen.max_utf8_chars = max_utf8_chars;
211 lexergen.make_from_dfa(dfa);
212 lexergen
213 }
214
215 fn make_from_dfa(&mut self, mut dfa: Dfa<Normalized>) {
216 self.log.extend(std::mem::replace(&mut dfa.log, BufLog::new()));
217 self.log.add_note("creating lexer from DFA...");
218 self.create_input_tables(&dfa);
219 self.create_state_tables(&dfa);
220 }
221
222 fn create_input_tables(&mut self, dfa: &Dfa<Normalized>) {
223 const MAX_UTF8_SEG_RANGE: u32 = 16;
226 const VERBOSE: bool = false;
227 let symbol_part = partition_symbols(dfa.get_state_graph());
228 let symbol_to_group = SegMap::from_iter(
229 symbol_part.iter().index().flat_map(|(id, i)| i.iter().map(move |ab| (*ab, id)))
230 );
231 self.group_partition = Segments::from_iter(symbol_to_group.keys().cloned());
232
233 if VERBOSE {
234 println!("symbol partition:{}\ntables:", symbol_to_group.iter()
235 .map(|(seg, g)| format!("\n- {seg} -> {g}")).collect::<String>());
236 }
237 self.nbr_groups = symbol_part.len() as GroupId;
238 let error_id = self.nbr_groups as GroupId;
239 self.ascii_to_group.fill(error_id);
240 self.utf8_to_group.clear();
241 self.seg_to_group.clear();
242 let mut left = self.max_utf8_chars;
243 for (seg, group_id) in symbol_to_group {
244 if VERBOSE { println!("Seg: {}-{}", seg.0, seg.1); }
245 if seg.0 < 128 {
246 if VERBOSE {
247 println!("- ASCII: {}-{} ({}-{}) => {group_id}",
248 escape_char(char::from_u32(seg.0).unwrap()), escape_char(char::from_u32(seg.1.min(127)).unwrap()),
249 seg.0, seg.1.min(127));
250 }
251 for b in seg.0..=seg.1.min(127) {
252 self.ascii_to_group[b as usize] = group_id;
253 }
254 }
255 if seg.1 >= 128 {
256 let mut low = 128.max(seg.0);
257 let high = seg.1.min(low + left - 1);
258 if seg.1 - low < MAX_UTF8_SEG_RANGE {
259 if left > 0 {
260 for u in low..=high {
261 if VERBOSE { println!("- UTF8: {} ({u}) => {group_id}", escape_char(char::from_u32(u).unwrap())); }
262 self.utf8_to_group.insert(char::from_u32(u).unwrap(), group_id);
263 }
264 left -= 1 + high - low;
265 }
266 low = high + 1;
267 }
268 if low <= seg.1 {
269 if VERBOSE {
270 println!("- SEG: {}-{} ({}-{}) => {group_id}",
271 escape_char(char::from_u32(high + 1).unwrap()), escape_char(char::from_u32(seg.1).unwrap()),
272 low, seg.1);
273 }
274 self.seg_to_group.insert(Seg(low, seg.1), group_id);
275 }
276 }
277 }
278 self.log.add_note(format!(
279 "- creating input tables: ASCII {} entries, UTF8 {} entries, segments {} entries",
280 self.ascii_to_group.len(), self.utf8_to_group.len(), self.seg_to_group.len()));
281 }
282
283 fn create_state_tables(&mut self, dfa: &Dfa<Normalized>) {
284 const VERBOSE: bool = false;
285 self.initial_state = dfa.get_initial_state().unwrap();
286 self.first_end_state = dfa.get_first_end_state().unwrap();
287 self.nbr_states = dfa.get_state_graph().len();
288 let nbr_states = dfa.get_state_graph().len();
289 let mut state_table = vec!(self.nbr_states; self.nbr_groups as usize * nbr_states + 1);
292 for (state_from, trans) in dfa.get_state_graph() {
293 if VERBOSE { println!("state {state_from}"); }
294 for (segments, state_to) in trans {
295 if VERBOSE { println!("- {segments} -> state {state_to}"); }
296 let mut segments_part = segments.clone();
297 segments_part.slice_partitions(&self.group_partition);
298 for seg in segments_part.iter() {
299 let symbol = char::from_u32(seg.0).unwrap();
300 let symbol_group = char_to_group(&self.ascii_to_group, &self.utf8_to_group, &self.seg_to_group, symbol).unwrap_or(self.nbr_groups);
301 state_table[self.nbr_groups as usize * state_from + symbol_group as usize] = *state_to;
302 }
303 }
304 }
305 self.state_table = state_table;
306 let terminal_table = dfa.get_end_states().iter()
307 .filter_map(|(&st, t)| if st >= self.first_end_state { Some(t.clone()) } else { None })
308 .to_vec();
309 self.terminal_table = terminal_table;
310 let max_token_maybe = self.terminal_table.iter().fold(None, { |acc, t|
311 if let Terminal { action: ActionOption::Token(tok), .. } = t {
312 Some(acc.unwrap_or(0).max(*tok))
313 } else {
314 acc
315 }
316 });
317 self.log.add_note(format!(
318 "- creating state tables: state table {} entries, terminal table {} entries",
319 self.state_table.len(), self.terminal_table.len()));
320 match max_token_maybe {
321 Some(max_token) => {
322 if max_token == TokenId::MAX {
323 self.log.add_error(format!(" the token {} is taken, but it's reserved for illegal characters", TokenId::MAX));
324 }
325 }
326 None => {
327 self.log.add_error(" the lexer returns no tokens");
328 }
329 }
330 }
331
332 pub fn write_source_code(&self, file: Option<File>, indent: usize) -> Result<(), std::io::Error> {
333 let mut out: BufWriter<Box<dyn Write>> = match file {
334 Some(file) => BufWriter::new(Box::new(file)),
335 None => BufWriter::new(Box::new(std::io::stdout().lock()))
336 };
337 let source = self.gen_source_code(indent);
338 out.write_all(source.as_bytes())?;
339 Ok(())
341 }
342
343 pub fn gen_source_code(&self, indent: usize) -> String {
344 indent_source(vec![self.lexer_source_code()], indent)
345 }
346
347 pub fn try_gen_source_code(self, indent: usize) -> Result<(BufLog, String), BuildError> {
348 let src = self.gen_source_code(indent);
349 if self.log.has_no_errors() {
350 Ok((self.give_log(), src))
351 } else {
352 Err(BuildError::new(self.give_log(), BuildErrorSource::LexerGen))
353 }
354 }
355
356 fn lexer_source_code(&self) -> Vec<String> {
357 let mut source = self.options.headers.clone();
358
359 if !self.options.headers.is_empty() {
360 source.push(String::new());
361 }
362
363 source.push("use std::collections::HashMap;".to_string());
365 source.push("use std::io::Read;".to_string());
366 source.push(format!("use {}::lexer::{{ActionOption, Lexer, ModeOption, StateId, Terminal}};", self.options.lib_crate));
367 source.push(format!("use {}::segmap::{{GroupId, Seg, SegMap}};", self.options.lib_crate));
368 source.push(String::new());
369 source.push(format!("const NBR_GROUPS: u32 = {};", self.nbr_groups));
370 source.push(format!("const INITIAL_STATE: StateId = {};", self.initial_state));
371 source.push(format!("const FIRST_END_STATE: StateId = {};", self.first_end_state));
372 source.push(format!("const NBR_STATES: StateId = {};", self.nbr_states));
373 let mut groups = vec![BTreeSet::new(); self.nbr_groups as usize];
374 source.push("static ASCII_TO_GROUP: [GroupId; 128] = [".to_string());
375 for i in 0..8_usize {
376 let mut s = " ".to_string();
377 for j in 0..16_usize {
378 let ascii = i * 16 + j;
379 let group = self.ascii_to_group[i * 16 + j];
380 s.push_str(&format!("{:3}, ", group));
381 if group < self.nbr_groups {
382 groups[group as usize].insert(char::from(ascii as u8));
383 }
384 }
385 source.push(format!("{s} // {}-{}", i*16, i*16 + 15));
386 }
387 source.push("];".to_string());
388 source.push(format!("static UTF8_TO_GROUP: [(char, GroupId); {}] = [", self.utf8_to_group.len()));
389 let mut hashmap_content = self.utf8_to_group.iter().map(|(c, g)| format!(" ('{}', {}),", escape_char(*c), g)).to_vec();
390 hashmap_content.sort(); source.extend(hashmap_content);
392 source.push("];".to_string());
393 source.push(format!("static SEG_TO_GROUP: [(Seg, GroupId); {}] = [", self.seg_to_group.len()));
405 for (s, g) in &self.seg_to_group {
406 source.push(format!(" (Seg({}, {}), {}),", s.0, s.1, g));
407 }
408 source.push("];".to_string());
409 source.push(format!("static TERMINAL_TABLE: [Terminal;{}] = [", self.terminal_table.len()));
410 for t in &self.terminal_table {
411 source.push(format!(" Terminal {{ action: ActionOption::{:?}, channel: {}, mode: ModeOption::{:?}, mode_state: {:?}, pop: {} }},",
413 t.action, t.channel, t.mode, t.mode_state, t.pop
414 ));
415 }
416 source.push("];".to_string());
417 source.push(format!("static STATE_TABLE: [StateId; {}] = [", self.state_table.len()));
418 for i in 0..self.nbr_states {
419 source.push(format!(" {}, // state {}{}",
420 (0..self.nbr_groups as usize).map(|j| format!("{:3}", self.state_table[i * self.nbr_groups as usize + j])).join(", "),
421 i,
422 if i >= self.first_end_state { format!(" {}", self.terminal_table[i - self.first_end_state] ) } else { "".to_string() }
423 ));
424 }
425 source.push(format!(" {:3} // error group in [nbr_state * nbr_group + nbr_group]", self.state_table[self.state_table.len() - 1]));
426 source.push("];".to_string());
427 source.push(String::new());
428 source.push("pub fn build_lexer<R: Read>() -> Lexer<'static, R> {".to_string());
429 source.push(" Lexer::new(".to_string());
430 source.push(" // parameters".to_string());
431 source.push(" NBR_GROUPS,".to_string());
432 source.push(" INITIAL_STATE,".to_string());
433 source.push(" FIRST_END_STATE,".to_string());
434 source.push(" NBR_STATES,".to_string());
435 source.push(" // tables".to_string());
436 source.push(" &ASCII_TO_GROUP,".to_string());
437 source.push(" HashMap::<char, GroupId>::from(UTF8_TO_GROUP),".to_string());
438 source.push(" SegMap::<GroupId>::from(SEG_TO_GROUP),".to_string());
439 source.push(" &STATE_TABLE,".to_string());
440 source.push(" &TERMINAL_TABLE,".to_string());
441 source.push(" )".to_string());
442 source.push("}".to_string());
443 source
444 }
445}
446
447impl LogReader for LexerGen {
448 type Item = BufLog;
449
450 fn get_log(&self) -> &Self::Item {
451 &self.log
452 }
453
454 fn give_log(self) -> Self::Item {
455 self.log
456 }
457}
458
// Tags build errors originating from this generator as LexerGen errors.
impl HasBuildErrorSource for LexerGen {
    const SOURCE: BuildErrorSource = BuildErrorSource::LexerGen;
}
462
463impl BuildFrom<Dfa<Normalized>> for LexerGen {
464 fn build_from(dfa: Dfa<Normalized>) -> Self {
465 let mut lexgen = LexerGen::new();
466 lexgen.make_from_dfa(dfa);
467 lexgen
468 }
469}
470
471impl Default for LexerGenOptions {
472 fn default() -> Self {
473 LexerGenOptions {
474 headers: vec![],
475 lib_crate: LexigramCrate::Core,
476 }
477 }
478}
479
/// Partitions the input symbols of the state graph into equivalence groups.
///
/// On return, every transition's segment set in `g` is a union of the returned
/// `Segments`, so each group can be handled by a single table column. This is
/// alphabet compression by iterative partition refinement: per state, segments
/// leading to the same destination are merged, then each merged set `sub` is
/// intersected against the groups found so far, splitting groups as needed.
fn partition_symbols(g: &BTreeMap<StateId, BTreeMap<Segments, StateId>>) -> Vec<Segments> {
    const VERBOSE: bool = false;
    let mut groups = Vec::new();
    #[cfg(test)] if VERBOSE { print_graph(g, None, 4); }
    #[allow(clippy::for_kv_map)]
    for (_state, branches) in g {
        // Merge this state's segments by destination: symbols reaching the
        // same state are indistinguishable for this state's transitions.
        let mut map = BTreeMap::<StateId, Segments>::new();
        for (segments, destination) in branches {
            if let Some(i) = map.get_mut(destination) {
                i.extend(&mut segments.iter());
            } else {
                map.insert(*destination, segments.clone());
            }
        }
        for segments in map.values_mut() {
            segments.normalize();
        }
        #[cfg(test)] if VERBOSE { println!("{_state} => {}", map.values().map(|i| format!("{i:X}")).join(", ")); }
        let mut state_sub = map.into_values().collect::<BTreeSet<Segments>>();
        // Refine the global partition with each of this state's symbol sets.
        while let Some(mut sub) = state_sub.pop_first() {
            if VERBOSE { println!("- sub = {sub}"); }
            for i in 0..groups.len() {
                if VERBOSE { println!(" - groups[{i}] = {}", groups[i]); }
                let cmp = sub.intersect(&groups[i]);
                if cmp.common.is_empty() {
                    if VERBOSE { println!(" (disjoints)"); }
                } else {
                    // Overlap: the group shrinks to the shared part; the part of
                    // the group outside `sub` becomes a new group, and `sub`
                    // continues with its part not in the group (naming assumed
                    // from intersect(): common/external/internal — TODO confirm).
                    groups[i] = cmp.common;
                    if !cmp.external.is_empty() {
                        if VERBOSE { println!(" -> push {}", cmp.external); }
                        groups.push(cmp.external);
                    }
                    if VERBOSE { println!(" -> sub = {}", cmp.internal); }
                    sub = cmp.internal;
                    if sub.is_empty() {
                        break;
                    }
                }
            }
            // Symbols not matching any existing group form a group of their own.
            if !sub.is_empty() {
                groups.push(sub);
            }
        }
    }
    #[cfg(test)] if VERBOSE { println!("=> {}", groups.iter().map(|i| format!("{i:X}")).join(", ")); }
    groups
}