1pub(crate) mod tests;
4
5use std::collections::{BTreeMap, BTreeSet, HashMap};
6use std::fmt::Display;
7use std::fs::File;
8use std::io::{BufWriter, Read, Write};
9use iter_index::IndexerIterator;
10use lexigram_core::CollectJoin;
11#[cfg(test)]
12use crate::dfa::print_graph;
13use crate::{indent_source, Normalized, SymbolTable, TokenId};
14use crate::char_reader::escape_char;
15use crate::lexer::{ActionOption, Lexer, StateId, Terminal};
16use lexigram_core::log::{BufLog, LogReader, LogStatus, Logger};
17use crate::build::{BuildError, BuildErrorSource, BuildFrom, HasBuildErrorSource, TryBuildFrom};
18use crate::segments::Segments;
19use crate::segmap::{char_to_group, GroupId, Seg, SegMap};
20use super::dfa::*;
21
22pub struct LexerTables {
32 nbr_groups: u32,
34 initial_state: StateId,
35 first_end_state: StateId, nbr_states: StateId, ascii_to_group: Vec<GroupId>,
39 utf8_to_group: HashMap<char, GroupId>,
40 seg_to_group: SegMap<GroupId>,
41 state_table: Vec<StateId>,
42 terminal_table: Vec<Terminal>, }
44
45impl LexerTables {
46 pub fn new(
47 nbr_groups: u32,
49 initial_state: StateId,
50 first_end_state: StateId, nbr_states: StateId, ascii_to_group: Vec<GroupId>,
54 utf8_to_group: HashMap<char, GroupId>,
55 seg_to_group: SegMap<GroupId>,
56 state_table: Vec<StateId>,
57 terminal_table: Vec<Terminal>, ) -> Self {
59 LexerTables {
60 nbr_groups,
61 initial_state,
62 first_end_state,
63 nbr_states,
64 ascii_to_group,
65 utf8_to_group,
66 seg_to_group,
67 state_table,
68 terminal_table,
69 }
70 }
71
72 pub fn make_lexer<R: Read>(&self) -> Lexer<'_, R> {
73 Lexer::new(
74 self.nbr_groups,
75 self.initial_state,
76 self.first_end_state,
77 self.nbr_states,
78 self.ascii_to_group.as_slice(),
79 self.utf8_to_group.clone(),
80 self.seg_to_group.clone(),
81 self.state_table.as_slice(),
82 self.terminal_table.as_slice(),
83 )
84 }
85}
86
87impl BuildFrom<LexerGen> for LexerTables {
88 fn build_from(lexer_gen: LexerGen) -> LexerTables {
89 assert!(!lexer_gen.state_table.is_empty(), "tables are not built");
90 LexerTables::new(
91 lexer_gen.nbr_groups,
92 lexer_gen.initial_state,
93 lexer_gen.first_end_state,
94 lexer_gen.nbr_states,
95 lexer_gen.ascii_to_group,
96 lexer_gen.utf8_to_group,
97 lexer_gen.seg_to_group,
98 lexer_gen.state_table,
99 lexer_gen.terminal_table,
100 )
101 }
102}
103
104impl TryBuildFrom<LexerGen> for LexerTables {
106 type Error = BuildError;
107
108 fn try_build_from(source: LexerGen) -> Result<Self, Self::Error> {
109 if source.get_log().has_no_errors() {
110 Ok(LexerTables::build_from(source))
111 } else {
112 Err(BuildError::new(source.give_log(), BuildErrorSource::LexerGen))
113 }
114 }
115}
116
/// Selects which lexigram crate the generated source's `use` statements refer to.
#[derive(Clone, Default, PartialEq, Debug)]
pub enum LexigramCrate {
    /// `lexigram_core` (the default).
    #[default]
    Core,
    /// `lexigram_lib` (the full library).
    Full,
    /// A caller-supplied crate name.
    Custom(String),
}
130
131impl Display for LexigramCrate {
132 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
133 write!(f, "{}", match self {
134 LexigramCrate::Core => "lexigram_core",
135 LexigramCrate::Full => "lexigram_lib",
136 LexigramCrate::Custom(s) => s.as_str(),
137 })
138 }
139}
140
141pub struct LexerGen {
142 pub max_utf8_chars: u32,
144 pub nbr_groups: u32,
145 pub initial_state: StateId,
146 pub first_end_state: StateId, pub nbr_states: StateId, pub ascii_to_group: Vec<GroupId>,
150 pub utf8_to_group: HashMap<char, GroupId>,
151 pub seg_to_group: SegMap<GroupId>,
152 pub state_table: Vec<StateId>,
153 pub terminal_table: Vec<Terminal>, pub symbol_table: Option<SymbolTable>,
155 log: BufLog,
157 group_partition: Segments, headers: Vec<String>,
159 lib_crate: LexigramCrate,
160}
161
162impl LexerGen {
163 pub const DEFAULT_UTF8_TABLE_SIZE: u32 = 128;
164
165 fn new() -> Self {
166 LexerGen {
167 max_utf8_chars: Self::DEFAULT_UTF8_TABLE_SIZE,
168 nbr_groups: 0,
169 initial_state: 0,
170 first_end_state: 0,
171 nbr_states: 0,
172 ascii_to_group: vec![GroupId::MAX; 128],
173 utf8_to_group: HashMap::default(),
174 seg_to_group: SegMap::new(),
175 state_table: Vec::new(),
176 terminal_table: Vec::new(),
177 symbol_table: None,
178 log: BufLog::new(),
179 group_partition: Segments::empty(),
180 headers: Vec::new(),
181 lib_crate: LexigramCrate::Core,
182 }
183 }
184
185 #[inline]
186 pub fn add_header<T: Into<String>>(&mut self, header: T) {
187 self.headers.push(header.into());
188 }
189
190 #[inline]
191 pub fn extend_headers<I: IntoIterator<Item=T>, T: Into<String>>(&mut self, headers: I) {
192 self.headers.extend(headers.into_iter().map(|s| s.into()));
193 }
194
195 #[inline]
196 pub fn use_full_lib(&mut self, use_full_lib: bool) {
197 self.lib_crate = if use_full_lib { LexigramCrate::Full } else { LexigramCrate::Core };
198 }
199
200 #[inline]
201 pub fn set_crate(&mut self, lcrate: LexigramCrate) {
202 self.lib_crate = lcrate;
203 }
204
205 pub fn build_from_dfa(dfa: Dfa<Normalized>, max_utf8_chars: u32) -> Self {
206 let mut lexergen = Self::new();
207 lexergen.max_utf8_chars = max_utf8_chars;
208 lexergen.make_from_dfa(dfa);
209 lexergen
210 }
211
212 fn make_from_dfa(&mut self, mut dfa: Dfa<Normalized>) {
213 self.log.extend(std::mem::replace(&mut dfa.log, BufLog::new()));
214 self.log.add_note("creating lexer from DFA...");
215 self.create_input_tables(&dfa);
216 self.create_state_tables(&dfa);
217 }
218
219 fn create_input_tables(&mut self, dfa: &Dfa<Normalized>) {
220 const MAX_UTF8_SEG_RANGE: u32 = 16;
223 const VERBOSE: bool = false;
224 let symbol_part = partition_symbols(dfa.get_state_graph());
225 let symbol_to_group = SegMap::from_iter(
226 symbol_part.iter().index().flat_map(|(id, i)| i.iter().map(move |ab| (*ab, id)))
227 );
228 self.group_partition = Segments::from_iter(symbol_to_group.keys().cloned());
229
230 if VERBOSE {
231 println!("symbol partition:{}\ntables:", symbol_to_group.iter()
232 .map(|(seg, g)| format!("\n- {seg} -> {g}")).collect::<String>());
233 }
234 self.nbr_groups = symbol_part.len() as GroupId;
235 let error_id = self.nbr_groups as GroupId;
236 self.ascii_to_group.fill(error_id);
237 self.utf8_to_group.clear();
238 self.seg_to_group.clear();
239 let mut left = self.max_utf8_chars;
240 for (seg, group_id) in symbol_to_group {
241 if VERBOSE { println!("Seg: {}-{}", seg.0, seg.1); }
242 if seg.0 < 128 {
243 if VERBOSE {
244 println!("- ASCII: {}-{} ({}-{}) => {group_id}",
245 escape_char(char::from_u32(seg.0).unwrap()), escape_char(char::from_u32(seg.1.min(127)).unwrap()),
246 seg.0, seg.1.min(127));
247 }
248 for b in seg.0..=seg.1.min(127) {
249 self.ascii_to_group[b as usize] = group_id;
250 }
251 }
252 if seg.1 >= 128 {
253 let mut low = 128.max(seg.0);
254 let high = seg.1.min(low + left - 1);
255 if seg.1 - low < MAX_UTF8_SEG_RANGE {
256 if left > 0 {
257 for u in low..=high {
258 if VERBOSE { println!("- UTF8: {} ({u}) => {group_id}", escape_char(char::from_u32(u).unwrap())); }
259 self.utf8_to_group.insert(char::from_u32(u).unwrap(), group_id);
260 }
261 left -= 1 + high - low;
262 }
263 low = high + 1;
264 }
265 if low <= seg.1 {
266 if VERBOSE {
267 println!("- SEG: {}-{} ({}-{}) => {group_id}",
268 escape_char(char::from_u32(high + 1).unwrap()), escape_char(char::from_u32(seg.1).unwrap()),
269 low, seg.1);
270 }
271 self.seg_to_group.insert(Seg(low, seg.1), group_id);
272 }
273 }
274 }
275 self.log.add_note(format!(
276 "- creating input tables: ASCII {} entries, UTF8 {} entries, segments {} entries",
277 self.ascii_to_group.len(), self.utf8_to_group.len(), self.seg_to_group.len()));
278 }
279
280 fn create_state_tables(&mut self, dfa: &Dfa<Normalized>) {
281 const VERBOSE: bool = false;
282 self.initial_state = dfa.get_initial_state().unwrap();
283 self.first_end_state = dfa.get_first_end_state().unwrap();
284 self.nbr_states = dfa.get_state_graph().len();
285 let nbr_states = dfa.get_state_graph().len();
286 let mut state_table = vec!(self.nbr_states; self.nbr_groups as usize * nbr_states + 1);
289 for (state_from, trans) in dfa.get_state_graph() {
290 if VERBOSE { println!("state {state_from}"); }
291 for (segments, state_to) in trans {
292 if VERBOSE { println!("- {segments} -> state {state_to}"); }
293 let mut segments_part = segments.clone();
294 segments_part.slice_partitions(&self.group_partition);
295 for seg in segments_part.iter() {
296 let symbol = char::from_u32(seg.0).unwrap();
297 let symbol_group = char_to_group(&self.ascii_to_group, &self.utf8_to_group, &self.seg_to_group, symbol).unwrap_or(self.nbr_groups);
298 state_table[self.nbr_groups as usize * state_from + symbol_group as usize] = *state_to;
299 }
300 }
301 }
302 self.state_table = state_table;
303 let terminal_table = dfa.get_end_states().iter()
304 .filter_map(|(&st, t)| if st >= self.first_end_state { Some(t.clone()) } else { None })
305 .to_vec();
306 self.terminal_table = terminal_table;
307 let max_token_maybe = self.terminal_table.iter().fold(None, { |acc, t|
308 if let Terminal { action: ActionOption::Token(tok), .. } = t {
309 Some(acc.unwrap_or(0).max(*tok))
310 } else {
311 acc
312 }
313 });
314 self.log.add_note(format!(
315 "- creating state tables: state table {} entries, terminal table {} entries",
316 self.state_table.len(), self.terminal_table.len()));
317 match max_token_maybe {
318 Some(max_token) => {
319 if max_token == TokenId::MAX {
320 self.log.add_error(format!(" the token {} is taken, but it's reserved for illegal characters", TokenId::MAX));
321 }
322 }
323 None => {
324 self.log.add_error(" the lexer returns no tokens");
325 }
326 }
327 }
328
329 pub fn write_source_code(&self, file: Option<File>, indent: usize) -> Result<(), std::io::Error> {
330 let mut out: BufWriter<Box<dyn Write>> = match file {
331 Some(file) => BufWriter::new(Box::new(file)),
332 None => BufWriter::new(Box::new(std::io::stdout().lock()))
333 };
334 let source = self.gen_source_code(indent);
335 out.write(source.as_bytes())?;
336 Ok(())
338 }
339
340 pub fn gen_source_code(&self, indent: usize) -> String {
341 indent_source(vec![self.lexer_source_code()], indent)
342 }
343
344 pub fn try_gen_source_code(self, indent: usize) -> Result<(BufLog, String), BuildError> {
345 let src = self.gen_source_code(indent);
346 if self.log.has_no_errors() {
347 Ok((self.give_log(), src))
348 } else {
349 Err(BuildError::new(self.give_log(), BuildErrorSource::LexerGen))
350 }
351 }
352
353 fn lexer_source_code(&self) -> Vec<String> {
354 let mut source = self.headers.clone();
355
356 if !self.headers.is_empty() {
357 source.push(String::new());
358 }
359
360 source.push(format!("use std::collections::HashMap;"));
362 source.push(format!("use std::io::Read;"));
363 source.push(format!("use {}::lexer::{{ActionOption, Lexer, ModeOption, StateId, Terminal}};", self.lib_crate));
364 source.push(format!("use {}::segmap::{{GroupId, Seg, SegMap}};", self.lib_crate));
365 source.push(String::new());
366 source.push(format!("const NBR_GROUPS: u32 = {};", self.nbr_groups));
367 source.push(format!("const INITIAL_STATE: StateId = {};", self.initial_state));
368 source.push(format!("const FIRST_END_STATE: StateId = {};", self.first_end_state));
369 source.push(format!("const NBR_STATES: StateId = {};", self.nbr_states));
370 let mut groups = vec![BTreeSet::new(); self.nbr_groups as usize];
371 source.push(format!("static ASCII_TO_GROUP: [GroupId; 128] = ["));
372 for i in 0..8_usize {
373 let mut s = " ".to_string();
374 for j in 0..16_usize {
375 let ascii = i * 16 + j;
376 let group = self.ascii_to_group[i * 16 + j];
377 s.push_str(&format!("{:3}, ", group));
378 if group < self.nbr_groups {
379 groups[group as usize].insert(char::from(ascii as u8));
380 }
381 }
382 source.push(format!("{s} // {}-{}", i*16, i*16 + 15));
383 }
384 source.push(format!("];"));
385 source.push(format!("static UTF8_TO_GROUP: [(char, GroupId); {}] = [", self.utf8_to_group.len()));
386 let mut hashmap_content = self.utf8_to_group.iter().map(|(c, g)| format!(" ('{}', {}),", escape_char(*c), g)).to_vec();
387 hashmap_content.sort(); source.extend(hashmap_content);
389 source.push(format!("];"));
390 source.push(format!("static SEG_TO_GROUP: [(Seg, GroupId); {}] = [", self.seg_to_group.len()));
402 for (s, g) in &self.seg_to_group {
403 source.push(format!(" (Seg({}, {}), {}),", s.0, s.1, g));
404 }
405 source.push(format!("];"));
406 source.push(format!("static TERMINAL_TABLE: [Terminal;{}] = [", self.terminal_table.len()));
407 for t in &self.terminal_table {
408 source.push(format!(" Terminal {{ action: ActionOption::{:?}, channel: {}, mode: ModeOption::{:?}, mode_state: {:?}, pop: {} }},",
410 t.action, t.channel, t.mode, t.mode_state, t.pop
411 ));
412 }
413 source.push(format!("];"));
414 source.push(format!("static STATE_TABLE: [StateId; {}] = [", self.state_table.len()));
415 for i in 0..self.nbr_states as usize {
416 source.push(format!(" {}, // state {}{}",
417 (0..self.nbr_groups as usize).map(|j| format!("{:3}", self.state_table[i * self.nbr_groups as usize + j])).join(", "),
418 i,
419 if i >= self.first_end_state { format!(" {}", self.terminal_table[i - self.first_end_state] ) } else { "".to_string() }
420 ));
421 }
422 source.push(format!(" {:3} // error group in [nbr_state * nbr_group + nbr_group]", self.state_table[self.state_table.len() - 1]));
423 source.push(format!("];"));
424 source.push(String::new());
425 source.push(format!("pub fn build_lexer<R: Read>() -> Lexer<'static, R> {{"));
426 source.push(format!(" Lexer::new("));
427 source.push(format!(" // parameters"));
428 source.push(format!(" NBR_GROUPS,"));
429 source.push(format!(" INITIAL_STATE,"));
430 source.push(format!(" FIRST_END_STATE,"));
431 source.push(format!(" NBR_STATES,"));
432 source.push(format!(" // tables"));
433 source.push(format!(" &ASCII_TO_GROUP,"));
434 source.push(format!(" HashMap::<char, GroupId>::from(UTF8_TO_GROUP),"));
435 source.push(format!(" SegMap::<GroupId>::from(SEG_TO_GROUP),"));
436 source.push(format!(" &STATE_TABLE,"));
437 source.push(format!(" &TERMINAL_TABLE,"));
438 source.push(format!(" )"));
439 source.push(format!("}}"));
440 source
441 }
442}
443
impl LogReader for LexerGen {
    type Item = BufLog;

    /// Borrows the accumulated build log.
    fn get_log(&self) -> &Self::Item {
        &self.log
    }

    /// Consumes the generator, yielding its build log.
    fn give_log(self) -> Self::Item {
        self.log
    }
}
455
// Build errors produced while generating lexer tables are tagged as
// originating from the lexer generator.
impl HasBuildErrorSource for LexerGen {
    const SOURCE: BuildErrorSource = BuildErrorSource::LexerGen;
}
459
460impl BuildFrom<Dfa<Normalized>> for LexerGen {
461 fn build_from(dfa: Dfa<Normalized>) -> Self {
462 let mut lexgen = LexerGen::new();
463 lexgen.make_from_dfa(dfa);
464 lexgen
465 }
466}
467
/// Partitions the DFA's input alphabet into the coarsest set of symbol groups
/// such that, from every state, all symbols of a group transition to the same
/// destination. Returns one `Segments` per group.
fn partition_symbols(g: &BTreeMap<StateId, BTreeMap<Segments, StateId>>) -> Vec<Segments> {
    const VERBOSE: bool = false;
    let mut groups = Vec::new();
    #[cfg(test)] if VERBOSE { print_graph(g, None, 4); }
    for (_state, branches) in g {
        // Merge all segments of this state that lead to the same destination,
        // then normalize each merged set.
        let mut map = BTreeMap::<StateId, Segments>::new();
        for (segments, destination) in branches {
            if let Some(i) = map.get_mut(destination) {
                i.extend(&mut segments.iter());
            } else {
                map.insert(*destination, segments.clone());
            }
        }
        for segments in map.values_mut() {
            segments.normalize();
        }
        #[cfg(test)] if VERBOSE { println!("{_state} => {}", map.values().map(|i| format!("{i:X}")).join(", ")); }
        // Refine the accumulated groups with each of this state's symbol sets:
        // every existing group is split into the part shared with `sub`
        // (kept in place) and the part outside it (pushed as a new group).
        let mut state_sub = map.into_values().collect::<BTreeSet<Segments>>();
        while let Some(mut sub) = state_sub.pop_first() {
            if VERBOSE { println!("- sub = {sub}"); }
            for i in 0..groups.len() {
                if VERBOSE { println!("  - groups[{i}] = {}", groups[i]); }
                let cmp = sub.intersect(&groups[i]);
                if cmp.common.is_empty() {
                    if VERBOSE { println!("    (disjoints)"); }
                } else {
                    // Shrink the group to the shared part; the remainder of
                    // the group becomes a new group of its own.
                    groups[i] = cmp.common;
                    if !cmp.external.is_empty() {
                        if VERBOSE { println!("    -> push {}", cmp.external); }
                        groups.push(cmp.external);
                    }
                    if VERBOSE { println!("    -> sub = {}", cmp.internal); }
                    // Continue refining with the part of `sub` not yet matched.
                    // (Groups pushed above are disjoint from it, so the range
                    // computed at loop entry is sufficient.)
                    sub = cmp.internal;
                    if sub.is_empty() {
                        break;
                    }
                }
            }
            // Symbols not covered by any existing group form a new group.
            if !sub.is_empty() {
                groups.push(sub);
            }
        }
    }
    #[cfg(test)] if VERBOSE { println!("=> {}", groups.iter().map(|i| format!("{i:X}")).join(", ")); }
    groups
}