// syntax_parser_generator/lex/lexical_analyzer.rs
use std::collections::HashMap;
2use std::hash::Hash;
3
4use crate::automata::dfa::Dfa;
5use crate::automata::nfa::Nfa;
6use crate::handles::specials::AutomaticallyHandled;
7use crate::lex::{Lexeme, LexemeDescriptor};
8use crate::lex::lexeme_iterator::LexemeIterator;
9use crate::readers::Reader;
10
11impl AutomaticallyHandled for u8 {
12 type HandleCoreType = u8;
13 fn serial(&self) -> usize {
14 *self as usize
15 }
16}
17
/// A lexical analyzer (tokenizer): turns a stream of input bytes into a
/// stream of [`Lexeme`]s, each tagged with a `LexemeType`.
pub struct LexicalAnalyzer<LexemeType> {
    // Compiled, minimized DFA driving recognition: transitions are keyed by
    // input bytes, and accepting states are labeled with lexeme types.
    dfa: Dfa<u8, LexemeType>,
}
26
27impl<LexemeType> LexicalAnalyzer<LexemeType>
29where
30 LexemeType: Hash + Eq + Clone,
31{
32 pub fn new(
37 lexeme_descriptors: Vec<LexemeDescriptor<LexemeType>>,
38 ) -> LexicalAnalyzer<LexemeType> {
39 let mut nfa = Nfa::new();
40 let global_start_state = nfa.new_state();
41 nfa.set_initial_state(global_start_state);
42
43 let mut priority_map = HashMap::new();
44
45 for (
46 priority,
47 LexemeDescriptor {
48 pattern,
49 lexeme_type,
50 },
51 ) in lexeme_descriptors.iter().enumerate()
52 {
53 let (pattern_start_state, pattern_end_state) = pattern.build_into_nfa(&mut nfa);
54 nfa.link(global_start_state, pattern_start_state, None);
55 nfa.label(pattern_end_state, Some(lexeme_type.clone()));
56
57 priority_map.insert(lexeme_type, priority);
58 }
59
60 let dfa = nfa
61 .compile_to_dfa(|labels| {
62 labels
63 .iter()
64 .min_by_key(|&&lexeme_type| priority_map.get(lexeme_type))
65 .cloned()
66 .cloned()
67 })
68 .minimize();
69
70 let initial_state = dfa.get_initial_state().expect(
72 "Minimized DFA should have an initial state, as the associated NFA had one, and \
73 the initial state ",
74 );
75 if !dfa.get_label(initial_state).is_none() {
76 panic!(
77 "Tried to create a lexical analyzer where some lexeme type is associated with a \
78 regex accepting the empty string, which will make it get stuck on input exhaustion"
79 )
80 }
81
82 LexicalAnalyzer { dfa }
83 }
84
85 pub fn analyze<'a>(
100 &'a self,
101 reader: &'a mut impl Reader<u8>,
102 ) -> impl Iterator<Item=Lexeme<LexemeType>> + 'a {
103 LexemeIterator::new(self, reader)
104 }
105
106 fn identify_next_lexeme(
107 &self,
108 reader: &mut impl Reader<u8>,
109 ) -> LexemeIdentificationResult<LexemeType> {
110 let mut recent_lexeme_type: Option<LexemeType> = None;
111 let mut current_state = self.dfa.get_initial_state();
112
113 let mut is_string_empty = true;
114
115 loop {
116 match current_state {
117 None => break,
118
119 Some(state) => {
120 if let Some(lexeme_type) = self.dfa.get_label(state) {
121 recent_lexeme_type = Some(lexeme_type.clone());
122 reader.set_tail();
123 }
124
125 match reader.read_next() {
126 None => {
127 break;
128 }
129 Some(next_byte) => {
130 current_state = self.dfa.step(state, next_byte.handle());
131 is_string_empty = false;
132 }
133 }
134 }
135 }
136 }
137
138 return if is_string_empty {
139 LexemeIdentificationResult::InputExhausted
140 } else if let Some(lexeme_type) = recent_lexeme_type {
141 LexemeIdentificationResult::Identified(lexeme_type)
142 } else {
143 LexemeIdentificationResult::LexicalError
145 };
146 }
147
148 pub(super) fn collect_next_lexeme(
149 &self,
150 reader: &mut impl Reader<u8>,
151 ) -> Option<Lexeme<LexemeType>> {
152 let lexeme_type = loop {
153 match self.identify_next_lexeme(reader) {
154 LexemeIdentificationResult::Identified(lexeme_type) => break lexeme_type,
155 LexemeIdentificationResult::InputExhausted => return None,
156 LexemeIdentificationResult::LexicalError => {
157 self.error_recovery_routine(reader);
158 }
159 }
160 };
161
162 let contents = String::from_utf8(reader.get_sequence().collect())
163 .expect("Tokens from lexically-analyzed Reader<u8> are expected to be UTF-8 encoded");
164 let lexeme = Lexeme {
165 lexeme_type,
166 contents,
167 };
168 reader.restart_from_tail();
169 Some(lexeme)
170 }
171
172 fn error_recovery_routine(&self, _reader: &mut impl Reader<u8>) {
173 panic!("Reader had a lexical error in it, and error recovery is not yet implemented");
175 }
176}
177
/// Internal outcome of a single lexeme-identification attempt.
enum LexemeIdentificationResult<LexemeType> {
    /// The next lexeme's type was successfully identified.
    Identified(LexemeType),
    /// No bytes were left to read: the input is fully consumed.
    InputExhausted,
    /// Bytes were consumed but no pattern ever matched.
    LexicalError,
}