syntax_parser_generator/lex/lexical_analyzer.rs

use std::collections::HashMap;
use std::hash::Hash;

use crate::automata::dfa::Dfa;
use crate::automata::nfa::Nfa;
use crate::handles::specials::AutomaticallyHandled;
use crate::lex::{Lexeme, LexemeDescriptor};
use crate::lex::lexeme_iterator::LexemeIterator;
use crate::readers::Reader;

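// Treat raw bytes as automaton input symbols: a byte's handle serial is just its
// numeric value, so the DFA below runs over an alphabet of exactly 256 byte values.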
impl AutomaticallyHandled for u8 {
    type HandleCoreType = u8;
    fn serial(&self) -> usize {
        *self as usize
    }
}

/// A lexical analyzer.
///
/// A lexical analyzer is a computation unit (an automaton) capable of reading a stream
/// of characters and separating it into [Lexeme]s: atomic units of meaningful text
/// (tokens). See [crate::lex] for more detail.
pub struct LexicalAnalyzer<LexemeType> {
    dfa: Dfa<u8, LexemeType>,
}

// Earlier lexeme descriptors are prioritized
impl<LexemeType> LexicalAnalyzer<LexemeType>
where
    LexemeType: Hash + Eq + Clone,
{
    /// Builds a new [LexicalAnalyzer].
    ///
    /// The different `LexemeType`s that the analyzer will be capable of recognizing are described
    /// by `lexeme_descriptors`.
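    ///
    /// # Example
    ///
    /// A minimal, illustrative sketch (marked `ignore`): `if_pattern` and
    /// `identifier_pattern` stand in for patterns built via [crate::lex], `TokenType` is a
    /// hypothetical enum, and descriptors are assumed constructible from their `pattern`
    /// and `lexeme_type` fields:
    ///
    /// ```ignore
    /// let analyzer = LexicalAnalyzer::new(vec![
    ///     // Listed first, so it wins ties against equally long matches below.
    ///     LexemeDescriptor { pattern: if_pattern, lexeme_type: TokenType::If },
    ///     LexemeDescriptor { pattern: identifier_pattern, lexeme_type: TokenType::Identifier },
    /// ]);
    /// ```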
    pub fn new(
        lexeme_descriptors: Vec<LexemeDescriptor<LexemeType>>,
    ) -> LexicalAnalyzer<LexemeType> {
        let mut nfa = Nfa::new();
        let global_start_state = nfa.new_state();
        nfa.set_initial_state(global_start_state);

        let mut priority_map = HashMap::new();

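        // Compile each pattern into an NFA branch hanging off the shared start state.
        // A descriptor's position in `lexeme_descriptors` doubles as its priority
        // (lower index = higher priority).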
        for (
            priority,
            LexemeDescriptor {
                pattern,
                lexeme_type,
            },
        ) in lexeme_descriptors.iter().enumerate()
        {
            let (pattern_start_state, pattern_end_state) = pattern.build_into_nfa(&mut nfa);
            nfa.link(global_start_state, pattern_start_state, None);
            nfa.label(pattern_end_state, Some(lexeme_type.clone()));

            priority_map.insert(lexeme_type, priority);
        }

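        // When determinization merges several NFA accepting states into one DFA state,
        // keep the label whose descriptor appeared earliest (smallest priority index).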
        let dfa = nfa
            .compile_to_dfa(|labels| {
                labels
                    .iter()
                    .min_by_key(|&&lexeme_type| priority_map.get(lexeme_type))
                    .cloned()
                    .cloned()
            })
            .minimize();

        // Make sure the initial state is unlabeled, so we won't get stuck on the empty
        // lexeme when input is exhausted.
        let initial_state = dfa.get_initial_state().expect(
            "Minimized DFA should have an initial state, as the associated NFA had one",
        );
        if dfa.get_label(initial_state).is_some() {
            panic!(
                "Tried to create a lexical analyzer where some lexeme type is associated with a \
                regex accepting the empty string, which will make it get stuck on input exhaustion"
            )
        }

        LexicalAnalyzer { dfa }
    }

    /// Parses the stream of input text supplied by `reader`, and yields the lexemes it
    /// consists of.
    ///
    /// # Conflict Resolution
    ///
    /// - Longer lexemes are prioritized.
    /// - Among the matching `LexemeType`s for a lexeme of a given length, priority is given to the
    ///   `LexemeType` whose [LexemeDescriptor] was listed first during the [LexicalAnalyzer]'s
    ///   construction (see the `lexeme_descriptors` argument of [LexicalAnalyzer::new]).
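    ///
    /// For example, given a descriptor for the keyword `if` listed before a descriptor
    /// for identifiers, the input `iffy` is identified as a single identifier lexeme (the
    /// longer match wins), while the input `if` is identified as the keyword (the earlier
    /// descriptor wins the tie).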
    ///
    /// # Panics
    ///
    /// If no known `LexemeType` could be matched against a prefix of the remaining input.
    ///
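    /// # Example
    ///
    /// A minimal sketch (marked `ignore`), assuming a `Reader<u8>` over the input bytes
    /// (see [crate::readers]) and a hypothetical `TokenType`:
    ///
    /// ```ignore
    /// let lexemes: Vec<Lexeme<TokenType>> = analyzer.analyze(&mut reader).collect();
    /// ```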
    pub fn analyze<'a>(
        &'a self,
        reader: &'a mut impl Reader<u8>,
    ) -> impl Iterator<Item = Lexeme<LexemeType>> + 'a {
        LexemeIterator::new(self, reader)
    }

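    // Runs the DFA over the upcoming input using maximal munch: every time an accepting
    // state is reached, the reader's tail is moved there, and reading continues until the
    // DFA dies or input runs out, so the longest matching prefix is the one reported.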
    fn identify_next_lexeme(
        &self,
        reader: &mut impl Reader<u8>,
    ) -> LexemeIdentificationResult<LexemeType> {
        let mut recent_lexeme_type: Option<LexemeType> = None;
        let mut current_state = self.dfa.get_initial_state();

        let mut is_string_empty = true;

        loop {
            match current_state {
                None => break,

                Some(state) => {
                    if let Some(lexeme_type) = self.dfa.get_label(state) {
                        recent_lexeme_type = Some(lexeme_type.clone());
                        reader.set_tail();
                    }

                    match reader.read_next() {
                        None => {
                            break;
                        }
                        Some(next_byte) => {
                            current_state = self.dfa.step(state, next_byte.handle());
                            is_string_empty = false;
                        }
                    }
                }
            }
        }

        if is_string_empty {
            LexemeIdentificationResult::InputExhausted
        } else if let Some(lexeme_type) = recent_lexeme_type {
            LexemeIdentificationResult::Identified(lexeme_type)
        } else {
            // We read some data, but couldn't match any lexeme against the available prefix
            LexemeIdentificationResult::LexicalError
        }
    }

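    // Identifies the next lexeme, extracts its contents from the reader, and advances
    // the reader past it. Returns None once the input is exhausted.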
    pub(super) fn collect_next_lexeme(
        &self,
        reader: &mut impl Reader<u8>,
    ) -> Option<Lexeme<LexemeType>> {
        let lexeme_type = loop {
            match self.identify_next_lexeme(reader) {
                LexemeIdentificationResult::Identified(lexeme_type) => break lexeme_type,
                LexemeIdentificationResult::InputExhausted => return None,
                LexemeIdentificationResult::LexicalError => {
                    self.error_recovery_routine(reader);
                }
            }
        };

        let contents = String::from_utf8(reader.get_sequence().collect())
            .expect("Tokens from lexically-analyzed Reader<u8> are expected to be UTF-8 encoded");
        let lexeme = Lexeme {
            lexeme_type,
            contents,
        };
        reader.restart_from_tail();
        Some(lexeme)
    }

    fn error_recovery_routine(&self, _reader: &mut impl Reader<u8>) {
        // TODO make this configurable
        panic!("Reader had a lexical error in it, and error recovery is not yet implemented");
    }
}

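// The outcome of a single attempt to identify the next lexeme in the input.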
enum LexemeIdentificationResult<LexemeType> {
    Identified(LexemeType),
    InputExhausted,
    LexicalError,
}