syntax_parser_generator/lex/mod.rs
1//! Build and manage lexical analyzers.
2//!
3//! The first step of the syntax-parsing pipeline is called _lexical-analysis_. During this phase,
4//! the input text is separated into consecutive sequences of characters that have some atomic
5//! syntactic meaning, known as [lexemes](Lexeme) (or tokens).
6//!
7//! Lexemes are usually classified into categories, or _lexeme types_, that identify "groups" of
8//! lexemes that have similar syntactic meaning: identifier, integer literal, operator, white
9//! space, etc. Each category is specified by a [LexemeDescriptor], which defines the
10//! [regex](regex::Regex) pattern that matches lexemes of that type.
11//!
12//! Finally, the computational unit responsible for extracting the lexemes that a given input text
13//! consists of is known as a [lexical analyzer](LexicalAnalyzer), and is compiled from a set of
14//! [LexemeDescriptor]s.
15//!
16//! # Example
17//! ```rust
18//! # use syntax_parser_generator::lex::*;
19//! # use syntax_parser_generator::readers::ByteArrayReader;
20//! # use syntax_parser_generator::lex::Regex;
21//! # #[derive(Debug, Clone, Eq, Hash, PartialEq)]
22//! # enum MyLexemeType { Integer, Addition, NotANumber }
23//! let lexical_analyzer = LexicalAnalyzer::new(vec![
24//!
25//! // Integer literals
26//! LexemeDescriptor::new(
27//! MyLexemeType::Integer,
28//! Regex::concat(vec![
29//! Regex::optional(
30//! Regex::union(vec![Regex::single_char('+'), Regex::single_char('-')])
31//! ),
32//! Regex::plus_from(Regex::character_range('0', '9')),
33//! ])
34//! ),
35//!
36//! // The addition operator
37//! LexemeDescriptor::special_char(MyLexemeType::Addition, '+'),
38//!
39//! // Invalid numbers
40//! LexemeDescriptor::keyword(MyLexemeType::NotANumber, "NaN"),
41//! ]);
42//!
43//! // Use the lexical analyzer to parse structured input text
44//! let input_text = &mut ByteArrayReader::from_string(String::from("-2+NaN+-45"));
45//! let extracted_lexemes = lexical_analyzer.analyze(input_text);
46//!
47//! // Validate the parsed output
48//! let actual_lexemes = vec![
49//! Lexeme::new(MyLexemeType::Integer, "-2"),
50//! Lexeme::new(MyLexemeType::Addition, "+"),
51//! Lexeme::new(MyLexemeType::NotANumber, "NaN"),
52//! Lexeme::new(MyLexemeType::Addition, "+"),
53//! Lexeme::new(MyLexemeType::Integer, "-45"),
54//! ];
55//! assert_eq!(extracted_lexemes.collect::<Vec<Lexeme<MyLexemeType>>>(), actual_lexemes);
56//! ```
57
58pub use lexeme::{Lexeme, LexemeDescriptor};
59pub use lexical_analyzer::LexicalAnalyzer;
60pub use regex::Regex;
61
// Submodules are declared without `pub`, so they are private to `lex`; the types callers
// need are exposed through the `pub use` re-exports in this file.
62mod regex; // source of the re-exported `Regex`
63mod lexeme; // source of the re-exported `Lexeme` and `LexemeDescriptor`
64mod lexical_analyzer; // source of the re-exported `LexicalAnalyzer`
// NOTE(review): nothing from `lexeme_iterator` is re-exported here; presumably it backs the
// iterator returned by `LexicalAnalyzer::analyze` — confirm.
65mod lexeme_iterator;
66
// Unit tests; this module is compiled only when `cfg(test)` is active.
67#[cfg(test)]
68mod tests;