scnr/lib.rs
1#![forbid(missing_docs)]
2//! # `scnr`
3//! The `scnr` crate is a library that provides a lexical scanner for programming languages.
4//! It is designed to be used in a parser of a compiler or interpreter for a programming language
5//! or in similar tools that require lexical analysis, e.g. in a language server.
6//! It provides multiple scanner modes out of the box, which can be switched at runtime depending
7//! on the context of the input.
8//! A parser can use different modes for different parts of the input, e.g. to scan comments in one
9//! mode and code in another.
10//! The scanner is designed to be fast and efficient, and it is implemented with the help of
11//! finite state machines.
12//! To parse the given regular expressions, the crate uses the `regex-syntax` crate.
13//!
14//! # Example with a simple pattern list
15//! ```rust
16//! use scnr::ScannerBuilder;
17//!
18//! static PATTERNS: &[&str] = &[
19//!     r";", // Semicolon
20//!     r"0|[1-9][0-9]*", // Number
21//!     r"//.*(\r\n|\r|\n)", // Line comment
22//!     r"/\*([^*]|\*[^/])*\*/", // Block comment
23//!     r"[a-zA-Z_]\w*", // Identifier
24//!     r"=", // Assignment
25//! ];
26//!
27//! const INPUT: &str = r#"
28//! // This is a comment
29//! a = 10;
30//! b = 20;
31//! /* This is a block comment
32//!    that spans multiple lines */
33//! c = a;
34//! "#;
35//!
36//! fn main() {
37//!     let scanner = ScannerBuilder::new()
38//!         .add_patterns(PATTERNS)
39//!         .build()
40//!         .expect("ScannerBuilder error");
41//!     let find_iter = scanner.find_iter(INPUT);
42//!     for ma in find_iter {
43//!         println!("Match: {:?}: '{}'", ma, &INPUT[ma.span().range()]);
44//!     }
45//! }
46//! ```
47//! The output of the example is:
48//! ```text
49//! Match: Match { token_type: 2, span: Span { start: 1, end: 22 } }: '// This is a comment
50//! '
51//! Match: Match { token_type: 4, span: Span { start: 22, end: 23 } }: 'a'
52//! Match: Match { token_type: 5, span: Span { start: 24, end: 25 } }: '='
53//! Match: Match { token_type: 1, span: Span { start: 26, end: 28 } }: '10'
54//! Match: Match { token_type: 0, span: Span { start: 28, end: 29 } }: ';'
55//! Match: Match { token_type: 4, span: Span { start: 30, end: 31 } }: 'b'
56//! Match: Match { token_type: 5, span: Span { start: 32, end: 33 } }: '='
57//! Match: Match { token_type: 1, span: Span { start: 34, end: 36 } }: '20'
58//! Match: Match { token_type: 0, span: Span { start: 36, end: 37 } }: ';'
59//! Match: Match { token_type: 3, span: Span { start: 38, end: 96 } }: '/* This is a block comment
60//!    that spans multiple lines */'
61//! Match: Match { token_type: 4, span: Span { start: 97, end: 98 } }: 'c'
62//! Match: Match { token_type: 5, span: Span { start: 99, end: 100 } }: '='
63//! Match: Match { token_type: 4, span: Span { start: 101, end: 102 } }: 'a'
64//! Match: Match { token_type: 0, span: Span { start: 102, end: 103 } }: ';'
65//! ```
66//!
67//! # Example with scanner modes and position information
68//! ```rust
69//! use std::sync::LazyLock;
70//!
71//! use scnr::{MatchExtIterator, Pattern, ScannerBuilder, ScannerMode};
72//!
73//! static SCANNER_MODES: LazyLock<Vec<ScannerMode>> = LazyLock::new(|| {
74//!     vec![
75//!         ScannerMode::new(
76//!             "INITIAL",
77//!             vec![
78//!                 Pattern::new(r"\r\n|\r|\n".to_string(), 0), // Newline
79//!                 Pattern::new(r"[a-zA-Z_]\w*".to_string(), 4), // Identifier
80//!                 Pattern::new(r#"""#.to_string(), 6), // String delimiter
81//!             ],
82//!             vec![
83//!                 (6, 1), // Token "String delimiter" -> Mode "STRING"
84//!             ],
85//!         ),
86//!         ScannerMode::new(
87//!             "STRING",
88//!             vec![
89//!                 Pattern::new(r#"""#.to_string(), 6), // String delimiter
90//!                 Pattern::new(r#"[^"]+"#.to_string(), 5), // String content
91//!             ],
92//!             vec![
93//!                 (6, 0), // Token "String delimiter" -> Mode "INITIAL"
94//!             ],
95//!         ),
96//!     ]
97//! });
98//!
99//! const INPUT: &str = r#"Id1 "1. String" "2. String""#;
100//!
101//! fn main() {
102//!     let scanner = ScannerBuilder::new()
103//!         .add_scanner_modes(&SCANNER_MODES)
104//!         .build()
105//!         .expect("ScannerBuilder error");
106//!     let find_iter = scanner.find_iter(INPUT).with_positions();
107//!     for ma in find_iter {
108//!         println!("{:?}: '{}'", ma, &INPUT[ma.span().range()]);
109//!     }
110//! }
111//! ```
112//!
113//! The output of this example is:
114//! ```text
115//! MatchExt { token_type: 4, span: Span { start: 0, end: 3 }, start_position: Position { line: 1, column: 1 }, end_position: Position { line: 1, column: 4 } }: 'Id1'
116//! MatchExt { token_type: 6, span: Span { start: 4, end: 5 }, start_position: Position { line: 1, column: 5 }, end_position: Position { line: 1, column: 6 } }: '"'
117//! MatchExt { token_type: 5, span: Span { start: 5, end: 14 }, start_position: Position { line: 1, column: 6 }, end_position: Position { line: 1, column: 15 } }: '1. String'
118//! MatchExt { token_type: 6, span: Span { start: 14, end: 15 }, start_position: Position { line: 1, column: 15 }, end_position: Position { line: 1, column: 16 } }: '"'
119//! MatchExt { token_type: 6, span: Span { start: 16, end: 17 }, start_position: Position { line: 1, column: 17 }, end_position: Position { line: 1, column: 18 } }: '"'
120//! MatchExt { token_type: 5, span: Span { start: 17, end: 26 }, start_position: Position { line: 1, column: 18 }, end_position: Position { line: 1, column: 27 } }: '2. String'
121//! MatchExt { token_type: 6, span: Span { start: 26, end: 27 }, start_position: Position { line: 1, column: 27 }, end_position: Position { line: 1, column: 28 } }: '"'
122//! ```
123//!
124//! # Crate features
125//! The crate has the following features:
126//! - `default`: This is the default feature set. When it is enabled it uses the `scnr` crate's own
127//! regex engine.
128//!
129//! - `regex_automata`: This feature is not enabled by default. It instructs the library to use the
130//! `regex-automata` crate as its regex engine.
131//!
132//! The two features are mutually exclusive: you can enable one of them, but not both at the same time.
133//!
134//! With the default feature, scanning the input is usually slower, but compiling the regexes is
135//! faster. With the `regex_automata` feature, scanning the input is faster, but compiling the
136//! regexes may be slower, depending on the size of your scanner modes, i.e. the number of regexes
137//! you use.
138
139/// Module with error definitions
140mod errors;
141pub use errors::{Result, ScnrError, ScnrErrorKind};
142
143/// Module that provides a FindMatches type
144mod find_matches;
145pub use find_matches::{FindMatches, PeekResult};
146
147/// The module with internal implementation details.
148mod internal;
149
150/// Module that provides a Match type
151mod match_type;
152pub use match_type::{Match, MatchExt};
153
154/// Module that provides a Pattern type and a Lookahead type
155mod pattern;
156pub use pattern::{Lookahead, Pattern};
157
158/// Module that provides a position type
159mod position;
160pub use position::{Position, PositionProvider};
161
162/// The module with the scanner.
163mod scanner;
164pub use scanner::{Scanner, ScannerModeSwitcher};
165
166/// The module with the scanner builder.
167mod scanner_builder;
168pub use scanner_builder::ScannerBuilder;
169
170/// The module with the scanner mode.
171mod scanner_mode;
172pub use scanner_mode::ScannerMode;
173
174/// Module that provides a Span type
175mod span;
176pub use span::Span;
177
178/// Module that provides a WithPositions type
179mod with_positions;
180pub use with_positions::{MatchExtIterator, WithPositions};