alkale/
lib.rs

1//! Alkale is a library focused on streamlining the production of hand-written lexers.
2//!
3//! A lexer, generally speaking, is a function that converts source code into
4//! a [`FinalizedLexerResult`].
5//!
6//! A typical lexer function will look something like this.
7//! ```rust
8//! use alkale::{SourceCodeScanner, LexerResult, FinalizedLexerResult};
9//!
10//! enum MyTokenData {
11//!     // ...
12//! }
13//!
14//! fn lexer(source: &str) -> FinalizedLexerResult<MyTokenData> {
15//!     // This will serve as an interface into the code for processing.
16//!     let scanner = SourceCodeScanner::new(source);
17//!
18//!     // This serves as a collection of our produced tokens and notifications.
19//!     let mut result = LexerResult::new();
20//!
21//!     while scanner.has_next() {
22//!         // Main body goes here, processing the scanner into
23//!         // tokens and notifications to be passed into the result.
24//!     }
25//!
26//!     // Finalize the result and return it.
27//!     result.finalize()
28//! }
29//! ```
30//!
31//! Many methods exist on [`SourceCodeScanner`] to consume the source code in various ways,
32//! see its documentation for more details. Regardless, valid data from the source code should be
33//! converted into [`Token`]s, and invalid data into [`Notification`]s, both to be reported to the
34//! [`LexerResult`]. These four datatypes, as well as [`Span`][crate::span::Span] (used to create
35//! [`Token`]s) pretty much make up the backbone of every single lexer.
36//!
37//! # Features
38//! Alkale has a single feature, `common`. This feature is enabled by default and
39//! introduces a huge amount of helper methods to [`SourceCodeScanner`] for things
40//! such as number parsing, strings, identifiers, etc.
41//!
42//! # Example
43//! Here is an example of a simple lexer that tokenizes words in the program,
44//! ignoring whitespace and throwing an error for everything else.
45//!
46//! ```rust
47//! use alkale::{
48//!     format_notification, notification::NotificationSeverity, token::Token, FinalizedLexerResult,
49//!     LexerResult, SourceCodeScanner,
50//! };
51//!
52//! type Word<'a> = &'a str;
53//!
54//! fn lexer(source: &str) -> FinalizedLexerResult<Word<'_>> {
55//!     let scanner = SourceCodeScanner::new(source);
56//!     let mut result = LexerResult::new();
57//!
58//!     while scanner.has_next() {
59//!         // Try to parse out a word.
60//!         if let Some(identifier) = scanner.try_consume_standard_identifier() {
61//!             // We found a word, push it and restart the loop.
62//!             result.push_token(Token::from_spanned(identifier));
63//!             continue;
64//!         }
65//!
66//!         // No word was found, consume one character.
67//!         if let Some(char) = scanner.next_span() {
68//!             // If this character wasn't whitespace (i.e. illegal char) then
69//!             // report a notification.
70//!             if !char.is_whitespace() {
71//!                 format_notification!("Unrecognized character '{}'", char.data)
72//!                     .span(char.span)
73//!                     .severity(NotificationSeverity::Error)
74//!                     .report(&mut result);
75//!             }
76//!         }
77//!     }
78//!
79//!     result.finalize()
80//! }
81//! ```
82//!
83//! This example should give a basic overview of what the main loop of a lexer should look like.
//! Check for a pattern; if it is found, parse it into a token and restart the loop. If the pattern
//! wasn't found, continue onto the next pattern until you reach a base case.
86
87#![cfg_attr(docsrs, feature(doc_auto_cfg))]
88
89use core::fmt::Debug;
90
91use notification::{Notification, NotificationAcceptor, NotificationList};
92use token::Token;
93
94#[cfg(feature = "common")]
95pub mod common;
96pub mod notification;
97mod scanner;
98pub mod span;
99pub mod token;
100
101pub use scanner::SourceCodeScanner;
102
/// Used to accumulate [`Notification`]s and [`Token`]s during lexing.
///
/// `TokenData` is the lexer-specific payload carried by each [`Token`];
/// `N` is the custom data attached to each [`Notification`] (defaults to `()`).
///
/// Once lexing is complete, [`finalize`][Self::finalize] can convert this
/// into a [`FinalizedLexerResult`] to be handled as appropriate.
#[derive(Debug)]
pub struct LexerResult<TokenData, N = ()> {
    /// The notifications that were reported by the lexer.
    notifications: NotificationList<N>,
    /// The vec of tokens generated by the lexer.
    tokens: Vec<Token<TokenData>>,
}
114
impl<T, N> NotificationAcceptor<N> for LexerResult<T, N> {
    /// Report a [`Notification`] by appending it to this result's
    /// internal [`NotificationList`].
    #[inline]
    fn report(&mut self, notification: Notification<N>) {
        self.notifications.push(notification);
    }
}
121
122impl<T, N> Default for LexerResult<T, N> {
123    fn default() -> Self {
124        Self {
125            notifications: NotificationList::new(),
126            tokens: vec![],
127        }
128    }
129}
130
131impl<T, N> LexerResult<T, N> {
132    /// Create a new [`LexerResult`].
133    #[must_use]
134    pub const fn new() -> Self {
135        Self {
136            notifications: NotificationList::new(),
137            tokens: vec![],
138        }
139    }
140
141    /// Push a token to the internal token list.
142    #[inline]
143    pub fn push_token(&mut self, token: Token<T>) {
144        self.tokens.push(token);
145    }
146
147    /// Convert this [`LexerResult`] into a [`FinalizedLexerResult`], containing the
148    /// [`Token`] list as well as any generated [`Notification`]s.
149    #[inline]
150    #[must_use]
151    pub fn finalize(self) -> FinalizedLexerResult<T, N> {
152        FinalizedLexerResult {
153            notifications: self.notifications,
154            tokens: self.tokens,
155        }
156    }
157}
158
/// The final result of a lexer. This is returned by
/// a [`LexerResult`]'s [`finalize`][LexerResult::finalize]
/// method.
///
/// `TokenData` is the lexer-specific payload carried by each token;
/// `N` is the custom data attached to each notification (defaults to `()`).
#[derive(Debug)]
pub struct FinalizedLexerResult<TokenData, N = ()> {
    /// The notifications that were reported by the lexer.
    notifications: NotificationList<N>,
    /// The vec of tokens generated by the lexer.
    tokens: Vec<Token<TokenData>>,
}
169
170impl<T, N> FinalizedLexerResult<T, N> {
171    /// Get this result's [`NotificationList`], containing all reported notifications.
172    #[inline]
173    #[must_use]
174    pub const fn notifications(&self) -> &NotificationList<N> {
175        &self.notifications
176    }
177
178    /// Get this result's list of tokens. This method will return [None] if at least 1
179    /// reported notification is severity [`Error`][notification::NotificationSeverity::Error] or higher
180    #[inline]
181    #[must_use]
182    pub fn tokens(&self) -> Option<&Vec<Token<T>>> {
183        self.notifications.is_valid().then_some(&self.tokens)
184    }
185
186    /// Returns a 2-tuple containing this result's [`NotificationList`] and tokens.
187    ///
188    /// If the notification list contains at least 1 notification of severity error or higher, then
189    /// this pair will contain no tokens. ([`None`])
190    #[inline]
191    #[must_use]
192    pub fn into_pair(self) -> (NotificationList<N>, Option<Vec<Token<T>>>) {
193        let valid = self.notifications.is_valid();
194
195        (self.notifications, valid.then_some(self.tokens))
196    }
197}
198
#[cfg(test)]
mod tests {
    use crate::{span::Span, token::Token, LexerResult, SourceCodeScanner};

    /// Build the scanner that every test starts from.
    pub fn common() -> SourceCodeScanner<'static> {
        SourceCodeScanner::new("Testing.")
    }

    #[test]
    fn next() {
        let scanner = common();

        for expected in "Testing.".chars() {
            assert_eq!(scanner.next(), Some(expected));
        }
        assert_eq!(scanner.next(), None);
    }

    #[test]
    fn next_span() {
        let scanner = common();

        // SAFETY: Spans are all valid.
        unsafe {
            // The source is pure ASCII, so each char occupies exactly one byte.
            let mut start = 0;
            for expected in "Testing.".chars() {
                assert_eq!(
                    scanner.next_span(),
                    Some(Span::new(start, 1).wrap(expected))
                );
                start += 1;
            }
            assert_eq!(scanner.next_span(), None);
        }
    }

    #[test]
    fn peek() {
        let scanner = common();

        assert_eq!(scanner.peek(), Some('T'));
        assert_eq!(scanner.next(), Some('T'));
        assert_eq!(scanner.next(), Some('e'));
        assert_eq!(scanner.next(), Some('s'));
        assert_eq!(scanner.next(), Some('t'));
        assert_eq!(scanner.peek(), Some('i'));
        assert_eq!(scanner.next(), Some('i'));
        assert_eq!(scanner.next(), Some('n'));
        assert_eq!(scanner.next(), Some('g'));
        assert_eq!(scanner.peek(), Some('.'));
        assert_eq!(scanner.next(), Some('.'));
        assert_eq!(scanner.peek(), None);
        assert_eq!(scanner.next(), None);
        assert_eq!(scanner.peek(), None);
    }

    #[test]
    fn has_next() {
        let scanner = common();

        assert_eq!(scanner.next(), Some('T'));
        assert_eq!(scanner.next(), Some('e'));
        assert!(scanner.has_next());
        assert_eq!(scanner.next(), Some('s'));
        assert_eq!(scanner.next(), Some('t'));
        assert!(scanner.has_next());
        assert_eq!(scanner.next(), Some('i'));
        assert_eq!(scanner.next(), Some('n'));
        assert_eq!(scanner.next(), Some('g'));
        assert!(scanner.has_next());
        assert_eq!(scanner.next(), Some('.'));
        assert!(!scanner.has_next());
        assert_eq!(scanner.next(), None);
        assert!(!scanner.has_next());
    }

    #[test]
    fn skip() {
        let scanner = common();

        assert_eq!(scanner.next(), Some('T'));
        assert_eq!(scanner.next(), Some('e'));
        scanner.skip();
        assert_eq!(scanner.next(), Some('t'));
        assert_eq!(scanner.next(), Some('i'));
        scanner.skip();
        assert_eq!(scanner.next(), Some('g'));
        scanner.skip();
        assert_eq!(scanner.next(), None);
    }

    #[test]
    fn peek_is() {
        let scanner = common();

        assert_eq!(scanner.next(), Some('T'));
        assert_eq!(scanner.next(), Some('e'));
        assert!(scanner.peek_is('s'));
        assert_eq!(scanner.next(), Some('s'));
        assert_eq!(scanner.next(), Some('t'));
        assert_eq!(scanner.next(), Some('i'));
        assert!(scanner.peek_is('n'));
        assert_eq!(scanner.next(), Some('n'));
        assert!(scanner.peek_is_not('P'));
        assert_eq!(scanner.next(), Some('g'));
        assert!(scanner.peek_is('.'));
        assert_eq!(scanner.next(), Some('.'));
        assert!(scanner.peek_is_not('P'));
        assert_eq!(scanner.next(), None);
        assert!(scanner.peek_is_not('P'));
    }

    #[test]
    fn peek_is_map() {
        let scanner = common();

        assert!(scanner.peek_is_map(|c| c.is_uppercase()));
        assert_eq!(scanner.next(), Some('T'));
        assert_eq!(scanner.next(), Some('e'));
        assert!(!scanner.peek_is_map(|c| c.is_uppercase()));
        assert_eq!(scanner.next(), Some('s'));
        assert_eq!(scanner.next(), Some('t'));
        assert_eq!(scanner.next(), Some('i'));
        assert!(scanner.peek_is_map(|c| c.is_lowercase()));
        assert_eq!(scanner.next(), Some('n'));
        assert_eq!(scanner.next(), Some('g'));
        assert!(!scanner.peek_is_map(|c| c.is_lowercase()));
        assert_eq!(scanner.next(), Some('.'));
        assert!(!scanner.peek_is_map(|_| true));
        assert_eq!(scanner.next(), None);
        assert!(!scanner.peek_is_map(|_| true));
    }

    #[test]
    fn span() {
        let scanner = common();

        // SAFETY: Spans are all valid.
        unsafe {
            assert_eq!(scanner.next(), Some('T'));
            assert_eq!(scanner.next(), Some('e'));
            assert_eq!(scanner.span(), Span::new_single(2));
            assert_eq!(scanner.next(), Some('s'));
            assert_eq!(scanner.next(), Some('t'));
            assert_eq!(scanner.next(), Some('i'));
            assert_eq!(scanner.span(), Span::new_single(5));
            assert_eq!(scanner.next(), Some('n'));
            assert_eq!(scanner.next(), Some('g'));
            assert_eq!(scanner.span(), Span::new_single(7));
            assert_eq!(scanner.span(), Span::new_single(7));
            assert_eq!(scanner.next(), Some('.'));
            assert_eq!(scanner.span(), Span::new_empty(8));
            assert_eq!(scanner.next(), None);
            assert_eq!(scanner.span(), Span::new_empty(8));
        }
    }

    // Exercises push_token() and finalize() together.
    #[test]
    fn meta() {
        let mut result = LexerResult::<_, ()>::new();

        // SAFETY: Spans are all valid.
        unsafe {
            let first = Token::new("bloo", Span::new(0, 4));
            let second = Token::new("ploo", Span::new(4, 4));

            result.push_token(first.clone());
            result.push_token(second.clone());

            let finalized = result.finalize();

            assert_eq!(finalized.notifications.into_sorted_vec(), vec![]);

            assert_eq!(finalized.tokens, vec![first, second]);
        }
    }
}