alkale 2.0.0

A simple LL(1) lexer library for Rust.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
//! Container module for the [`SourceCodeScanner`] type.

use core::cell::UnsafeCell;
use core::str;

use crate::span::Span;

/// u8-indexable array representing how long a utf-8 character
/// is, in bytes, based on the first byte.
///
/// `0` indicates this is an invalid start character.
const CHAR_LENGTHS: [u8; 256] = [
    // Second nibble
    // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 First nibble
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
    4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, // F
];

/// Shift the value 6 bits to the left and insert the lower
/// 6 bits of the byte into the now-empty space.
#[inline(always)]
const fn append_continuation_byte(base: u32, byte: u8) -> u32 {
    (base << 6) | (byte & 0b00111111) as u32
}

/// An iterator-like interface into source code. This type is not thread-safe.
///
/// This type is the backbone of all Alkale parsers, serving as the main way
/// source code is read and converted into [`Token`][crate::token::Token]s
/// and [`Notification`][crate::notification::Notification]s.
///
/// A scanner can be created simply with the [`new`][Self::new] method.
///
/// # Core Methods
/// These are the main methods used to interface with a scanner. More low-level
/// and specific versions of these methods exist, however. Such methods will be documented in
/// each core method's "See Also" section.
///
/// - [`next`][Self::next]: Consume the next character and return it.
/// - [`skip`][Self::skip]: Skip the next character.
/// - [`peek`][Self::peek]: Read the next character without consuming it.
/// - [`has_next`][Self::has_next]: Return whether more characters are left to consume.
/// - [`span`][Self::char_index]: Returns the [`Span`] the next character in the scanner.
///
/// # Additional Methods
/// If the `common` feature is not disabled, Alkale features many additional methods on
/// this type. They use the following naming conventions.
///
/// ## `try_`
/// Methods with a `try_` prefix will return an [`Option`]. If [`None`] is returned,
/// then these methods will be no-ops and consume 0 characters.
///
/// ## `consume_`
/// Methods with a `consume_` prefix will, as their name suggests, consume a section of
/// source code and return a slice to it.
///
/// ## `skip_`
/// Methods with a `skip_` prefix will continuously consume characters until EOF
/// or until a certain character type is found. They are always no-ops if already at
/// EOF or if the next character already is of the expected type.
///
/// ## `capture_`
/// Methods with a `capture_` prefix take a predicate and execute it, returning some
/// kind of metadata that would be repetitive or slow to collect manually.
///
/// ## `parse_`
/// Methods with a `parse_` prefix act similarly to `consume_` methods, but their
/// data gets processed in some way beforehand.
///
#[derive(Debug)]
pub struct SourceCodeScanner<'src> {
    /// The source byte array, this is always valid UTF-8.
    source_bytes: &'src [u8],
    /// Static pointer to the byte after the final byte in the source.
    final_ptr: *const u8,
    /// Pointer to the first byte of the next character in this stream.
    cur_char_ptr: UnsafeCell<*const u8>,
    /// Pointer to the first byte of the character *after* the next
    /// character. If `next_char_ptr == cur_char_ptr`, we must be at the end of the
    /// stream.
    next_char_ptr: UnsafeCell<*const u8>,
    /// The character pointed to by `cur_char_ptr`. If `cur_char_ptr` is equal
    /// to `next_char_ptr`, this is garbage data.
    cached_char: UnsafeCell<char>,
    /// The current char index in the source code— if 5 chars have been consumed
    /// so far, then this should be 5.
    char_index: UnsafeCell<usize>,
}

impl<'src> SourceCodeScanner<'src> {
    /// Create a new [`SourceCodeScanner`] for the given
    /// source code.
    #[must_use]
    pub fn new(source: &'src str) -> Self {
        // SAFETY: Safe because it remains in-bounds of the
        // source array. See ::add documentation.
        let final_ptr = unsafe { source.as_ptr().add(source.len()) };

        let out = Self {
            source_bytes: source.as_bytes(),
            final_ptr,
            next_char_ptr: UnsafeCell::new(source.as_ptr()),
            cur_char_ptr: UnsafeCell::new(source.as_ptr()),
            cached_char: UnsafeCell::new(' '), // Doesn't matter what char
            char_index: UnsafeCell::new(0),
        };

        out.refresh_cache();

        out
    }

    /// Align `next_char_ptr` to its proper position and refresh the char cache.
    ///
    /// This method is most useful after calls to
    /// [`skip_without_cache`][Self::skip_without_cache] to re-synchronize the
    /// scanner's character cache to resume normal usage.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// // Progress the scanner to 'B' but don't notify
    /// // it of the change.
    /// unsafe { scanner.skip_without_cache(); }
    ///
    /// // Outdated value returned.
    /// assert_eq!(scanner.peek(), Some('A'));
    ///
    /// // Tell the cache to refresh its state.
    /// scanner.refresh_cache();
    ///
    /// // Proper value returned now.
    /// assert_eq!(scanner.peek(), Some('B'));
    /// ```
    #[inline]
    pub fn refresh_cache(&self) {
        // SAFETY: No pointers are leaked outside of this method.
        unsafe {
            let cur_char_ptr = *self.cur_char_ptr.get();
            let next_char_ptr = self.next_char_ptr.get();

            // If no chars remain, next_char = cur_char.
            if cur_char_ptr >= self.final_ptr {
                *next_char_ptr = cur_char_ptr;
                return;
            }

            let (char, ptr) = self.read_next_char();
            *next_char_ptr = ptr;
            *self.cached_char.get() = char;
        }
    }

    /// Shift the given pointer forwards 1 byte, returning
    /// the byte that it previously pointed to.
    ///
    /// # Safety
    /// This method assumes that `ptr + 1` will not
    /// overflow [`isize::MAX`].
    #[inline(always)]
    unsafe fn take_byte(ptr: *mut *const u8) -> u8 {
        let result = **ptr;
        *ptr = (*ptr).byte_add(1);
        result
    }

    /// Read the next character in the source code.
    /// This will return the parsed character as well as a pointer to the byte
    /// after it. No state is modified.
    ///
    /// # Safety
    /// There must be at least 1 character left to consume.
    unsafe fn read_next_char(&self) -> (char, *const u8) {
        let mut ptr = *self.cur_char_ptr.get();

        let initial_byte = Self::take_byte(&mut ptr);

        // Mat: [0wwwwwww, ...]
        // Res: 0000000000000000000000000wwwwwww
        // This is an ascii char, cheap return.
        if initial_byte <= 0x7F {
            return (initial_byte as char, ptr);
        }

        // Mat: [1??wwwww, XXxxxxxx, ...]
        // Res: 000000000000000000000wwwwwxxxxxx
        // 2+ byte char
        let mut result = append_continuation_byte(
            (initial_byte & 0b00011111).into(),
            Self::take_byte(&mut ptr),
        );

        if initial_byte >= 0xE0 {
            // Mat: [111wwwww, XXxxxxxx, YYyyyyyy, ...]
            // Res: 000000000000000wwwwwxxxxxxyyyyyy
            // 3+ byte char
            result = append_continuation_byte(result, Self::take_byte(&mut ptr));

            if initial_byte >= 0xF0 {
                // Mat: [11110www, XXxxxxxx, YYyyyyyy, ZZzzzzzz, ...]
                // Res: 00000000010wwwxxxxxxyyyyyyzzzzzz
                // 4 byte char; The '1' bit above is now erroneous
                result = append_continuation_byte(result, Self::take_byte(&mut ptr));
                result &= 0b11111111_10111111_11111111_11111111;
            }
        }

        (char::from_u32_unchecked(result), ptr)
    }

    /// Same as [`read_next_char`][Self::read_next_char] but doesn't produce
    /// a [`char`] output, only a pointer. This is faster when usable.
    ///
    /// # Safety
    /// There must be at least 1 character left to consume.
    unsafe fn skip_next_char(&self) -> *const u8 {
        let ptr = *self.cur_char_ptr.get();
        let initial_byte = *ptr;
        let len = CHAR_LENGTHS.get_unchecked(initial_byte as usize);
        ptr.byte_add(*len as usize)
    }

    /// Returns `true` if at least 1 character is left to be
    /// scanned.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// assert_eq!(scanner.has_next(), true);
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.has_next(), true);
    ///
    /// scanner.skip();
    /// scanner.skip();
    /// assert_eq!(scanner.has_next(), false);
    /// ```
    #[inline]
    pub fn has_next(&self) -> bool {
        // SAFETY: Not being cast or exposed publicly.
        unsafe { *self.cur_char_ptr.get() < self.final_ptr }
    }

    /// Consume the next character in the source code and return it. This
    /// will advance the iterator. Returns [None] if no characters remain.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// assert_eq!(scanner.next(), Some('A'));
    /// assert_eq!(scanner.next(), Some('B'));
    /// assert_eq!(scanner.next(), Some('C'));
    /// assert_eq!(scanner.next(), None);
    /// ```
    ///
    /// # See Also
    /// If no return value is needed, [`skip`][Self::skip] is more idiomatic.
    #[inline]
    #[expect(clippy::if_then_some_else_none)]
    pub fn next(&self) -> Option<char> {
        // SAFETY: UnsafeCells are not being cast or exposed publicly.
        // `read_next_char` preconditions are checked.
        unsafe {
            if self.has_next() {
                let cached_char = self.cached_char.get();
                let next_char = self.next_char_ptr.get();

                // Copy the char cache to return
                let result = *cached_char;

                // Move cur_char forward to the next_char.
                *self.cur_char_ptr.get() = *next_char;

                // Calculate next_char_ptr and refresh cache
                if self.has_next() {
                    let (char, ptr) = self.read_next_char();
                    *next_char = ptr;
                    *cached_char = char;
                }

                // Increment char index
                // SAFETY: Will never overflow because this can only increase
                // every time a character is consumes— characters are stored
                // in-memory and thus have an upper bound of usize as well.
                *self.char_index.get() = (*self.char_index.get()).unchecked_add(1);

                Some(result)
            } else {
                None
            }
        }
    }

    /// Skip the next character in this iterator. No-op if no characters remain.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// assert_eq!(scanner.peek(), Some('A'));
    ///
    /// scanner.skip();
    /// scanner.skip();
    /// assert_eq!(scanner.peek(), Some('C'));
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.peek(), None);
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.peek(), None);
    /// ```
    ///
    /// # See Also
    /// [`skip_without_cache`][Self::skip_without_cache] may be more appropriate
    /// in certain situations.
    #[inline]
    pub fn skip(&self) {
        // SAFETY: UnsafeCells are not being cast or exposed publicly.
        // `read_next_char` preconditions are checked.
        unsafe {
            if self.has_next() {
                let next_char = self.next_char_ptr.get();

                // Move cur_char forward to the next_char.
                *self.cur_char_ptr.get() = *next_char;

                // Calculate next_char_ptr and refresh cache
                if self.has_next() {
                    let (char, ptr) = self.read_next_char();
                    *next_char = ptr;
                    *self.cached_char.get() = char;
                }

                // Increment char index
                // SAFETY: Will never overflow because this can only increase
                // every time a character is consumes— characters are stored
                // in-memory and thus have an upper bound of usize as well.
                *self.char_index.get() = (*self.char_index.get()).unchecked_add(1);
            }
        }
    }

    /// Skip the next character in this iterator without updating the char cache.
    ///
    /// This will not update the scanner's internal char cache, thus calling [`next`][Self::next],
    /// [`peek`][Self::peek], or any other char-based methods after this will result the *wrong value*
    /// being returned.
    ///
    /// This method can be far more performant if characters are going to be collected in a
    /// non-char-by-char way, for example, with [`consume_range`][Self::consume_range].
    ///
    /// # Safety
    /// Ensure that no char-collecting methods are called after this.
    ///
    /// # Recovery
    /// After calling this method, the scanner can be considered to be in an *invalid state*.
    /// The [`skip`][Self::skip] and [`refresh_cache`][Self::refresh_cache] methods are the primary
    /// ways of restoring the scanner back to a valid state.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// // Progress the scanner to 'B' but don't notify
    /// // it of the change.
    /// unsafe { scanner.skip_without_cache(); }
    ///
    /// // Outdated value returned.
    /// assert_eq!(scanner.peek(), Some('A'));
    ///
    /// // Tell the cache to refresh its state.
    /// scanner.refresh_cache();
    ///
    /// // Proper value returned now.
    /// assert_eq!(scanner.peek(), Some('B'));
    /// ```
    ///
    /// # See Also:
    /// If the cache needs to be immidiately updated, use [`skip`][Self::skip] instead.
    #[inline]
    pub unsafe fn skip_without_cache(&self) {
        // SAFETY: UnsafeCells are not being cast or exposed publicly.
        // `read_next_char` preconditions are checked.
        unsafe {
            if self.has_next() {
                let next_char = self.next_char_ptr.get();

                // Move cur_char forward to the next_char.
                *self.cur_char_ptr.get() = *next_char;

                // Calculate next_char_ptr and refresh cache
                if self.has_next() {
                    *next_char = self.skip_next_char();
                }

                // Increment char index
                // SAFETY: Will never overflow because this can only increase
                // every time a character is consumes— characters are stored
                // in-memory and thus have an upper bound of usize as well.
                *self.char_index.get() = (*self.char_index.get()).unchecked_add(1);
            }
        }
    }

    /// Peek the next character in the source code without progressing the iterator.
    /// Returns [`None`] if no characters remain.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// assert_eq!(scanner.peek(), Some('A'));
    ///
    /// scanner.skip();
    /// scanner.skip();
    /// assert_eq!(scanner.peek(), Some('C'));
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.peek(), None);
    /// ```
    #[inline]
    pub fn peek(&self) -> Option<char> {
        // SAFETY: Char is returned copied, no ref leaked.
        unsafe { self.has_next().then(|| *self.cached_char.get()) }
    }

    /// Get the current character index of this stream.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// let scanner = SourceCodeScanner::new("Aµc");
    ///
    /// assert_eq!(scanner.char_index(), 0);
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.char_index(), 1);
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.char_index(), 2);
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.char_index(), 3);
    /// ```
    ///
    /// # See Also
    /// Often it is easier to work with [`Span`]s than raw character indices.
    /// Use [`span`][Self::span] to get a [`Span`] over the next character in the
    /// scanner.
    #[inline]
    pub fn char_index(&self) -> usize {
        // SAFETY: Type is returned by-value via Copy.
        unsafe { *self.char_index.get() }
    }

    /// Execute the input predicate, the range of source code that was consumed
    /// is returned as a [`prim@str`] reference.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new("abcdef");
    ///
    /// let capture = scanner.consume_range(|| {
    ///     scanner.skip();
    ///     scanner.skip();
    ///     scanner.skip();
    /// });
    ///
    /// assert_eq!(capture, "abc");
    /// ```
    ///
    /// # See Also
    /// If the `common` feature is not disabled, other `consume_` methods are
    /// generally preferred over this.
    #[inline]
    pub fn consume_range<F: FnOnce()>(&self, predicate: F) -> &'src str {
        // SAFETY: As long as `predicate` only performs valid operations
        // on the underlying bytestream, this is valid. The user will need
        // `unsafe` code to violate this contract regardless.
        unsafe {
            let initial_pointer = *self.cur_char_ptr.get();

            predicate();

            let final_pointer = *self.cur_char_ptr.get();

            let arr_ptr = self.source_bytes.as_ptr();
            let initial_index = initial_pointer.byte_offset_from(arr_ptr) as usize;
            let final_index = final_pointer.byte_offset_from(arr_ptr) as usize;
            str::from_utf8_unchecked(self.source_bytes.get_unchecked(initial_index..final_index))
        }
    }

    /// Get a [`Span`] of the character currently pointed to by this reader. (That is, the
    /// character returned by [`next`][Self::next]) The length of this span is usually `1`.
    ///
    /// If no next character exists, this [`Span`] will be the index *after* the last character, with a
    /// length of `0`.
    ///
    /// This method may be useful when a position in the source code needs to be "saved" for
    /// later reference. For example, at the beginning of a multi-character token like a number.
    /// When using it for this purpose, [`range`][Span::range] or [`up_to`][Span::up_to] may be helpful.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// # unsafe {
    /// assert_eq!(scanner.span(), Span::new(0, 1));
    ///
    /// scanner.skip();
    /// scanner.skip();
    /// assert_eq!(scanner.span(), Span::new(2, 1));
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.span(), Span::new(3, 0));
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.span(), Span::new(3, 0));
    /// # }
    /// ```
    #[must_use]
    #[inline]
    pub fn span(&self) -> Span {
        // SAFETY: char_index is always in bounds. Length
        // is set appropriately
        unsafe {
            if self.has_next() {
                Span::new(self.char_index(), 1)
            } else {
                Span::new(self.char_index(), 0)
            }
        }
    }
}