lady_deirdre/lexis/
buffer.rs

1////////////////////////////////////////////////////////////////////////////////
2// This file is part of "Lady Deirdre", a compiler front-end foundation       //
3// technology.                                                                //
4//                                                                            //
5// This work is proprietary software with source-available code.              //
6//                                                                            //
7// To copy, use, distribute, or contribute to this work, you must agree to    //
8// the terms of the General License Agreement:                                //
9//                                                                            //
10// https://github.com/Eliah-Lakhin/lady-deirdre/blob/master/EULA.md           //
11//                                                                            //
12// The agreement grants a Basic Commercial License, allowing you to use       //
13// this work in non-commercial and limited commercial products with a total   //
14// gross revenue cap. To remove this commercial limit for one of your         //
15// products, you must acquire a Full Commercial License.                      //
16//                                                                            //
17// If you contribute to the source code, documentation, or related materials, //
18// you must grant me an exclusive license to these contributions.             //
19// Contributions are governed by the "Contributions" section of the General   //
20// License Agreement.                                                         //
21//                                                                            //
22// Copying the work in parts is strictly forbidden, except as permitted       //
23// under the General License Agreement.                                       //
24//                                                                            //
25// If you do not or cannot agree to the terms of this Agreement,              //
26// do not use this work.                                                      //
27//                                                                            //
28// This work is provided "as is", without any warranties, express or implied, //
29// except where such disclaimers are legally invalid.                         //
30//                                                                            //
31// Copyright (c) 2024 Ilya Lakhin (Илья Александрович Лахин).                 //
32// All rights reserved.                                                       //
33////////////////////////////////////////////////////////////////////////////////
34
35use std::{
36    borrow::Cow,
37    fmt::{Debug, Display, Formatter},
38    iter::{FusedIterator, Peekable, Take},
39    slice::Iter,
40    str::Chars,
41};
42
43use crate::{
44    arena::{Entry, Id, Identifiable},
45    format::SnippetFormatter,
46    lexis::{
47        cursor::TokenBufferCursor,
48        session::{BufferLexisSession, Cursor},
49        ByteIndex,
50        Chunk,
51        Length,
52        LineIndex,
53        Site,
54        SourceCode,
55        ToSpan,
56        Token,
57        TokenCount,
58        CHUNK_SIZE,
59    },
60    report::{ld_assert, ld_unreachable},
61};
62
/// A growable buffer of tokens.
///
/// This object provides canonical implementation of the [SourceCode] trait,
/// specifically optimized for one-time scanning of source code text.
///
/// TokenBuffer parses the lexical structure of the source code and allows
/// appending text to the buffer, but it does not support incremental reparsing
/// when arbitrary ranges of the source code are edited.
///
/// If a compilation unit with incremental reparsing capabilities is required
/// but a syntax tree parser is not, consider using the mutable
/// [Document](crate::units::Document) or [MutableUnit](crate::units::MutableUnit)
/// with the node type set to `VoidSyntax<T>` (see
/// [VoidSyntax](crate::syntax::VoidSyntax)).
///
/// If a stateful buffer with random-access capabilities is not needed, consider
/// using one of the [stateless lexical scanners](crate::lexis::Scannable)
/// instead.
pub struct TokenBuffer<T: Token> {
    /// An identifier of this compilation unit (see [Id]).
    id: Id,
    /// The scanned tokens, one per chunk, in source order.
    pub(crate) tokens: Vec<T>,
    /// The start site (character index) of each chunk; parallel to `tokens`.
    pub(super) sites: Vec<Site>,
    /// The length in characters of each chunk; parallel to `tokens`.
    pub(crate) spans: Vec<Length>,
    /// The byte offset into `text` where each chunk's substring begins;
    /// parallel to `tokens`.
    pub(crate) indices: Vec<ByteIndex>,
    /// The full source code text of the buffer.
    pub(crate) text: String,
    /// A line index over `text`; also tracks the total code length
    /// in characters.
    pub(crate) lines: LineIndex,
}
90
91impl<T: Token> Debug for TokenBuffer<T> {
92    #[inline]
93    fn fmt(&self, formatter: &mut Formatter) -> std::fmt::Result {
94        formatter
95            .debug_struct("TokenBuffer")
96            .field("id", &self.id)
97            .field("length", &self.length())
98            .finish_non_exhaustive()
99    }
100}
101
102impl<T: Token> Display for TokenBuffer<T> {
103    #[inline(always)]
104    fn fmt(&self, formatter: &mut Formatter) -> std::fmt::Result {
105        formatter
106            .snippet(self)
107            .set_caption(format!("TokenBuffer({})", self.id))
108            .finish()
109    }
110}
111
112impl<T: Token, S: AsRef<str>> From<S> for TokenBuffer<T> {
113    #[inline(always)]
114    fn from(string: S) -> Self {
115        let string = string.as_ref();
116
117        let token_capacity = (string.len() / CHUNK_SIZE + 1).next_power_of_two();
118
119        let mut buffer = TokenBuffer::with_capacity(token_capacity, string.len());
120
121        buffer.append(string);
122
123        buffer
124    }
125}
126
impl<T: Token> Drop for TokenBuffer<T> {
    fn drop(&mut self) {
        // Releases the name registered for this buffer's id
        // (see `Id::clear_name`), so the id no longer resolves to it.
        self.id.clear_name();
    }
}
132
impl<T: Token> Identifiable for TokenBuffer<T> {
    // Returns the identifier assigned to this buffer at construction time.
    #[inline(always)]
    fn id(&self) -> Id {
        self.id
    }
}
139
impl<T: Token> SourceCode for TokenBuffer<T> {
    type Token = T;

    type Cursor<'code> = TokenBufferCursor<'code, Self::Token>;

    type CharIterator<'code> = Take<Chars<'code>>;

    // Yields the characters of the `span` of the source code text.
    //
    // Panics if `span` is not valid for this buffer.
    fn chars(&self, span: impl ToSpan) -> Self::CharIterator<'_> {
        let span = match span.to_site_span(self) {
            None => panic!("Specified span is invalid."),
            Some(span) => span,
        };

        let span_length = span.end - span.start;

        // An empty span yields an empty iterator without touching the text.
        if span_length == 0 {
            return "".chars().take(0);
        };

        let rest = match self.search(span.start) {
            // `span.start` is exactly at a token boundary: `byte_index` is
            // the byte offset of that token's text.
            Ok(byte_index) => {
                ld_assert!(byte_index < self.text.len(), "Byte index out of bounds.");

                unsafe { self.text.get_unchecked(byte_index..) }.chars()
            }

            // `span.start` is inside a token: `byte_index` addresses the
            // token's start, and `remaining` is the number of characters
            // between the token start and `span.start` to be skipped.
            Err((byte_index, mut remaining)) => {
                ld_assert!(byte_index < self.text.len(), "Byte index out of bounds.");

                let mut rest = unsafe { self.text.get_unchecked(byte_index..) }.chars();

                while remaining > 0 {
                    remaining -= 1;
                    let _ = rest.next();
                }

                rest
            }
        };

        rest.take(span_length)
    }

    // Returns the substring of the source code text covered by `span`.
    //
    // The returned Cow is always a borrowed slice of the buffer's text.
    //
    // Panics if `span` is not valid for this buffer.
    fn substring(&self, span: impl ToSpan) -> Cow<str> {
        let span = match span.to_site_span(self) {
            None => panic!("Specified span is invalid."),
            Some(span) => span,
        };

        let span_length = span.end - span.start;

        if span_length == 0 {
            return Cow::from("");
        };

        // Resolves the byte offset of `span.start` within `self.text`.
        let start = match self.search(span.start) {
            Ok(byte_index) => byte_index,

            // The site falls inside a token: walk `remaining` characters
            // forward from the token's start to find the exact byte offset
            // of the site's character.
            Err((byte_index, remaining)) => {
                ld_assert!(byte_index < self.text.len(), "Byte index out of bounds.");

                let rest = unsafe { self.text.get_unchecked(byte_index..) }.char_indices();

                let Some((offset, _)) = rest.take(remaining + 1).last() else {
                    unsafe { ld_unreachable!("Empty tail.") };
                };

                byte_index + offset
            }
        };

        // Resolves the byte offset of `span.end` the same way. When the end
        // site is at or past the end of the code, `search` yields
        // `Ok(self.text.len())`.
        let end = match self.search(span.end) {
            Ok(byte_index) => byte_index,

            Err((byte_index, remaining)) => {
                ld_assert!(byte_index < self.text.len(), "Byte index out of bounds.");

                let rest = unsafe { self.text.get_unchecked(byte_index..) }.char_indices();

                let Some((offset, _)) = rest.take(remaining + 1).last() else {
                    unsafe { ld_unreachable!("Empty tail.") };
                };

                byte_index + offset
            }
        };

        ld_assert!(start <= end, "Invalid byte bounds.");
        ld_assert!(end <= self.text.len(), "Invalid byte bounds.");

        // Safety: `start..end` lies within the text and is char-aligned
        //         by construction above.
        unsafe { Cow::from(self.text.get_unchecked(start..end)) }
    }

    // Entries with a non-zero version never address TokenBuffer chunks;
    // otherwise the entry's index addresses a chunk directly.
    #[inline(always)]
    fn has_chunk(&self, entry: &Entry) -> bool {
        if entry.version > 0 {
            return false;
        }

        entry.index < self.tokens()
    }

    #[inline(always)]
    fn get_token(&self, entry: &Entry) -> Option<Self::Token> {
        if entry.version > 0 {
            return None;
        }

        self.tokens.get(entry.index).copied()
    }

    #[inline(always)]
    fn get_site(&self, entry: &Entry) -> Option<Site> {
        if entry.version > 0 {
            return None;
        }

        self.sites.get(entry.index).copied()
    }

    // Returns the chunk's substring: from its byte index up to the next
    // chunk's byte index, or to the end of the text for the last chunk.
    #[inline(always)]
    fn get_string(&self, entry: &Entry) -> Option<&str> {
        if entry.version > 0 {
            return None;
        }

        let start = *self.indices.get(entry.index)?;

        let end = self
            .indices
            .get(entry.index + 1)
            .copied()
            .unwrap_or(self.text.len());

        Some(unsafe { self.text.get_unchecked(start..end) })
    }

    #[inline(always)]
    fn get_length(&self, entry: &Entry) -> Option<Length> {
        if entry.version > 0 {
            return None;
        }

        self.spans.get(entry.index).copied()
    }

    // Creates a cursor over the chunks touching `span`.
    //
    // Panics if `span` is not valid for this buffer.
    #[inline(always)]
    fn cursor(&self, span: impl ToSpan) -> TokenBufferCursor<'_, Self::Token> {
        let span = match span.to_site_span(self) {
            None => panic!("Specified span is invalid."),
            Some(span) => span,
        };

        Self::Cursor::new(self, span)
    }

    // The total length of the source code in characters, maintained by
    // the line index.
    #[inline(always)]
    fn length(&self) -> Length {
        self.lines.code_length()
    }

    #[inline(always)]
    fn tokens(&self) -> TokenCount {
        self.tokens.len()
    }

    #[inline(always)]
    fn lines(&self) -> &LineIndex {
        &self.lines
    }
}
311
impl<T: Token> Default for TokenBuffer<T> {
    // Equivalent to [TokenBuffer::new]: an empty buffer with no
    // preallocated capacity.
    #[inline]
    fn default() -> Self {
        Self::new()
    }
}
318
319impl<'buffer, T: Token> IntoIterator for &'buffer TokenBuffer<T> {
320    type Item = <Self::IntoIter as Iterator>::Item;
321    type IntoIter = TokenBufferIter<'buffer, T>;
322
323    #[inline(always)]
324    fn into_iter(self) -> Self::IntoIter {
325        Self::IntoIter {
326            site: 0,
327            tokens: self.tokens.iter(),
328            spans: self.spans.iter(),
329            indices: self.indices.iter().peekable(),
330            text: self.text.as_str(),
331        }
332    }
333}
334
impl<T: Token> TokenBuffer<T> {
    /// Creates a TokenBuffer by scanning the specified source code `text`.
    #[inline(always)]
    pub fn parse(text: impl AsRef<str>) -> Self {
        Self::from(text)
    }

    /// Creates an empty TokenBuffer.
    ///
    /// You can append and scan the source code text later using
    /// the [append](Self::append) function.
    #[inline(always)]
    pub fn new() -> Self {
        Self {
            id: Id::new(),
            tokens: Vec::new(),
            sites: Vec::new(),
            spans: Vec::new(),
            indices: Vec::new(),
            text: String::new(),
            lines: LineIndex::new(),
        }
    }

    /// Creates an empty TokenBuffer but preallocates memory to store
    /// at least `tokens` number of the source code tokens, and the `text`
    /// number of bytes to store the source code text.
    #[inline(always)]
    pub fn with_capacity(tokens: TokenCount, text: usize) -> Self {
        Self {
            id: Id::new(),
            tokens: Vec::with_capacity(tokens),
            sites: Vec::with_capacity(tokens),
            spans: Vec::with_capacity(tokens),
            indices: Vec::with_capacity(tokens),
            text: String::with_capacity(text),
            lines: LineIndex::with_capacity(text),
        }
    }

    /// Appends the `text` to the end of the buffer by rescanning the tail of
    /// the former buffer (if non-empty) and the appendable text.
    pub fn append(&mut self, text: impl AsRef<str>) {
        let text = text.as_ref();

        if text.is_empty() {
            return;
        }

        // By default, rescanning starts at the former end of the buffer.
        let mut byte = self.text.len();
        let mut site = self.length();

        // Rescanning must begin at least `T::LOOKBACK` characters before
        // the former end: pop trailing chunks until their accumulated spans
        // cover the lookback distance, rewinding `byte`/`site` to the last
        // popped chunk's start position.
        let mut lookback = 0;

        while lookback < T::LOOKBACK {
            let Some(span) = self.spans.pop() else {
                break;
            };

            lookback += span;

            if self.tokens.pop().is_none() {
                // Safety: Underlying TokenBuffer collections represent
                //         a sequence of Chunks.
                unsafe { ld_unreachable!("TokenBuffer inconsistency.") };
            }

            byte = match self.indices.pop() {
                Some(index) => index,
                // Safety: Underlying TokenBuffer collections represent
                //         a sequence of Chunks.
                None => unsafe { ld_unreachable!("TokenBuffer inconsistency.") },
            };

            site = match self.sites.pop() {
                Some(site) => site,
                // Safety: Underlying TokenBuffer collections represent
                //         a sequence of Chunks.
                None => unsafe { ld_unreachable!("TokenBuffer inconsistency.") },
            }
        }

        self.text.push_str(text);
        self.lines.append(text);

        // Rescans the tail of the (extended) text from the rewound position,
        // pushing the recognized chunks back into this buffer.
        BufferLexisSession::run(self, byte, site);
    }

    /// Reserves capacity to store at least `tokens` number of the source code
    /// tokens, and the `text` number of bytes to store the source code text.
    pub fn reserve(&mut self, tokens: TokenCount, text: usize) {
        self.tokens.reserve(tokens);
        self.sites.reserve(tokens);
        self.spans.reserve(tokens);
        self.indices.reserve(tokens);
        self.text.reserve(text);
        self.lines.reserve(text);
    }

    /// Shrinks the allocation capacity of the TokenBuffer as much as possible.
    pub fn shrink_to_fit(&mut self) {
        self.tokens.shrink_to_fit();
        self.sites.shrink_to_fit();
        self.spans.shrink_to_fit();
        self.indices.shrink_to_fit();
        self.text.shrink_to_fit();
        self.lines.shrink_to_fit();
    }

    /// Clears the TokenBuffer while preserving allocated memory.
    pub fn clear(&mut self) {
        self.tokens.clear();
        self.sites.clear();
        self.spans.clear();
        self.indices.clear();
        self.text.clear();
        self.lines.clear();
    }

    // Assigns a fresh identifier to this buffer.
    #[inline(always)]
    pub(crate) fn reset_id(&mut self) {
        self.id = Id::new();
    }

    // Rebuilds the line index from scratch over the current text.
    #[inline(always)]
    pub(crate) fn update_line_index(&mut self) {
        self.lines.clear();
        self.lines.append(self.text.as_str());
    }

    // Appends one scanned chunk spanning `from..to`; used by the buffer's
    // lexis session during scanning.
    #[inline]
    pub(super) fn push(&mut self, token: T, from: &Cursor<Site>, to: &Cursor<Site>) {
        let span = to.site - from.site;

        ld_assert!(span > 0, "Empty span.");

        let _ = self.tokens.push(token);
        let _ = self.sites.push(from.site);
        let _ = self.spans.push(span);
        let _ = self.indices.push(from.byte);
    }

    // Finds the byte position of the character at `site` in `self.text`.
    //
    // Returns `Ok(byte_index)` when `site` is exactly at a chunk start, or
    // `Ok(self.text.len())` when `site` is at or past the end of the code.
    //
    // Returns `Err((byte_index, distance))` when `site` falls inside a
    // chunk: `byte_index` addresses the chunk's start, and `distance` is
    // the number of characters between the chunk's start and `site`.
    #[inline]
    fn search(&self, site: Site) -> Result<ByteIndex, (ByteIndex, Length)> {
        if site >= self.length() {
            return Ok(self.text.len());
        }

        match self.sites.binary_search(&site) {
            Ok(index) => {
                ld_assert!(index < self.indices.len(), "Index out of bounds.");

                let byte_index = unsafe { self.indices.get_unchecked(index) };

                Ok(*byte_index)
            }

            // `site` is between two chunk starts: step back to the nearest
            // preceding chunk and report the remaining character distance.
            Err(mut index) => {
                ld_assert!(index > 0, "Index out of bounds.");

                index -= 1;

                ld_assert!(index < self.indices.len(), "Index out of bounds.");
                ld_assert!(index < self.sites.len(), "Index out of bounds.");

                let byte_index = unsafe { self.indices.get_unchecked(index) };
                let nearest_site = unsafe { self.sites.get_unchecked(index) };

                Err((*byte_index, site - *nearest_site))
            }
        }
    }
}
508
/// An iterator over the [chunks](Chunk) of a [TokenBuffer], in source order.
///
/// Created by the [IntoIterator] implementation for `&TokenBuffer`.
pub struct TokenBufferIter<'buffer, T: Token> {
    // The start site of the next chunk to be yielded.
    site: Site,
    // Remaining tokens; parallel to `spans` and `indices`.
    tokens: Iter<'buffer, T>,
    // Remaining chunk lengths (in characters).
    spans: Iter<'buffer, Length>,
    // Remaining chunk byte offsets; peeked to find each chunk's end.
    indices: Peekable<Iter<'buffer, ByteIndex>>,
    // The buffer's full source text.
    text: &'buffer str,
}
516
517impl<'buffer, T: Token> Iterator for TokenBufferIter<'buffer, T> {
518    type Item = Chunk<'buffer, T>;
519
520    #[inline(always)]
521    fn next(&mut self) -> Option<Self::Item> {
522        let token = *self.tokens.next()?;
523        let site = self.site;
524        let length = *self.spans.next()?;
525        let start = *self.indices.next()?;
526
527        let string = match self.indices.peek() {
528            // Safety: TokenBuffer::indices are well-formed.
529            None => unsafe { self.text.get_unchecked(start..) },
530            // Safety: TokenBuffer::indices are well-formed.
531            Some(end) => unsafe { self.text.get_unchecked(start..**end) },
532        };
533
534        self.site += length;
535
536        Some(Self::Item {
537            token,
538            site,
539            length,
540            string,
541        })
542    }
543}
544
// Sound: `next` short-circuits through `?` on the underlying `slice::Iter`s,
// which are themselves fused, so after the first None the iterator keeps
// returning None.
impl<'buffer, T: Token> FusedIterator for TokenBufferIter<'buffer, T> {}