lady_deirdre/lexis/code.rs
1////////////////////////////////////////////////////////////////////////////////
2// This file is part of "Lady Deirdre", a compiler front-end foundation //
3// technology. //
4// //
5// This work is proprietary software with source-available code. //
6// //
7// To copy, use, distribute, or contribute to this work, you must agree to //
8// the terms of the General License Agreement: //
9// //
10// https://github.com/Eliah-Lakhin/lady-deirdre/blob/master/EULA.md //
11// //
12// The agreement grants a Basic Commercial License, allowing you to use //
13// this work in non-commercial and limited commercial products with a total //
14// gross revenue cap. To remove this commercial limit for one of your //
15// products, you must acquire a Full Commercial License. //
16// //
17// If you contribute to the source code, documentation, or related materials, //
18// you must grant me an exclusive license to these contributions. //
19// Contributions are governed by the "Contributions" section of the General //
20// License Agreement. //
21// //
22// Copying the work in parts is strictly forbidden, except as permitted //
23// under the General License Agreement. //
24// //
25// If you do not or cannot agree to the terms of this Agreement, //
26// do not use this work. //
27// //
28// This work is provided "as is", without any warranties, express or implied, //
29// except where such disclaimers are legally invalid. //
30// //
31// Copyright (c) 2024 Ilya Lakhin (Илья Александрович Лахин). //
32// All rights reserved. //
33////////////////////////////////////////////////////////////////////////////////
34
35use std::{borrow::Cow, iter::FusedIterator, marker::PhantomData};
36
37use crate::{
38 arena::{Entry, Identifiable},
39 lexis::{Chunk, Length, LineIndex, Site, SiteRef, ToSpan, Token, TokenCount, TokenCursor},
40};
41
42/// An object that provides access to the source code text and the lexical
43/// structure of a compilation unit.
44///
45/// The lexical structure is a sequence of [tokens](crate::lexis::Token) that
46/// fully covers the source code text exhaustively and without overlaps.
47///
48/// This trait provides a low-level interface to access tokens and their
49/// metadata by the [versioned index](Entry) from the compilation unit.
50///
51/// A higher-level referential object, [TokenRef](crate::lexis::TokenRef),
52/// offers a more convenient interface to access these objects from
53/// the SourceCode.
54///
55/// Additionally, the SourceCode interface provides functions to access
56/// substrings of the source code text, to iterate through individual characters
57/// in range, and to iterate through the tokens and their metadata in range.
58pub trait SourceCode: Identifiable {
59 /// Specifies the type of the source code token and the lexical scanner
60 /// of a programming language through the [Token::scan] function.
61 type Token: Token;
62
63 /// Specifies the type of the [token cursor](TokenCursor) that iterates
64 /// through the streams of tokens of this source code type.
65 type Cursor<'code>: TokenCursor<'code, Token = Self::Token>
66 where
67 Self: 'code;
68
69 /// Specifies the type of the iterator that iterates through the unicode
70 /// characters of the source code text substrings.
71 type CharIterator<'code>: Iterator<Item = char> + FusedIterator + 'code
72 where
73 Self: 'code;
74
75 /// Returns an iterator of the source code tokens [metadata](Chunk)
76 /// in the specified `span`.
77 ///
78 /// The iterator will yield all token chunks if the `span` intersects with
79 /// the token spans, including the span bounds.
80 ///
81 /// For example, in the text `FooBarBaz` with three tokens, the span `3..6`
82 /// will iterate through all three tokens because this span covers
83 /// the token `Bar` and intersects with the tokens `Foo` and `Baz`
84 /// by their bounds. The span `3..3` would iterate through the `Foo` and
85 /// `Bar` tokens because this span intersects with these two token bounds.
86 /// The span `4..5` would yield the `Bar` token only because this span
87 /// intersects with the token's span.
88 ///
89 /// **Panic**
90 ///
91 /// This function may panic if the specified `span` is not
92 /// [valid](ToSpan::is_valid_span) for this source code.
93 #[inline(always)]
94 fn chunks(&self, span: impl ToSpan) -> ChunkIter<'_, Self::Cursor<'_>>
95 where
96 Self: Sized,
97 {
98 let span = match span.to_site_span(self) {
99 None => panic!("Specified span is invalid."),
100 Some(span) => span,
101 };
102
103 let cursor = self.cursor(span.clone());
104
105 ChunkIter {
106 cursor,
107 _code_lifetime: PhantomData::default(),
108 }
109 }
110
111 /// Returns an iterator that iterates over the [Unicode chars](char) of the
112 /// source code text substring in the specified `span`.
113 ///
114 /// For example, in the text `FooBarBaz` with three tokens, the span `2..6`
115 /// will iterate the characters of the `oBar` substring.
116 ///
117 /// **Panic**
118 ///
119 /// This function may panic if the specified `span` is not
120 /// [valid](ToSpan::is_valid_span) for this source code.
121 fn chars(&self, span: impl ToSpan) -> Self::CharIterator<'_>;
122
123 /// Returns a borrowed or an owned substring of the source code text in
124 /// the specified `span`.
125 ///
126 /// For example, in the text `FooBarBaz` with three tokens, the span `2..6`
127 /// will yield an `oBar` substring.
128 ///
129 /// The decision about the returning string ownership is implementation
130 /// dependent.
131 ///
132 /// **Panic**
133 ///
134 /// This function may panic if the specified `span` is not
135 /// [valid](ToSpan::is_valid_span) for this source code.
136 fn substring(&self, span: impl ToSpan) -> Cow<str>
137 where
138 Self: Sized,
139 {
140 let span = match span.to_site_span(self) {
141 None => panic!("Specified span is invalid."),
142 Some(span) => span,
143 };
144
145 let mut cursor = self.cursor(span.clone());
146
147 if cursor.site(0) == Some(span.start) && cursor.site(1) == Some(span.end) {
148 if let Some(string) = cursor.string(0) {
149 return Cow::Borrowed(string);
150 }
151 }
152
153 Cow::from(self.chars(span).collect::<String>())
154 }
155
156 /// Checks if the token referred to by the versioned index exists in this
157 /// source code.
158 fn has_chunk(&self, entry: &Entry) -> bool;
159
160 /// Returns a copy of the token referred to by the versioned index.
161 ///
162 /// If the index parameter `entry` is not valid, returns None.
163 fn get_token(&self, entry: &Entry) -> Option<Self::Token>;
164
165 /// Returns a start site of the token referred to by the versioned index.
166 ///
167 /// If the index parameter `entry` is not valid, returns None.
168 fn get_site(&self, entry: &Entry) -> Option<Site>;
169
170 /// Returns a reference to the source code text substring covered by
171 /// the token referred to by the versioned index.
172 ///
173 /// If the index parameter `entry` is not valid, returns None.
174 fn get_string(&self, entry: &Entry) -> Option<&str>;
175
176 /// Returns the [length](Length) of the source code text substring covered
177 /// by the token referred to by the versioned index.
178 ///
179 /// If the index parameter `entry` is not valid, returns None.
180 fn get_length(&self, entry: &Entry) -> Option<Length>;
181
182 /// Returns a cursor of the source code tokens stream in
183 /// the specified `span`.
184 ///
185 /// The token stream includes a source code token if the `span` intersects
186 /// with the span of this token, including the span bounds.
187 ///
188 /// The returning object does not implement the [Iterator] interface
189 /// but provides the capabilities for manual control over the iteration
190 /// process, including the capabilities to look ahead of the token stream.
191 ///
192 /// If you need just a normal Iterator over the token chunks,
193 /// use the [chunks](Self::chunks) function instead.
194 ///
195 /// **Panic**
196 ///
197 /// This function may panic if the specified `span` is not
198 /// [valid](ToSpan::is_valid_span) for this source code.
199 fn cursor(&self, span: impl ToSpan) -> Self::Cursor<'_>;
200
201 /// Returns a [SiteRef] that points to the end of this source code.
202 #[inline(always)]
203 fn end_site_ref(&self) -> SiteRef {
204 SiteRef::end_of(self.id())
205 }
206
207 /// Returns the total number of Unicode characters in this source code text.
208 fn length(&self) -> Length;
209
210 /// Returns the total number of tokens in this source code.
211 fn tokens(&self) -> TokenCount;
212
213 /// Provides access to the [line index](LineIndex) of this source code.
214 ///
215 /// From this object, you can convert char sites back and forth to their
216 /// line indices, and to reveal the total number of lines in this source
217 /// code.
218 fn lines(&self) -> &LineIndex;
219
220 /// Returns true if the source code text is empty.
221 #[inline(always)]
222 fn is_empty(&self) -> bool {
223 self.length() == 0
224 }
225}
226
227/// An iterator over the [SourceCode] tokens and their metadata.
228///
229/// This object is created by the [SourceCode::chunks] function.
230#[repr(transparent)]
231pub struct ChunkIter<'code, C: TokenCursor<'code>> {
232 cursor: C,
233 _code_lifetime: PhantomData<&'code ()>,
234}
235
236impl<'code, C: TokenCursor<'code>> Iterator for ChunkIter<'code, C> {
237 type Item = Chunk<'code, <C as TokenCursor<'code>>::Token>;
238
239 #[inline]
240 fn next(&mut self) -> Option<Self::Item> {
241 let token = self.cursor.token(0);
242 let site = self.cursor.site(0)?;
243 let length = self.cursor.length(0)?;
244 let string = self.cursor.string(0)?;
245
246 if !self.cursor.advance() {
247 return None;
248 }
249
250 Some(Self::Item {
251 token,
252 site,
253 length,
254 string,
255 })
256 }
257}
258
259impl<'code, C: TokenCursor<'code>> FusedIterator for ChunkIter<'code, C> {}