// css_lexer/lib.rs
1#![deny(warnings)]
2//! An implementation of the [CSS Syntax Level 3 tokenization algorithm][1]. It is intended as a low-level building
//! block for building parsers for CSS or CSS-alike languages (for example SASS).
4//!
5//! This crate provides the [Lexer] struct, which borrows `&str` and can incrementally produce [Tokens][Token]. The
6//! encoding of the `&str` is assumed to be utf-8.
7//!
8//! The [Lexer] _may_ be configured with additional [Features][Feature] to allow for lexing tokens in ways which diverge
9//! from the CSS specification (such as tokenizing comments using `//`). With no additional features this lexer is fully
10//! spec compliant.
11//!
12//! [Tokens][Token] are _untyped_ (there are no super-classes like `Ident`); but they have a [Kind] which can be used to
13//! determine their type. Tokens do not store the underlying character data, nor do they store their offsets. They just
14//! provide "facts" about the underlying data. In order to re-build a string, each [Token] will need to be wrapped in a
//! [Cursor] and consult the original `&str` to get the character data. This design allows Tokens to live on the stack,
16//! avoiding heap allocation as they are always `size_of` `8`. Likewise [Cursors][Cursor] are always a `size_of` `12`.
17//!
18//! # Limitations
19//!
20//! The [Lexer] has limitations around document sizes and token sizes, in order to keep [Token], [SourceOffset] and
21//! [Cursor] small. It's very unlikely the average document will run into these limitations, but they're listed here
22//! for completeness:
23//!
24//! - Documents are limited to ~4gb in size. [SourceOffset] is a [u32] so cannot represent larger offsets. Attempting to
//! lex larger documents is considered [undefined behaviour][2].
26//!
//! - [Tokens][Token] are limited to ~4gb in length. A [Token's][Token] length is a [u32] so cannot represent larger
//! lengths. If the lexer encounters a token with larger length this is considered [undefined behaviour][2].
28//! If the lexer encounters a token with larger length this is considered [undefined behaviour][2].
29//!
30//! - Number [Tokens][Token] are limited to 16,777,216 characters in length. For example encountering a number with
31//! 17MM `0`s is considered [undefined behaviour][2]. This is not the same as the number value, which is an [f32].
32//! (Please note that the CSS spec dictates numbers are f32, CSS does not have larger numbers).
33//!
34//! - Dimension [Tokens][Token] are limited to 4,096 numeric characters in length and 4,096 ident characters in length.
35//! For example encountering a dimension with 4,097 `0`s is considered [undefined behaviour][2].
36//!
37//! # General usage
38//!
39//! A parser can be implemented on top of the [Lexer] by instantiating a [Lexer] with [Lexer::new()] or
40//! [Lexer::new_with_features()] if you wish to opt-into non-spec-compliant features. The [Lexer] needs to be given a
41//! `&str` which it will reference to produce Tokens.
42//!
43//! Repeatedly calling [Lexer::advance()] will move the Lexer's internal position one [Token] forward, and return the
44//! newly lexed [Token], once the end of `&str` is reached [Lexer::advance()] will repeatedly return [Token::EOF].
45//!
46//! # Example
47//!
48//! ```
49//! use css_lexer::*;
50//! let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
51//! assert_eq!(lexer.offset(), 0);
52//! {
53//! let token = lexer.advance();
54//! assert_eq!(token, Kind::Ident);
55//! let cursor = token.with_cursor(SourceOffset(0));
56//! assert_eq!(cursor.str_slice(lexer.source()), "width");
57//! }
58//! {
59//! let token = lexer.advance();
60//! assert_eq!(token, Kind::Colon);
61//! assert_eq!(token, ':');
62//! }
63//! {
64//! let token = lexer.advance();
65//! assert_eq!(token, Kind::Whitespace);
66//! }
67//! {
68//! let token = lexer.advance();
69//! assert_eq!(token, Kind::Dimension);
70//! }
71//! ```
72//!
73//! [1]: https://drafts.csswg.org/css-syntax/#tokenization
74//! [2]: https://en.wikipedia.org/wiki/Undefined_behavior
75
76mod associated_whitespace_rules;
77mod atom_set;
78mod comment_style;
79mod constants;
80mod cow;
81mod cursor;
82mod empty_atom_set;
83mod feature;
84mod kind;
85mod kindset;
86mod pairwise;
87mod private;
88mod quote_style;
89mod source_cursor;
90mod source_offset;
91mod span;
92mod syntax;
93mod token;
94mod whitespace_style;
95
/// A convenience alias for the most common use case - a [Lexer] borrowing a `&str` for the lifetime `'a`.
pub type BasicLexer<'a> = Lexer<'a>;
98
99pub use associated_whitespace_rules::AssociatedWhitespaceRules;
100pub use atom_set::{AtomSet, DynAtomSet};
101pub use comment_style::CommentStyle;
102pub use cow::CowStr;
103pub use cursor::Cursor;
104pub use empty_atom_set::EmptyAtomSet;
105pub use feature::Feature;
106pub use kind::Kind;
107pub use kindset::KindSet;
108pub use pairwise::PairWise;
109pub use quote_style::QuoteStyle;
110pub use source_cursor::SourceCursor;
111pub use source_offset::SourceOffset;
112pub use span::{Span, ToSpan};
113pub use token::Token;
114pub use whitespace_style::Whitespace;
115
/// The [Lexer] struct - the core of the library - borrows `&str` and can incrementally produce [Tokens][Token].
///
/// The encoding of the `&str` is assumed to be utf-8. Other sources should be re-encoded into utf-8 prior to ingesting
/// into the [Lexer].
///
/// The [Lexer] _may_ be configured with additional [Features][Feature] to allow for lexing tokens in ways which diverge
/// from the CSS specification (such as tokenizing comments using `//`). With no additional features this lexer is fully
/// spec compliant.
///
/// [Tokens][Token] are _untyped_ (there are no super-classes like `Ident`); but they have a [Kind] which can be used to
/// determine their type. Tokens do not store the underlying character data, nor do they store their offsets. They just
/// provide "facts" about the underlying data. In order to re-build a string, each [Token] will need to be wrapped in a
/// [Cursor] and consult the original `&str` to get the character data. This design allows Tokens to live on the stack,
/// avoiding heap allocation as they are always `size_of` `8`. Likewise [Cursors][Cursor] are always a `size_of` `12`.
///
/// # Limitations
///
/// The [Lexer] has limitations around document sizes and token sizes, in order to keep [Token], [SourceOffset] and
/// [Cursor] small.
///
/// - Documents are limited to ~4gb in size. [SourceOffset] is a [u32] so cannot represent larger offsets. Attempting to
/// lex larger documents is considered [undefined behaviour][2].
///
/// - [Tokens][Token] are limited to ~4gb in length. A [Token's][Token] length is a [u32] so cannot represent larger
/// lengths. If the lexer encounters a token with larger length this is considered [undefined behaviour][2].
///
/// - Number [Tokens][Token] are limited to 16,777,216 characters in length. For example encountering a number with
/// 17MM `0`s is considered [undefined behaviour][2]. This is not the same as the number value, which is an [f32].
/// (Please note that the CSS spec dictates numbers are f32, CSS does not have larger numbers).
///
/// - Dimension [Tokens][Token] are limited to 4,096 numeric characters in length and 4,096 ident characters in length.
/// For example encountering a dimension with 4,097 `0`s is considered [undefined behaviour][2].
///
/// # General usage
///
/// A parser can be implemented on top of the [Lexer] by instantiating a [Lexer] with [Lexer::new()] or
/// [Lexer::new_with_features()] if you wish to opt-into non-spec-compliant features. The [Lexer] needs to be given a
/// `&str` which it will reference to produce Tokens.
///
/// Repeatedly calling [Lexer::advance()] will move the Lexer's internal position one [Token] forward, and return the
/// newly lexed [Token], once the end of `&str` is reached [Lexer::advance()] will repeatedly return [Token::EOF].
///
/// # Example
///
/// ```
/// use css_lexer::*;
/// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
/// assert_eq!(lexer.offset(), 0);
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Ident);
///     let cursor = token.with_cursor(SourceOffset(0));
///     assert_eq!(cursor.str_slice(lexer.source()), "width");
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Colon);
///     assert_eq!(token, ':');
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Whitespace);
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Dimension);
/// }
/// ```
///
/// [1]: https://drafts.csswg.org/css-syntax/#tokenization
/// [2]: https://en.wikipedia.org/wiki/Undefined_behavior
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
	// The full utf-8 source text being tokenized; tokens are slices of this via [Cursor].
	source: &'a str,
	// Byte offset of the current position, i.e. the end of the most recently lexed token.
	offset: SourceOffset,
	// The most recently lexed token ([Token::default()] before the first [Lexer::advance()]).
	token: Token,
	// Opt-in [Feature] flags supplied at construction that relax spec compliance.
	features: Feature,
	// The static atom set supplied at construction (see [DynAtomSet]).
	atoms: &'static dyn DynAtomSet,
}
195
196impl<'a> Lexer<'a> {
197 #[inline]
198 pub fn new(atoms: &'static dyn DynAtomSet, source: &'a str) -> Self {
199 Self { source, offset: SourceOffset::default(), token: Token::default(), features: Feature::default(), atoms }
200 }
201
202 #[inline]
203 pub fn new_with_features(atoms: &'static dyn DynAtomSet, source: &'a str, features: Feature) -> Self {
204 Self { source, features, offset: SourceOffset::default(), token: Token::default(), atoms }
205 }
206
207 #[inline(always)]
208 pub fn source(&self) -> &'a str {
209 self.source
210 }
211
212 /// Is the lexer at the last token
213 pub fn at_end(&self) -> bool {
214 self.offset.0 as usize == self.source.len()
215 }
216
217 /// Current position in file
218 #[inline(always)]
219 pub const fn offset(&self) -> SourceOffset {
220 self.offset
221 }
222
223 #[inline(always)]
224 pub fn checkpoint(&self) -> Cursor {
225 Cursor::new(self.offset(), self.token)
226 }
227
228 /// Rewinds the lexer back to the given checkpoint
229 pub fn rewind(&mut self, cursor: Cursor) {
230 debug_assert!(cursor.offset() <= self.offset());
231 self.offset = cursor.offset();
232 self.token = cursor.token();
233 }
234
235 /// Advances the lexer to the end of the given token
236 pub fn hop(&mut self, cursor: Cursor) {
237 debug_assert!(cursor.offset().0 as usize >= (self.offset.0 + self.token.len()) as usize);
238 self.offset = cursor.offset();
239 self.token = cursor.token();
240 }
241
242 /// Moves the lexer one token forward, returning that token
243 pub fn advance(&mut self) -> Token {
244 self.token = self.read_next_token(self.offset.0);
245 self.offset.0 += self.token.len();
246 self.token
247 }
248}
249
250impl<'a> Iterator for Lexer<'a> {
251 type Item = Cursor;
252
253 #[inline]
254 fn next(&mut self) -> Option<Self::Item> {
255 if self.offset.0 as usize >= self.source.len() {
256 return None;
257 }
258 let offset = self.offset;
259 let token = self.advance();
260 if token.kind() == Kind::Eof { None } else { Some(token.with_cursor(offset)) }
261 }
262}
263
// Guards against accidental growth of the Lexer struct's memory footprint.
#[test]
fn size_test() {
	let size = ::std::mem::size_of::<Lexer>();
	assert_eq!(size, 48);
}
268
#[cfg(test)]
mod iterator_tests {
	use super::*;

	#[test]
	fn test_lexer_iterator_basic() {
		// "foo bar" lexes as: ident, whitespace, ident.
		let tokens: Vec<_> = Lexer::new(&EmptyAtomSet::ATOMS, "foo bar").collect();
		assert_eq!(tokens.len(), 3);
		assert_eq!(tokens[0], Kind::Ident);
		assert_eq!(tokens[1], Kind::Whitespace);
		assert_eq!(tokens[2], Kind::Ident);
	}

	#[test]
	fn test_lexer_iterator_empty() {
		// An empty source yields no cursors at all.
		let tokens: Vec<_> = Lexer::new(&EmptyAtomSet::ATOMS, "").collect();
		assert!(tokens.is_empty());
	}

	#[test]
	fn test_lexer_iterator_equivalence() {
		// The Iterator impl must match a manual advance() loop token-for-token.
		let source = "width: 1px";

		let iterated: Vec<_> = Lexer::new(&EmptyAtomSet::ATOMS, source).collect();

		let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, source);
		let mut advanced = Vec::new();
		while !lexer.at_end() {
			let start = lexer.offset();
			let token = lexer.advance();
			if token.kind() != Kind::Eof {
				advanced.push(token.with_cursor(start));
			}
		}

		assert_eq!(iterated.len(), advanced.len());
		for (a, b) in iterated.iter().zip(advanced.iter()) {
			assert_eq!(a.token().kind(), b.token().kind());
			assert_eq!(a.offset(), b.offset());
		}
	}

	#[test]
	fn test_lexer_iterator_clone() {
		// A cloned lexer must resume from the same position and produce identical cursors.
		let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "foo bar baz");

		let first = lexer.next();
		assert!(first.is_some());
		assert_eq!(first.unwrap(), Kind::Ident);

		let forked = lexer.clone();

		let rest: Vec<_> = lexer.collect();
		let forked_rest: Vec<_> = forked.collect();

		assert_eq!(rest.len(), forked_rest.len());
		for (a, b) in rest.iter().zip(forked_rest.iter()) {
			assert_eq!(a.token().kind(), b.token().kind());
			assert_eq!(a.offset(), b.offset());
		}
	}
}
334}