Skip to main content

peacock_crest/
source.rs

1use pest::{Parser, RuleType, Token};
2
3use std::cell::{Cell, OnceCell};
4use std::sync::{Arc, Weak};
5
6use crate::boo::Boo;
7
8const NEWLINE_DEFINITIONS: [&str; 4] = ["\n", "\r\n", "\r", "\x0C"];
9const WHITESPACE_DEFINITIONS: [&str; 6] = [" ", "\t", "\n", "\r\n", "\r", "\x0C"];
10
11#[derive(Clone)]
12pub struct SourceInfo {
13    filename: Option<Arc<str>>,
14    source: Arc<str>,
15    newline_indices: Box<[usize]>,
16    handle: OnceCell<Weak<Self>>,
17}
18
19#[derive(Debug, Clone)]
20pub struct SourceLocation {
21    source_info: Arc<SourceInfo>,
22    pub idx: usize,
23    /// starts at 1
24    pub line: usize,
25    /// starts at 1
26    pub column: usize,
27}
28
29#[derive(Debug, Clone)]
30pub struct SourceSlice {
31    pub(crate) source_info: Arc<SourceInfo>,
32    pub(crate) start: SourceLocation,
33    pub(crate) end: SourceLocation,
34}
35
36#[derive(Debug, Clone)]
37pub(crate) struct StackInfo<T> {
38    pub source_info: Arc<SourceInfo>,
39    pub rule: T,
40    pub positions: (SourceLocation, Option<SourceLocation>),
41    pub children: Vec<StackInfo<T>>,
42}
43
44#[derive(Debug, Clone)]
45pub struct ParserToken<R: RuleType> {
46    value: ParserTokenValue<Self>,
47    rule: R,
48    start: SourceLocation,
49    end: SourceLocation,
50}
51
52#[derive(Debug, Clone)]
53pub enum ParserTokenValue<T> {
54    Leaf,
55    Internal(Vec<T>),
56}
57
58#[derive(Debug, Clone, derive_more::From)]
59pub enum ExpectError {
60    #[from]
61    SyntaxError(crate::syntax::CssExpectError),
62    #[from]
63    SelectorError(crate::selector::SelectorExpectError),
64
65    #[from]
66    Generic(String),
67}
68
69/// `TokenTracker`s are utilities for managing and parsing a sequence of `ParserToken`s.
70///
71/// This struct provides two primary features:
72/// 1. A mechanism to "fake" token pops without mutating the underlying token vector, enabling
73///    references to popped tokens to outlive the scope of the popping function.
74/// 2. Convenient parsing functions (`expect_*`) to validate and extract specific token types,
75///    returning descriptive errors (`ExpectError`) on failure.
76///
77/// Token popping is achieved by maintaining an internal offset index. Each pop operation
78/// increments this index, effectively "consuming" tokens while preserving the underlying
79/// vector's immutability. Once a token is "consumed," it cannot be revisited.
80///
81/// # Behavior
82/// - Tokens are "consumed" in order, starting from the current offset.
83/// - If an expectation fails, the offset remains unchanged.
84/// - Tokens retain their ownership within `CssTokenTracker`, ensuring their lifetime matches
85///   that of the tracker.
86///
87/// # Example Usage
88/// For examples on usage, see `crate::selector::SelectorTokenTracker` or
89/// `crate::syntax::CssTokenTracker`.
90#[derive(Debug, Clone)]
91pub struct TokenTracker<'a, R: RuleType> {
92    /// A `Boo` wrapper around a `Vec<ParserToken<R>>`. Neither `Boo` nor `TokenTracker` guarantee
93    /// that the contained vec is owned by a `TokenTracker`, but the design of `TokenTracker`
94    /// assumes that any `boo` field adheres to this contract.
95    ///
96    /// The `boo` field enables the use of either owned or borrowed tokens, reducing
97    /// redundancy in parsing implementations. It is essential that this field is only
98    /// initialized within `TokenTracker` or similar contexts that maintain this contract.
99    ///
100    /// ## Notes:
101    /// - Any modifications to the `boo` field should only occur within the
102    ///   `TokenTracker` implementation to ensure the contract remains valid.
103    /// - This abstraction avoids the need for separate types (e.g., `TokenTrackerBorrowed`)
104    ///   by enabling flexible token ownership semantics.
105    ///
106    /// Misuse of the `boo` field outside its intended context can lead to undefined behavior
107    /// in parsing operations. Use with caution and respect the ownership semantics.
108    pub(crate) boo: crate::boo::Boo<'a, Vec<ParserToken<R>>>,
109
110    pub(crate) idx: Cell<usize>,
111}
112
113// =============== IMPL ===============
114
115/// A modified binary search algorithm that will find the exact or nearest
116/// floored index of the given item.
117///
118/// Items outside the range of values in the array will return `None`.
119/// Assumes the array is already sorted, otherwise the returned value is
120/// meaningless.
121///
122/// ## Asserts
123/// ```rust
124/// fn floored_binary_index(arr: &[usize], item: usize) -> Option<usize> {
125///     let length = arr.len();
126///
127///     if item < arr[0] || item > arr[length - 1] {
128///         return None;
129///     }
130///
131///     let mut left = 0;
132///     let mut right = length - 1;
133///     let mut middle = length / 2;
134///
135///     let mut current = arr[middle];
136///     while left <= right {
137///         match current.cmp(&item) {
138///             std::cmp::Ordering::Less => left = middle + 1,
139///             std::cmp::Ordering::Equal => return Some(middle),
140///             std::cmp::Ordering::Greater => right = middle - 1,
141///         }
142///         middle = (right + left) / 2;
143///         current = arr[middle];
144///     }
145///
146///     Some(middle)
147/// }
148///
149/// assert!(floored_binary_index(&[1, 3, 5, 9, 15], 0).is_none(), "Should fail because 0 is out of the range 0-15");
150/// assert!(floored_binary_index(&[1, 3, 5, 9, 15], 16).is_none(), "Should fail because 16 is out of the range 0-15");
151///
152/// assert_eq!(floored_binary_index(&[1, 3, 5, 9, 15], 2).unwrap(), 0);
153/// //                                  ^ would be here, so use previous index: 0
154///
155/// assert_eq!(floored_binary_index(&[1, 3, 5, 9, 15], 4).unwrap(), 1);
156/// //                                     ^ would be here, so use previous index: 1
157///
158/// assert_eq!(floored_binary_index(&[1, 3, 5, 9, 15], 6).unwrap(), 2);
159/// //                                        ^ would be here, so use previous index: 2
160///
161/// assert_eq!(floored_binary_index(&[1, 3, 5, 9, 15], 8).unwrap(), 2);
162/// //                                        ^ would be here, so use previous index: 2
163///
164/// assert_eq!(floored_binary_index(&[1, 3, 5, 9, 15], 14).unwrap(), 3);
165/// //   would be here, so use previous index: 3 ^
166///
167/// assert_eq!(floored_binary_index(&[1, 3, 5, 9, 15], 15).unwrap(), 4);
168/// ```
169fn floored_binary_index(arr: &[usize], item: usize) -> Option<usize> {
170    let length = arr.len();
171
172    if item < arr[0] || item > arr[length - 1] {
173        return None;
174    }
175
176    let mut left = 0;
177    let mut right = length - 1;
178    let mut middle = length / 2;
179
180    let mut current = arr[middle];
181    while left <= right {
182        match current.cmp(&item) {
183            std::cmp::Ordering::Less => left = middle + 1,
184            std::cmp::Ordering::Equal => return Some(middle),
185            std::cmp::Ordering::Greater => right = middle - 1,
186        }
187        middle = (right + left) / 2;
188        current = arr[middle];
189    }
190
191    Some(middle)
192}
193
194impl SourceInfo {
195    pub fn new(source: Arc<str>) -> Arc<Self> {
196        let mut newline_indices = vec![0];
197
198        for i in 0..(source.len() - 1) {
199            if NEWLINE_DEFINITIONS
200                .iter()
201                .any(|&x| source[i..i + 2].starts_with(x))
202            {
203                newline_indices.push(i);
204            }
205        }
206
207        let new = Self {
208            filename: None,
209            source: source.into(),
210            newline_indices: newline_indices.into_boxed_slice(),
211            handle: OnceCell::new(),
212        };
213
214        let arc: Arc<Self> = Arc::new(new);
215        arc.handle
216            .set(Arc::downgrade(&arc))
217            .expect("OnceCell should only be initialized once");
218        arc
219    }
220
221    pub fn from_file(filepath: &std::path::Path) -> Arc<Self> {
222        let mut newline_indices = vec![0];
223
224        let source_raw: String =
225            std::fs::read_to_string(filepath).expect(&format!("Failed to read file {filepath:?}"));
226        let source: Arc<str> = source_raw.as_str().into();
227
228        let max_length = NEWLINE_DEFINITIONS.iter().map(|x| x.len()).max().unwrap();
229        for i in 0..(source.len() - 1) {
230            if NEWLINE_DEFINITIONS
231                .iter()
232                .any(|&x| source[i..i + max_length].starts_with(x))
233            {
234                newline_indices.push(i);
235            }
236        }
237
238        let new = Self {
239            filename: Some(Arc::from(filepath.to_string_lossy().into_owned())),
240            source,
241            newline_indices: newline_indices.into_boxed_slice(),
242            handle: OnceCell::new(),
243        };
244
245        let arc: Arc<Self> = Arc::new(new);
246        arc.handle
247            .set(Arc::downgrade(&arc))
248            .expect("OnceCell should only be initialized once");
249        arc
250    }
251
252    #[inline]
253    pub fn get_handle(&self) -> Arc<Self> {
254        self.handle.get().unwrap().upgrade().unwrap()
255    }
256
257    pub fn location_from_idx(&self, idx: usize) -> SourceLocation {
258        let mut line = floored_binary_index(&self.newline_indices, idx);
259        let line = line
260            .map(|x| x + 1)
261            .or_else(|| {
262                if idx < self.newline_indices[0] {
263                    Some(1)
264                } else {
265                    Some(self.newline_indices.len())
266                }
267            })
268            .unwrap();
269
270        if self.newline_indices[line - 1] > idx {
271            panic!(
272                "{} > {idx}\n{:?}",
273                self.newline_indices[line - 1],
274                self.newline_indices
275            );
276        }
277
278        SourceLocation {
279            source_info: self.get_handle(),
280            idx,
281            line,
282            column: idx - self.newline_indices[line - 1] + 1,
283        }
284    }
285}
286
287impl SourceLocation {
288    pub fn slice(&self, other: &Self) -> SourceSlice {
289        assert!(
290            Arc::ptr_eq(&self.source_info, &other.source_info),
291            "Cannot slice between 2 different sources!"
292        );
293        SourceSlice {
294            source_info: self.source_info.clone(),
295            start: self.clone(),
296            end: other.clone(),
297        }
298    }
299}
300
301impl SourceSlice {
302    #[inline]
303    pub fn get(&self) -> &str {
304        &self.source_info.source[self.start.idx..self.end.idx]
305    }
306}
307
308impl std::fmt::Debug for SourceInfo {
309    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
310        write!(f, "SourceInfo {{ ... }}")
311    }
312}
313
314impl std::fmt::Display for SourceSlice {
315    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
316        write!(f, "{}", self.get())
317    }
318}
319
320impl std::ops::Deref for SourceSlice {
321    type Target = str;
322
323    fn deref(&self) -> &Self::Target {
324        self.get()
325    }
326}
327
328impl std::cmp::PartialEq for SourceSlice {
329    fn eq(&self, other: &Self) -> bool {
330        Arc::ptr_eq(&self.source_info, &other.source_info)
331            && self.start.idx == other.start.idx
332            && self.end.idx == other.end.idx
333    }
334}
335
336impl std::cmp::Eq for SourceSlice {}
337
338impl std::hash::Hash for SourceSlice {
339    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
340        Arc::as_ptr(&self.source_info.source).hash(state);
341        self.start.idx.hash(state);
342        self.end.idx.hash(state);
343    }
344}
345
346impl<R: RuleType> StackInfo<R> {
347    fn new(source_info: Arc<SourceInfo>, rule: R, pos: SourceLocation) -> Self {
348        Self {
349            source_info,
350            rule,
351            positions: (pos, None),
352            children: Vec::new(),
353        }
354    }
355}
356
357impl<R: RuleType> ParserToken<R> {
358    fn new(value: StackInfo<R>) -> Self {
359        let token_val = if !value.children.is_empty() {
360            let mut children: Vec<Self> = value
361                .children
362                .into_iter()
363                .map(|x| Self::new(x))
364                .collect::<Vec<_>>();
365            ParserTokenValue::Internal(children)
366        } else {
367            ParserTokenValue::Leaf
368        };
369
370        Self {
371            value: token_val,
372            rule: value.rule,
373            start: value.positions.0,
374            end: value.positions.1.unwrap(),
375        }
376    }
377
378    pub fn is_leaf(&self) -> bool {
379        matches!(self.value, ParserTokenValue::Leaf)
380    }
381
382    pub fn get_source(&self) -> SourceSlice {
383        self.start.slice(&self.end)
384    }
385
386    pub fn get_indices(&self) -> (SourceLocation, SourceLocation) {
387        (self.start.clone(), self.end.clone())
388    }
389
390    pub fn get_rule(&self) -> R {
391        self.rule
392    }
393
394    pub fn get_children(&self) -> Option<&Vec<ParserToken<R>>> {
395        match &self.value {
396            ParserTokenValue::Leaf => None,
397            ParserTokenValue::Internal(vec) => Some(vec),
398        }
399    }
400}
401
402impl<'a, R: RuleType> TokenTracker<'a, R> {
403    /// Creates a new `SelectorTokenTracker` from a `Boo` wrapping a vector of `SelectorToken`s.
404    pub fn new(tokens_boo: Boo<'a, Vec<ParserToken<R>>>) -> Self {
405        Self {
406            boo: tokens_boo,
407            idx: Cell::new(0),
408        }
409    }
410
411    /// Retrieves a reference to the next token in the vector without consuming it.
412    ///
413    /// This method allows peeking at the current token for introspection, such as determining
414    /// its type or length of child tokens, without advancing the offset index.
415    #[inline]
416    pub fn peek(&self) -> Option<&ParserToken<R>> {
417        self.boo.get(self.idx.get())
418    }
419
420    #[inline]
421    pub(crate) fn fail_because<O, E>(&self, error: E) -> Result<O, E> {
422        self.idx.set(0.max(self.idx.get() - 1));
423        Err(error)
424    }
425
426    #[inline]
427    pub(crate) fn get_location(&self) -> Option<SourceSlice> {
428        self.boo
429            .get_ref()
430            .get(self.idx.get())
431            .or_else(|| self.boo.get_ref().get(self.idx.get() - 1))
432            .map(|x| x.get_source())
433    }
434
435    /// Consumes the next token and returns a reference to it, or `None` if no tokens remain.
436    ///
437    /// This increments the internal offset index, marking the token as consumed.
438    pub(crate) fn pop_front(&'a self) -> Option<&'a ParserToken<R>> {
439        let result = self.boo.get(self.idx.get());
440        if result.is_some() {
441            self.idx.set(self.idx.get() + 1);
442        }
443        result
444    }
445
446    /// Consumes and returns references to the next `count` tokens, or `None` if fewer tokens
447    /// remain than requested.
448    ///
449    /// This method advances the offset index by `count` if successful.
450    pub(crate) fn pop_front_count(&'a self, count: usize) -> Option<Box<[&'a ParserToken<R>]>> {
451        if self.boo.len() < self.idx.get() + count {
452            return None;
453        }
454
455        let mut tokens = Vec::new();
456        tokens.reserve(count);
457
458        for i in 0..count {
459            tokens.push(self.boo.get(self.idx.get() + i).unwrap());
460        }
461
462        self.idx.set(self.idx.get() + count);
463        Some(tokens.into_boxed_slice())
464    }
465
466    /// Returns the number of unconsumed tokens remaining in the vector.
467    #[inline]
468    pub fn len(&self) -> usize {
469        self.boo.len() - self.idx.get()
470    }
471
472    /// Returns true if there are still consumable tokens, else returns false
473    #[inline]
474    pub fn is_empty(&self) -> bool {
475        self.boo.is_empty() || self.len() == 0
476    }
477}
478
479pub fn parse_source<R: RuleType, P: Parser<R>>(
480    source_info: Arc<SourceInfo>,
481    rule: R,
482) -> Result<ParserToken<R>, pest::error::Error<R>> {
483    let pairs = P::parse(rule, &source_info.source)?;
484
485    let mut tokens = pairs.tokens();
486
487    let mut stack: Vec<StackInfo<R>> = Vec::new();
488    let mut root: StackInfo<R> = {
489        match tokens.next().unwrap() {
490            Token::Start { rule, pos } => {
491                let pos = source_info.location_from_idx(pos.pos());
492                StackInfo::new(source_info.clone(), rule.clone(), pos)
493            }
494            _ => unreachable!(),
495        }
496    };
497
498    for token in tokens {
499        match token {
500            Token::Start { rule, pos } => {
501                let pos = source_info.location_from_idx(pos.pos());
502                stack.push(StackInfo::new(source_info.clone(), rule.clone(), pos));
503            }
504            Token::End { rule, pos } => {
505                if let Some(mut css_token) = stack.pop() {
506                    css_token.positions.1 = Some(source_info.location_from_idx(pos.pos()));
507
508                    if let Some(parent) = stack.last_mut() {
509                        parent.children.push(css_token);
510                    } else {
511                        root.children.push(css_token);
512                    }
513                } else {
514                    root.positions.1 = Some(source_info.location_from_idx(pos.pos()));
515                }
516            }
517        }
518    }
519
520    Ok(ParserToken::new(root))
521}