kcl_lib/parsing/token/mod.rs

// Clippy does not agree with rustc here for some reason.
#![allow(clippy::needless_lifetimes)]

use std::{fmt, iter::Enumerate, num::NonZeroUsize, str::FromStr};

use anyhow::Result;
use parse_display::Display;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use tokeniser::Input;
use tower_lsp::lsp_types::SemanticTokenType;
use winnow::{
    self,
    error::ParseError,
    stream::{ContainsToken, Stream},
};

use crate::{
    CompilationError, ModuleId,
    errors::KclError,
    parsing::ast::types::{ItemVisibility, VariableKind},
    source_range::SourceRange,
};

mod tokeniser;

pub(crate) use tokeniser::RESERVED_WORDS;

// Note the ordering: it's important that `m` comes after `mm` and `cm`, since
// suffix detection tries these in order using `ends_with`.
pub const NUM_SUFFIXES: [&str; 10] = ["mm", "cm", "m", "inch", "in", "ft", "yd", "deg", "rad", "?"];

#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize, ts_rs::TS, JsonSchema)]
#[repr(u32)]
pub enum NumericSuffix {
    None,
    Count,
    Length,
    Angle,
    Mm,
    Cm,
    M,
    Inch,
    Ft,
    Yd,
    Deg,
    Rad,
    Unknown,
}

impl NumericSuffix {
    #[allow(dead_code)]
    pub fn is_none(self) -> bool {
        self == Self::None
    }

    pub fn is_some(self) -> bool {
        self != Self::None
    }

    pub fn digestable_id(&self) -> &[u8] {
        match self {
            NumericSuffix::None => &[],
            NumericSuffix::Count => b"_",
            NumericSuffix::Unknown => b"?",
            NumericSuffix::Length => b"Length",
            NumericSuffix::Angle => b"Angle",
            NumericSuffix::Mm => b"mm",
            NumericSuffix::Cm => b"cm",
            NumericSuffix::M => b"m",
            NumericSuffix::Inch => b"in",
            NumericSuffix::Ft => b"ft",
            NumericSuffix::Yd => b"yd",
            NumericSuffix::Deg => b"deg",
            NumericSuffix::Rad => b"rad",
        }
    }
}

impl FromStr for NumericSuffix {
    type Err = CompilationError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "_" | "Count" => Ok(NumericSuffix::Count),
            "Length" => Ok(NumericSuffix::Length),
            "Angle" => Ok(NumericSuffix::Angle),
            "mm" | "millimeters" => Ok(NumericSuffix::Mm),
            "cm" | "centimeters" => Ok(NumericSuffix::Cm),
            "m" | "meters" => Ok(NumericSuffix::M),
            "inch" | "in" => Ok(NumericSuffix::Inch),
            "ft" | "feet" => Ok(NumericSuffix::Ft),
            "yd" | "yards" => Ok(NumericSuffix::Yd),
            "deg" | "degrees" => Ok(NumericSuffix::Deg),
            "rad" | "radians" => Ok(NumericSuffix::Rad),
            "?" => Ok(NumericSuffix::Unknown),
            _ => Err(CompilationError::err(SourceRange::default(), "invalid unit of measure")),
        }
    }
}
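
// A minimal usage sketch (not part of the original file): `NumericSuffix`
// implements `FromStr`, so suffix parsing round-trips through `str::parse`.
// Everything used here is defined above in this module.
#[cfg(test)]
mod numeric_suffix_tests {
    use super::*;

    #[test]
    fn parses_suffixes() {
        // Long and short spellings map to the same variant.
        assert_eq!("mm".parse::<NumericSuffix>().unwrap(), NumericSuffix::Mm);
        assert_eq!("millimeters".parse::<NumericSuffix>().unwrap(), NumericSuffix::Mm);
        // `_` marks a count, `?` an unknown unit.
        assert_eq!("_".parse::<NumericSuffix>().unwrap(), NumericSuffix::Count);
        assert_eq!("?".parse::<NumericSuffix>().unwrap(), NumericSuffix::Unknown);
        // Anything else is a compilation error.
        assert!("furlongs".parse::<NumericSuffix>().is_err());
    }
}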

impl fmt::Display for NumericSuffix {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            NumericSuffix::None => Ok(()),
            NumericSuffix::Count => write!(f, "_"),
            NumericSuffix::Unknown => write!(f, "_?"),
            NumericSuffix::Length => write!(f, "Length"),
            NumericSuffix::Angle => write!(f, "Angle"),
            NumericSuffix::Mm => write!(f, "mm"),
            NumericSuffix::Cm => write!(f, "cm"),
            NumericSuffix::M => write!(f, "m"),
            NumericSuffix::Inch => write!(f, "in"),
            NumericSuffix::Ft => write!(f, "ft"),
            NumericSuffix::Yd => write!(f, "yd"),
            NumericSuffix::Deg => write!(f, "deg"),
            NumericSuffix::Rad => write!(f, "rad"),
        }
    }
}

#[derive(Clone, Debug, PartialEq)]
pub(crate) struct TokenStream {
    tokens: Vec<Token>,
}

impl TokenStream {
    fn new(tokens: Vec<Token>) -> Self {
        Self { tokens }
    }

    pub(super) fn remove_unknown(&mut self) -> Vec<Token> {
        let tokens = std::mem::take(&mut self.tokens);
        let (tokens, unknown_tokens): (Vec<Token>, Vec<Token>) = tokens
            .into_iter()
            .partition(|token| token.token_type != TokenType::Unknown);
        self.tokens = tokens;
        unknown_tokens
    }

    pub fn iter(&self) -> impl Iterator<Item = &Token> {
        self.tokens.iter()
    }

    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }

    pub fn as_slice(&self) -> TokenSlice {
        TokenSlice::from(self)
    }
}
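
// A minimal sketch (not part of the original file) of how `remove_unknown`
// drains unrecognised tokens while keeping the rest. Token values and offsets
// here are arbitrary, and `ModuleId::default()` is assumed to be available,
// as in this crate's other tests.
#[cfg(test)]
mod token_stream_tests {
    use super::*;

    fn tok(token_type: TokenType, value: &str) -> Token {
        Token::from_range(0..value.len(), ModuleId::default(), token_type, value.to_owned())
    }

    #[test]
    fn remove_unknown_partitions_tokens() {
        let mut stream = TokenStream::new(vec![
            tok(TokenType::Number, "1"),
            tok(TokenType::Unknown, "§"),
            tok(TokenType::Number, "2"),
        ]);
        let unknown = stream.remove_unknown();
        // The unknown token is returned; the stream keeps the valid ones.
        assert_eq!(unknown.len(), 1);
        assert_eq!(stream.iter().count(), 2);
    }
}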

impl<'a> From<&'a TokenStream> for TokenSlice<'a> {
    fn from(stream: &'a TokenStream) -> Self {
        TokenSlice {
            start: 0,
            end: stream.tokens.len(),
            stream,
        }
    }
}

impl IntoIterator for TokenStream {
    type Item = Token;

    type IntoIter = std::vec::IntoIter<Token>;

    fn into_iter(self) -> Self::IntoIter {
        self.tokens.into_iter()
    }
}

#[derive(Debug, Clone)]
pub(crate) struct TokenSlice<'a> {
    stream: &'a TokenStream,
    /// Index in the stream of the first token in this slice.
    start: usize,
    /// Index in the stream just past the last token in this slice.
    end: usize,
}

impl<'a> std::ops::Deref for TokenSlice<'a> {
    type Target = [Token];

    fn deref(&self) -> &Self::Target {
        &self.stream.tokens[self.start..self.end]
    }
}

impl<'a> TokenSlice<'a> {
    pub fn token(&self, i: usize) -> &Token {
        &self.stream.tokens[i + self.start]
    }

    pub fn iter(&self) -> impl Iterator<Item = &Token> {
        (**self).iter()
    }

    pub fn without_ends(&self) -> Self {
        Self {
            start: self.start + 1,
            end: self.end - 1,
            stream: self.stream,
        }
    }

    pub fn as_source_range(&self) -> SourceRange {
        let stream_len = self.stream.tokens.len();
        let first_token = if stream_len == self.start {
            &self.stream.tokens[self.start - 1]
        } else {
            self.token(0)
        };
        let last_token = if stream_len == self.end {
            &self.stream.tokens[stream_len - 1]
        } else {
            self.token(self.end - self.start)
        };
        SourceRange::new(first_token.start, last_token.end, last_token.module_id)
    }
}
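
// A minimal sketch (not part of the original file) of slicing behaviour:
// `without_ends` trims one token from each side, which is how parsers strip
// surrounding delimiters. Tokens are built by hand with arbitrary offsets.
#[cfg(test)]
mod token_slice_tests {
    use super::*;

    #[test]
    fn without_ends_trims_one_token_each_side() {
        let module_id = ModuleId::default();
        let stream = TokenStream::new(vec![
            Token::from_range(0..1, module_id, TokenType::Brace, "(".to_owned()),
            Token::from_range(1..2, module_id, TokenType::Number, "1".to_owned()),
            Token::from_range(2..3, module_id, TokenType::Brace, ")".to_owned()),
        ]);
        let slice = stream.as_slice();
        assert_eq!(slice.len(), 3);
        let inner = slice.without_ends();
        assert_eq!(inner.len(), 1);
        assert_eq!(inner.token(0).value, "1");
    }
}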

impl<'a> IntoIterator for TokenSlice<'a> {
    type Item = &'a Token;

    type IntoIter = std::slice::Iter<'a, Token>;

    fn into_iter(self) -> Self::IntoIter {
        self.stream.tokens[self.start..self.end].iter()
    }
}

impl<'a> Stream for TokenSlice<'a> {
    type Token = Token;
    type Slice = Self;
    type IterOffsets = Enumerate<std::vec::IntoIter<Token>>;
    type Checkpoint = Checkpoint;

    fn iter_offsets(&self) -> Self::IterOffsets {
        #[allow(clippy::unnecessary_to_owned)]
        self.to_vec().into_iter().enumerate()
    }

    fn eof_offset(&self) -> usize {
        self.len()
    }

    /// Split off the next token from the input.
    fn next_token(&mut self) -> Option<Self::Token> {
        let token = self.first()?.clone();
        self.start += 1;
        Some(token)
    }

    /// Return the next token without consuming it.
    fn peek_token(&self) -> Option<Self::Token> {
        Some(self.first()?.clone())
    }

    fn offset_for<P>(&self, predicate: P) -> Option<usize>
    where
        P: Fn(Self::Token) -> bool,
    {
        self.iter().position(|b| predicate(b.clone()))
    }

    fn offset_at(&self, tokens: usize) -> Result<usize, winnow::error::Needed> {
        if let Some(needed) = tokens.checked_sub(self.len()).and_then(NonZeroUsize::new) {
            Err(winnow::error::Needed::Size(needed))
        } else {
            Ok(tokens)
        }
    }

    /// Split off a slice of tokens from the input.
    fn next_slice(&mut self, offset: usize) -> Self::Slice {
        assert!(self.start + offset <= self.end);

        let next = TokenSlice {
            stream: self.stream,
            start: self.start,
            end: self.start + offset,
        };
        self.start += offset;
        next
    }

    /// Return a slice of tokens without consuming them.
    fn peek_slice(&self, offset: usize) -> Self::Slice {
        assert!(self.start + offset <= self.end);

        TokenSlice {
            stream: self.stream,
            start: self.start,
            end: self.start + offset,
        }
    }

    fn checkpoint(&self) -> Self::Checkpoint {
        Checkpoint(self.start, self.end)
    }

    fn reset(&mut self, checkpoint: &Self::Checkpoint) {
        self.start = checkpoint.0;
        self.end = checkpoint.1;
    }

    fn raw(&self) -> &dyn fmt::Debug {
        self
    }
}
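
// A minimal sketch (not part of the original file) of the winnow `Stream`
// contract: consuming a token advances `start`, and `reset` rewinds to a
// checkpoint. This is what lets winnow parsers backtrack over token input.
#[cfg(test)]
mod stream_impl_tests {
    use super::*;

    #[test]
    fn checkpoint_and_reset_rewind_the_slice() {
        let module_id = ModuleId::default();
        let stream = TokenStream::new(vec![
            Token::from_range(0..1, module_id, TokenType::Number, "1".to_owned()),
            Token::from_range(1..2, module_id, TokenType::Number, "2".to_owned()),
        ]);
        let mut slice = stream.as_slice();
        let checkpoint = slice.checkpoint();
        assert_eq!(slice.next_token().unwrap().value, "1");
        assert_eq!(slice.eof_offset(), 1);
        slice.reset(&checkpoint);
        // After the reset the first token is available again.
        assert_eq!(slice.peek_token().unwrap().value, "1");
        assert_eq!(slice.eof_offset(), 2);
    }
}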

impl<'a> winnow::stream::Offset for TokenSlice<'a> {
    fn offset_from(&self, start: &Self) -> usize {
        self.start - start.start
    }
}

impl<'a> winnow::stream::Offset<Checkpoint> for TokenSlice<'a> {
    fn offset_from(&self, start: &Checkpoint) -> usize {
        self.start - start.0
    }
}

impl winnow::stream::Offset for Checkpoint {
    fn offset_from(&self, start: &Self) -> usize {
        self.0 - start.0
    }
}

impl<'a> winnow::stream::StreamIsPartial for TokenSlice<'a> {
    type PartialState = ();

    fn complete(&mut self) -> Self::PartialState {}

    fn restore_partial(&mut self, _: Self::PartialState) {}

    fn is_partial_supported() -> bool {
        false
    }
}

impl<'a> winnow::stream::FindSlice<&str> for TokenSlice<'a> {
    fn find_slice(&self, substr: &str) -> Option<std::ops::Range<usize>> {
        self.iter()
            .enumerate()
            .find_map(|(i, b)| if b.value == substr { Some(i..self.end) } else { None })
    }
}

/// A saved position in a [`TokenSlice`], i.e. its `(start, end)` indices,
/// used by winnow to backtrack.
#[derive(Clone, Debug)]
pub struct Checkpoint(usize, usize);

/// The types of tokens.
#[derive(Debug, PartialEq, Eq, Copy, Clone, Display)]
#[display(style = "camelCase")]
pub enum TokenType {
    /// A number.
    Number,
    /// A word.
    Word,
    /// An operator.
    Operator,
    /// A string.
    String,
    /// A keyword.
    Keyword,
    /// A type.
    Type,
    /// A brace.
    Brace,
    /// A hash.
    Hash,
    /// A bang.
    Bang,
    /// A dollar sign.
    Dollar,
    /// Whitespace.
    Whitespace,
    /// A comma.
    Comma,
    /// A colon.
    Colon,
    /// A double colon: `::`
    DoubleColon,
    /// A period.
    Period,
    /// A double period: `..`.
    DoublePeriod,
    /// A double period and a less than: `..<`.
    DoublePeriodLessThan,
    /// A line comment.
    LineComment,
    /// A block comment.
    BlockComment,
    /// A function name.
    Function,
    /// Unknown lexemes.
    Unknown,
    /// The ? symbol, used for optional values.
    QuestionMark,
    /// The @ symbol.
    At,
    /// `;`
    SemiColon,
}

/// Most KCL tokens correspond to LSP semantic tokens (but not all).
impl TryFrom<TokenType> for SemanticTokenType {
    type Error = anyhow::Error;
    fn try_from(token_type: TokenType) -> Result<Self> {
        // If you return a new kind of `SemanticTokenType`, make sure to update `SEMANTIC_TOKEN_TYPES`
        // in the LSP implementation.
        Ok(match token_type {
            TokenType::Number => Self::NUMBER,
            TokenType::Word => Self::VARIABLE,
            TokenType::Keyword => Self::KEYWORD,
            TokenType::Type => Self::TYPE,
            TokenType::Operator => Self::OPERATOR,
            TokenType::QuestionMark => Self::OPERATOR,
            TokenType::String => Self::STRING,
            TokenType::Bang => Self::OPERATOR,
            TokenType::LineComment => Self::COMMENT,
            TokenType::BlockComment => Self::COMMENT,
            TokenType::Function => Self::FUNCTION,
            TokenType::Whitespace
            | TokenType::Brace
            | TokenType::Comma
            | TokenType::Colon
            | TokenType::DoubleColon
            | TokenType::Period
            | TokenType::DoublePeriod
            | TokenType::DoublePeriodLessThan
            | TokenType::Hash
            | TokenType::Dollar
            | TokenType::At
            | TokenType::SemiColon
            | TokenType::Unknown => {
                anyhow::bail!("unsupported token type: {:?}", token_type)
            }
        })
    }
}
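
// A minimal sketch (not part of the original file): token types with an LSP
// equivalent convert successfully; purely structural ones are rejected.
#[cfg(test)]
mod semantic_token_tests {
    use super::*;

    #[test]
    fn converts_only_semantic_token_types() {
        assert_eq!(
            SemanticTokenType::try_from(TokenType::Number).unwrap(),
            SemanticTokenType::NUMBER
        );
        // Whitespace has no LSP semantic token type.
        assert!(SemanticTokenType::try_from(TokenType::Whitespace).is_err());
    }
}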

impl TokenType {
    pub fn is_whitespace(&self) -> bool {
        matches!(self, Self::Whitespace)
    }

    pub fn is_comment(&self) -> bool {
        matches!(self, Self::LineComment | Self::BlockComment)
    }
}

#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Token {
    pub token_type: TokenType,
    /// Offset in the source code where this token begins.
    pub start: usize,
    /// Offset in the source code where this token ends.
    pub end: usize,
    pub(super) module_id: ModuleId,
    pub(super) value: String,
}

impl ContainsToken<Token> for (TokenType, &str) {
    fn contains_token(&self, token: Token) -> bool {
        self.0 == token.token_type && self.1 == token.value
    }
}

impl ContainsToken<Token> for TokenType {
    fn contains_token(&self, token: Token) -> bool {
        *self == token.token_type
    }
}

impl Token {
    pub fn from_range(
        range: std::ops::Range<usize>,
        module_id: ModuleId,
        token_type: TokenType,
        value: String,
    ) -> Self {
        Self {
            start: range.start,
            end: range.end,
            module_id,
            value,
            token_type,
        }
    }

    pub fn is_code_token(&self) -> bool {
        !matches!(
            self.token_type,
            TokenType::Whitespace | TokenType::LineComment | TokenType::BlockComment
        )
    }

    pub fn as_source_range(&self) -> SourceRange {
        SourceRange::new(self.start, self.end, self.module_id)
    }

    pub fn as_source_ranges(&self) -> Vec<SourceRange> {
        vec![self.as_source_range()]
    }

    pub fn visibility_keyword(&self) -> Option<ItemVisibility> {
        if !matches!(self.token_type, TokenType::Keyword) {
            return None;
        }
        match self.value.as_str() {
            "export" => Some(ItemVisibility::Export),
            _ => None,
        }
    }

    /// The numeric value of a number token, with any suffix stripped.
    pub fn numeric_value(&self) -> Option<f64> {
        if self.token_type != TokenType::Number {
            return None;
        }
        let value = &self.value;
        let value = value
            .split_once(|c: char| c == '_' || c.is_ascii_alphabetic())
            .map(|(s, _)| s)
            .unwrap_or(value);
        value.parse().ok()
    }

    /// Like [`Self::numeric_value`], but parsed as an unsigned integer.
    pub fn uint_value(&self) -> Option<u32> {
        if self.token_type != TokenType::Number {
            return None;
        }
        let value = &self.value;
        let value = value
            .split_once(|c: char| c == '_' || c.is_ascii_alphabetic())
            .map(|(s, _)| s)
            .unwrap_or(value);
        value.parse().ok()
    }

    /// The numeric suffix of a number token, e.g. `mm` in `42mm`.
    pub fn numeric_suffix(&self) -> NumericSuffix {
        if self.token_type != TokenType::Number {
            return NumericSuffix::None;
        }

        if self.value.ends_with('_') {
            return NumericSuffix::Count;
        }

        for suffix in NUM_SUFFIXES {
            if self.value.ends_with(suffix) {
                return suffix.parse().unwrap();
            }
        }

        NumericSuffix::None
    }

    /// Is this token the beginning of a variable/function declaration?
    /// If so, what kind?
    /// If not, returns None.
    pub fn declaration_keyword(&self) -> Option<VariableKind> {
        if !matches!(self.token_type, TokenType::Keyword) {
            return None;
        }
        Some(match self.value.as_str() {
            "fn" => VariableKind::Fn,
            "var" | "let" | "const" => VariableKind::Const,
            _ => return None,
        })
    }
}
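
// A minimal sketch (not part of the original file) of how a number token's
// value and suffix are pulled apart. The offsets are arbitrary; the real
// tokenizer keeps the suffix inside the number token's `value`.
#[cfg(test)]
mod token_value_tests {
    use super::*;

    #[test]
    fn splits_value_and_suffix() {
        let token = Token::from_range(0..4, ModuleId::default(), TokenType::Number, "42mm".to_owned());
        assert_eq!(token.numeric_value(), Some(42.0));
        assert_eq!(token.numeric_suffix(), NumericSuffix::Mm);

        // A non-number token yields neither.
        let word = Token::from_range(0..3, ModuleId::default(), TokenType::Word, "foo".to_owned());
        assert_eq!(word.numeric_value(), None);
        assert_eq!(word.numeric_suffix(), NumericSuffix::None);
    }
}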

impl From<Token> for SourceRange {
    fn from(token: Token) -> Self {
        Self::new(token.start, token.end, token.module_id)
    }
}

impl From<&Token> for SourceRange {
    fn from(token: &Token) -> Self {
        Self::new(token.start, token.end, token.module_id)
    }
}

/// Lex the given KCL source code into a stream of tokens.
pub fn lex(s: &str, module_id: ModuleId) -> Result<TokenStream, KclError> {
    tokeniser::lex(s, module_id).map_err(From::from)
}

impl From<ParseError<Input<'_>, winnow::error::ContextError>> for KclError {
    fn from(err: ParseError<Input<'_>, winnow::error::ContextError>) -> Self {
        let (input, offset): (Vec<char>, usize) = (err.input().chars().collect(), err.offset());
        let module_id = err.input().state.module_id;

        if offset >= input.len() {
            // From the winnow docs:
            //
            // This is an offset, not an index, and may point to
            // the end of input (input.len()) on eof errors.

            return KclError::new_lexical(crate::errors::KclErrorDetails::new(
                "unexpected EOF while parsing".to_owned(),
                vec![SourceRange::new(offset, offset, module_id)],
            ));
        }

        // TODO: Add the Winnow tokenizer context to the error.
        // See https://github.com/KittyCAD/modeling-app/issues/784
        let bad_token = &input[offset];
        // TODO: Add the Winnow parser context to the error.
        // See https://github.com/KittyCAD/modeling-app/issues/784
        KclError::new_lexical(crate::errors::KclErrorDetails::new(
            format!("found unknown token '{bad_token}'"),
            vec![SourceRange::new(offset, offset + 1, module_id)],
        ))
    }
}