Skip to main content

regorus/
lexer.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4// SAFETY: Arithmetic operations in this module are safe by design:
5// 1. MAX_COL=1024 prevents column counter overflow (enforced by advance_col)
6// 2. File size is capped by MAX_FILE_BYTES at load time
7// 3. Total line count is capped by MAX_LINES at load time
8// 4. State-modifying operations (advance_col/advance_line) use checked arithmetic
9// 5. Remaining arithmetic is for bounded calculations (spans, error reporting)
10//    where operands are constrained by MAX_COL and file size/line limits
11// 6. Defensive saturating_sub used for subtractions that could theoretically underflow
12use crate::*;
13use core::{
14    cmp,
15    fmt::{self, Debug, Formatter},
16    iter::Peekable,
17    ops::Range,
18    str::CharIndices,
19};
20
21use crate::Value;
22
23use anyhow::{anyhow, bail, Result};
24
25#[inline]
26fn check_memory_limit() -> Result<()> {
27    crate::utils::limits::check_memory_limit_if_needed().map_err(|err| anyhow!(err))
28}
29
/// Widest column the lexer will track on a single line.
///
/// Guards the column counter against overflow; longer lines are treated as
/// pathological (minified/generated code or hostile input).
const MAX_COL: u32 = 1024;
/// Largest accepted policy file, in bytes (1 MiB); larger inputs are rejected early.
const MAX_FILE_BYTES: usize = 1024 * 1024;
/// Largest accepted number of lines in a policy file.
const MAX_LINES: usize = 20_000;
37
38#[inline]
39fn usize_to_u32(value: usize) -> Result<u32> {
40    u32::try_from(value).map_err(|_| anyhow!("value exceeds u32::MAX"))
41}
42
/// Converts a `(start, end)` pair of `u32` offsets into a `usize` range.
///
/// Returns `None` if either bound cannot be represented as a `usize`.
/// The bounds are not validated against each other.
#[inline]
fn span_range(start: u32, end: u32) -> Option<Range<usize>> {
    match (usize::try_from(start), usize::try_from(end)) {
        (Ok(lo), Ok(hi)) => Some(lo..hi),
        _ => None,
    }
}
49
/// Shared payload behind a [`Source`] handle.
#[derive(Clone)]
#[cfg_attr(feature = "ast", derive(serde::Serialize))]
struct SourceInternal {
    /// Path (or name) the policy was loaded under.
    pub file: String,
    /// Full text of the policy.
    pub contents: String,
    /// `(start, end)` byte offsets into `contents`, one entry per line.
    /// Not emitted when serializing.
    #[cfg_attr(feature = "ast", serde(skip_serializing))]
    pub lines: Vec<(u32, u32)>,
}
58
/// A policy file.
///
/// This is a reference-counted handle: cloning a `Source` clones the `Rc`,
/// not the file contents.
#[derive(Clone)]
#[cfg_attr(feature = "ast", derive(serde::Serialize))]
pub struct Source {
    /// Shared file name, contents and line table.
    #[cfg_attr(feature = "ast", serde(flatten))]
    src: Rc<SourceInternal>,
}
66
67impl Source {
68    /// The path associated with the policy file.
69    pub fn get_path(&self) -> &String {
70        &self.src.file
71    }
72
73    /// The contents of the policy file.
74    pub fn get_contents(&self) -> &String {
75        &self.src.contents
76    }
77}
78
79impl cmp::Ord for Source {
80    fn cmp(&self, other: &Source) -> cmp::Ordering {
81        Rc::as_ptr(&self.src).cmp(&Rc::as_ptr(&other.src))
82    }
83}
84
85impl cmp::PartialOrd for Source {
86    fn partial_cmp(&self, other: &Source) -> Option<cmp::Ordering> {
87        Some(self.cmp(other))
88    }
89}
90
91impl cmp::PartialEq for Source {
92    fn eq(&self, other: &Source) -> bool {
93        Rc::as_ptr(&self.src) == Rc::as_ptr(&other.src)
94    }
95}
96
// Equality is pointer identity of the shared allocation, which is reflexive,
// so the marker trait can be implemented without extra code.
impl cmp::Eq for Source {}
98
/// Hashes the address of the shared allocation, matching the pointer-based
/// `PartialEq`/`Ord` implementations.
#[cfg(feature = "std")]
impl core::hash::Hash for Source {
    fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
        let identity = Rc::as_ptr(&self.src);
        core::hash::Hash::hash(&identity, state);
    }
}
105
106impl Debug for Source {
107    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
108        self.src.file.fmt(f)
109    }
110}
111
/// A slice of a [`Source`], stored as the source handle plus byte offsets.
#[derive(Clone)]
pub struct SourceStr {
    /// The policy file the slice refers to.
    source: Source,
    /// Start byte offset into the source contents.
    start: u32,
    /// End byte offset (exclusive) into the source contents.
    end: u32,
}
118
119impl Debug for SourceStr {
120    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
121        self.text().fmt(f)
122    }
123}
124
125impl fmt::Display for SourceStr {
126    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
127        fmt::Display::fmt(&self.text(), f)
128    }
129}
130
131impl SourceStr {
132    pub const fn new(source: Source, start: u32, end: u32) -> Self {
133        Self { source, start, end }
134    }
135
136    pub fn text(&self) -> &str {
137        // Use safe slicing to avoid panics on malformed spans
138        span_range(self.start, self.end).map_or("<invalid-span>", |range| {
139            self.source
140                .contents()
141                .get(range)
142                .unwrap_or("<invalid-span>")
143        })
144    }
145
146    pub fn clone_empty(&self) -> SourceStr {
147        Self {
148            source: self.source.clone(),
149            start: 0,
150            end: 0,
151        }
152    }
153}
154
155impl cmp::PartialEq for SourceStr {
156    fn eq(&self, other: &Self) -> bool {
157        self.text().eq(other.text())
158    }
159}
160
// Equality is plain `str` equality of the referenced text, so `Eq` holds.
impl cmp::Eq for SourceStr {}
162
163impl cmp::PartialOrd for SourceStr {
164    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
165        Some(self.cmp(other))
166    }
167}
168
169impl cmp::Ord for SourceStr {
170    fn cmp(&self, other: &Self) -> cmp::Ordering {
171        self.text().cmp(other.text())
172    }
173}
174
175impl Source {
176    pub fn from_contents(file: String, contents: String) -> Result<Source> {
177        if contents.len() > MAX_FILE_BYTES {
178            bail!("{file} exceeds maximum allowed policy file size {MAX_FILE_BYTES} bytes");
179        }
180        let mut lines = vec![];
181        let mut prev_ch = ' ';
182        let mut prev_pos = 0_u32;
183        let mut start = 0_u32;
184        for (i, ch) in contents.char_indices() {
185            let i_u32 = usize_to_u32(i)?;
186            if ch == '\n' {
187                let end = match prev_ch {
188                    '\r' => prev_pos,
189                    _ => i_u32,
190                };
191                if lines.len() >= MAX_LINES {
192                    bail!("{file} exceeds maximum allowed line count {MAX_LINES}");
193                }
194                lines.push((start, end));
195                // Enforce the current global memory cap after recording each line span.
196                check_memory_limit()?;
197                start = i_u32.saturating_add(1);
198            }
199            prev_ch = ch;
200            prev_pos = i_u32;
201        }
202
203        let start_usize = usize::try_from(start).unwrap_or(usize::MAX);
204        if start_usize < contents.len() {
205            if lines.len() >= MAX_LINES {
206                bail!("{file} exceeds maximum allowed line count {MAX_LINES}");
207            }
208            lines.push((start, usize_to_u32(contents.len())?));
209            // Enforce the global limit after appending the final line span.
210            check_memory_limit()?;
211        } else if contents.is_empty() {
212            lines.push((0, 0));
213            // Enforce the global limit even for empty sources.
214            check_memory_limit()?;
215        } else {
216            let s = usize_to_u32(contents.len().saturating_sub(1))?;
217            if lines.len() >= MAX_LINES {
218                bail!("{file} exceeds maximum allowed line count {MAX_LINES}");
219            }
220            lines.push((s, s));
221            // Enforce the global limit after storing the trailing span.
222            check_memory_limit()?;
223        }
224        Ok(Self {
225            src: Rc::new(SourceInternal {
226                file,
227                contents,
228                lines,
229            }),
230        })
231    }
232
233    #[cfg(feature = "std")]
234    pub fn from_file<P: AsRef<std::path::Path>>(path: P) -> Result<Source> {
235        let contents = match std::fs::read_to_string(&path) {
236            Ok(c) => c,
237            Err(e) => bail!("Failed to read {}. {e}", path.as_ref().display()),
238        };
239        // TODO: retain path instead of converting to string
240        Self::from_contents(path.as_ref().to_string_lossy().to_string(), contents)
241    }
242
243    pub fn file(&self) -> &String {
244        &self.src.file
245    }
246    pub fn contents(&self) -> &String {
247        &self.src.contents
248    }
249    pub fn line(&self, idx: u32) -> &str {
250        let idx = usize::try_from(idx).unwrap_or(usize::MAX);
251        match self.src.lines.get(idx) {
252            Some(&(start, end)) => self
253                .src
254                .contents
255                .get(span_range(start, end).unwrap_or(0..0))
256                .unwrap_or(""),
257            None => "",
258        }
259    }
260
261    pub fn message(&self, line: u32, col: u32, kind: &str, msg: &str) -> String {
262        if usize::try_from(line).unwrap_or(usize::MAX) > self.src.lines.len() {
263            return format!("{}: invalid line {} specified", self.src.file, line);
264        }
265
266        let line_str = format!("{line}");
267        let line_num_width = line_str.len().saturating_add(1);
268        let col_spaces = usize::try_from(col).unwrap_or(0).saturating_sub(1);
269
270        format!(
271            "\n--> {}:{}:{}\n{:<line_num_width$}|\n\
272		{:<line_num_width$}| {}\n\
273		{:<line_num_width$}| {:<col_spaces$}^\n\
274		{}: {}",
275            self.src.file,
276            line,
277            col,
278            "",
279            line,
280            self.line(line.saturating_sub(1)),
281            "",
282            "",
283            kind,
284            msg
285        )
286    }
287
288    pub fn error(&self, line: u32, col: u32, msg: &str) -> anyhow::Error {
289        anyhow!(self.message(line, col, "error", msg))
290    }
291}
292
/// A located region of a [`Source`]: position for diagnostics plus byte offsets.
#[derive(Clone)]
#[cfg_attr(feature = "ast", derive(serde::Serialize))]
pub struct Span {
    /// The policy file this span belongs to. Not emitted when serializing.
    #[cfg_attr(feature = "ast", serde(skip_serializing))]
    pub source: Source,
    /// Line number used in diagnostics (the lexer starts counting at 1).
    pub line: u32,
    /// Column number used in diagnostics (the lexer starts counting at 1).
    pub col: u32,
    /// Start byte offset into the source contents.
    pub start: u32,
    /// End byte offset (exclusive) into the source contents.
    pub end: u32,
}
303
304impl Span {
305    pub fn text(&self) -> &str {
306        // Use safe slicing to avoid panics on malformed spans
307        span_range(self.start, self.end).map_or("<invalid-span>", |range| {
308            self.source
309                .contents()
310                .get(range)
311                .unwrap_or("<invalid-span>")
312        })
313    }
314
315    pub fn source_str(&self) -> SourceStr {
316        SourceStr::new(self.source.clone(), self.start, self.end)
317    }
318
319    pub fn message(&self, kind: &str, msg: &str) -> String {
320        self.source.message(self.line, self.col, kind, msg)
321    }
322
323    pub fn error(&self, msg: &str) -> anyhow::Error {
324        self.source.error(self.line, self.col, msg)
325    }
326}
327
328impl Debug for Span {
329    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
330        let t = self.text().escape_debug().to_string();
331        let max = 32;
332        let (txt, trailer) = if t.len() > max {
333            (&t[0..max], "...")
334        } else {
335            (t.as_str(), "")
336        };
337
338        f.write_fmt(format_args!(
339            "{}:{}:{}:{}, \"{}{}\"",
340            self.line, self.col, self.start, self.end, txt, trailer
341        ))
342    }
343}
344
/// Token kinds that only exist when the `azure-rbac` feature is enabled.
#[cfg(feature = "azure-rbac")]
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum AzureRbacTokenKind {
    At,         // @ symbol for attribute sources (@Request, @Resource, etc.)
    LogicalAnd, // && operator
    LogicalOr,  // || operator
}
352
/// Classification of a lexed [`Token`].
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum TokenKind {
    /// Punctuation and operators such as `{`, `+`, `:=`, `<=`.
    Symbol,
    /// A double-quoted string; the span excludes the quotes.
    String,
    /// A backtick-delimited raw string; the span excludes the backticks.
    RawString,
    /// A numeric literal.
    Number,
    /// An identifier made of ASCII alphanumerics and `_`.
    Ident,
    /// End of input. NOTE(review): produced outside the visible part of this file — confirm.
    Eof,
    // Azure RBAC-specific tokens
    #[cfg(feature = "azure-rbac")]
    AzureRbac(AzureRbacTokenKind),
}
365
/// A lexed token: its kind and the span of source text it covers.
#[derive(Debug, Clone)]
pub struct Token(pub TokenKind, pub Span);
368
/// Tokenizer over a [`Source`], tracking the current line and column.
#[derive(Clone)]
pub struct Lexer<'source> {
    /// Handle to the source being lexed; cloned into every produced span.
    source: Source,
    /// Cursor over the source contents, yielding `(byte offset, char)` pairs.
    iter: Peekable<CharIndices<'source>>,
    /// Current line, starting at 1.
    line: u32,
    /// Current column, starting at 1; a tab counts as 4 columns.
    col: u32,
    /// NOTE(review): consumed outside the visible part of this file;
    /// presumably lexes unrecognized characters as symbols — confirm.
    unknown_char_is_symbol: bool,
    /// Accept `\*` as an escape sequence inside double-quoted strings.
    allow_slash_star_escape: bool,
    /// Treat `//` instead of `#` as the start of a line comment.
    comment_starts_with_double_slash: bool,
    /// Lex `::` as a single symbol token.
    double_colon_token: bool,
    /// Lex `&&` and `||` as Azure RBAC logical operator tokens.
    #[cfg(feature = "azure-rbac")]
    enable_rbac_tokens: bool,
    /// NOTE(review): consumed outside the visible part of this file;
    /// presumably enables `'...'` string literals — confirm.
    #[cfg(feature = "azure-rbac")]
    allow_single_quoted_strings: bool,
}
384
385impl<'source> fmt::Debug for Lexer<'source> {
386    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
387        f.debug_struct("Lexer").finish_non_exhaustive()
388    }
389}
390
391impl<'source> Lexer<'source> {
392    pub fn new(source: &'source Source) -> Self {
393        Self {
394            source: source.clone(),
395            iter: source.contents().char_indices().peekable(),
396            line: 1,
397            col: 1,
398            unknown_char_is_symbol: false,
399            allow_slash_star_escape: false,
400            comment_starts_with_double_slash: false,
401            double_colon_token: false,
402            #[cfg(feature = "azure-rbac")]
403            enable_rbac_tokens: false,
404            #[cfg(feature = "azure-rbac")]
405            allow_single_quoted_strings: false,
406        }
407    }
408
    /// Sets the `unknown_char_is_symbol` flag.
    ///
    /// NOTE(review): the flag is read outside the visible part of this file;
    /// presumably it makes unrecognized characters lex as symbols — confirm.
    pub const fn set_unknown_char_is_symbol(&mut self, b: bool) {
        self.unknown_char_is_symbol = b;
    }
412
    /// When `b` is true, `\*` is accepted as an escape sequence inside
    /// double-quoted strings.
    pub const fn set_allow_slash_star_escape(&mut self, b: bool) {
        self.allow_slash_star_escape = b;
    }
416
    /// When `b` is true, line comments start with `//` and `#` no longer
    /// starts a comment; when false, comments start with `#`.
    pub const fn set_comment_starts_with_double_slash(&mut self, b: bool) {
        self.comment_starts_with_double_slash = b;
    }
420
    /// When `b` is true, `::` is lexed as a single symbol token instead of
    /// two `:` tokens.
    pub const fn set_double_colon_token(&mut self, b: bool) {
        self.double_colon_token = b;
    }
424
    /// When `b` is true, `&&` and `||` are lexed as Azure RBAC logical
    /// operator tokens rather than as separate `&` / `|` symbols.
    #[cfg(feature = "azure-rbac")]
    pub const fn set_enable_rbac_tokens(&mut self, b: bool) {
        self.enable_rbac_tokens = b;
    }
429
    /// Sets the `allow_single_quoted_strings` flag.
    ///
    /// NOTE(review): the flag is read outside the visible part of this file;
    /// presumably it enables `'...'` string literals — confirm.
    #[cfg(feature = "azure-rbac")]
    pub const fn set_allow_single_quoted_strings(&mut self, b: bool) {
        self.allow_single_quoted_strings = b;
    }
434
435    fn peek(&mut self) -> (usize, char) {
436        match self.iter.peek() {
437            Some(&(index, chr)) => (index, chr),
438            _ => (self.source.contents().len(), '\x00'),
439        }
440    }
441
442    #[inline]
443    fn advance_col(&mut self, delta: u32) -> Result<()> {
444        let new_col = self
445            .col
446            .checked_add(delta)
447            .filter(|&c| c <= MAX_COL)
448            .ok_or_else(|| {
449                self.source.error(
450                    self.line,
451                    self.col,
452                    &format!("line exceeds maximum column width of {MAX_COL}"),
453                )
454            })?;
455        self.col = new_col;
456        Ok(())
457    }
458
459    #[inline]
460    fn advance_line(&mut self, delta: u32) -> Result<()> {
461        self.line = self.line.checked_add(delta).ok_or_else(|| {
462            self.source
463                .error(self.line, self.col, "line number overflow")
464        })?;
465        Ok(())
466    }
467
468    fn peekahead(&mut self, n: usize) -> (usize, char) {
469        match self.iter.clone().nth(n) {
470            Some((index, chr)) => (index, chr),
471            _ => (self.source.contents().len(), '\x00'),
472        }
473    }
474
475    fn read_ident(&mut self) -> Result<Token> {
476        let start = self.peek().0;
477        let col = self.col;
478        loop {
479            let ch = self.peek().1;
480            if ch.is_ascii_alphanumeric() || ch == '_' {
481                self.iter.next();
482            } else {
483                break;
484            }
485        }
486        let end = self.peek().0;
487        self.advance_col(usize_to_u32(end.saturating_sub(start))?)?;
488        Ok(Token(
489            TokenKind::Ident,
490            Span {
491                source: self.source.clone(),
492                line: self.line,
493                col,
494                start: usize_to_u32(start)?,
495                end: usize_to_u32(end)?,
496            },
497        ))
498    }
499
500    fn read_digits(&mut self) {
501        while self.peek().1.is_ascii_digit() {
502            self.iter.next();
503        }
504    }
505
    /// Lexes a numeric literal starting at the cursor and validates it by
    /// parsing the matched text with `serde_json`.
    ///
    /// The first character is consumed unconditionally; the visible caller
    /// passes control here for `-` or `.` followed by a digit.
    ///
    /// # Errors
    /// Fails when the literal is directly followed by `_`, `.` or an ASCII
    /// alphanumeric, when `serde_json` rejects the text, when the column
    /// limit is exceeded, or when the memory-limit check fails.
    // See https://www.json.org/json-en.html for number's grammar
    fn read_number(&mut self) -> Result<Token> {
        let (start, chr) = self.peek();
        let col = self.col;
        self.iter.next();

        // Read integer part.
        if chr != '0' {
            // A leading 0 stands alone; any other first character is followed
            // by the remaining integer digits.
            self.read_digits();
        }

        // Read fraction part
        // . must be followed by at least 1 digit.
        if self.peek().1 == '.' && self.peekahead(1).1.is_ascii_digit() {
            self.iter.next(); // .
            self.read_digits();
        }

        // Read exponent part
        let exp_ch = self.peek().1;
        if exp_ch == 'e' || exp_ch == 'E' {
            self.iter.next();
            // e must be followed by an optional sign and digits
            if matches!(self.peek().1, '+' | '-') {
                self.iter.next();
            }
            // Read digits. Absence of digit will be validated by serde later.
            self.read_digits();
        }

        let end = self.peek().0;
        self.advance_col(usize_to_u32(end.saturating_sub(start))?)?;

        // Check for an invalid number. A valid number cannot be followed by
        // these characters. The error points at the offending character
        // (the column has already been advanced past the number).
        let trailing_ch = self.peek().1;
        if trailing_ch == '_' || trailing_ch == '.' || trailing_ch.is_ascii_alphanumeric() {
            return Err(self.source.error(self.line, self.col, "invalid number"));
        }

        // Ensure that the number is parsable in Rust.
        let num_slice = self
            .source
            .contents()
            .get(start..end)
            .ok_or_else(|| self.source.error(self.line, col, "invalid number span"))?;

        let parsed_number = match serde_json::from_str::<Value>(num_slice) {
            Ok(value) => value,
            Err(e) => {
                // Reduce well-known serde messages to a short, stable phrase;
                // anything else is passed through unchanged.
                let serde_msg = &e.to_string();
                let msg = match &serde_msg {
                    m if m.contains("out of range") => "out of range",
                    m if m.contains("invalid number") => "invalid number",
                    m if m.contains("expected value") => "expected value",
                    m if m.contains("trailing characters") => "trailing characters",
                    m => m.to_owned(),
                };

                bail!(
                    "{} {}",
                    self.source.error(
                        self.line,
                        col,
                        "invalid number. serde_json cannot parse number:"
                    ),
                    msg
                )
            }
        };

        // Enforce the global memory limit after serde allocates the temporary Value.
        check_memory_limit()?;
        drop(parsed_number);

        Ok(Token(
            TokenKind::Number,
            Span {
                source: self.source.clone(),
                line: self.line,
                col,
                start: usize_to_u32(start)?,
                end: usize_to_u32(end)?,
            },
        ))
    }
593
594    fn read_raw_string(&mut self) -> Result<Token> {
595        self.iter.next();
596        self.advance_col(1)?;
597        let (start, _) = self.peek();
598        let (line, col) = (self.line, self.col);
599        loop {
600            let (_, ch) = self.peek();
601            self.iter.next();
602            match ch {
603                '`' => {
604                    self.advance_col(1)?;
605                    break;
606                }
607                '\x00' => {
608                    return Err(self.source.error(line, col, "unmatched `"));
609                }
610                '\t' => self.advance_col(4)?,
611                '\n' => {
612                    self.advance_line(1)?;
613                    self.col = 1;
614                }
615                _ => self.advance_col(1)?,
616            }
617        }
618        let end = self.peek().0;
619        if end <= start {
620            // Guard against invalid span that would underflow end - 1
621            return Err(self.source.error(line, col, "invalid raw string span"));
622        }
623        check_memory_limit()?;
624        Ok(Token(
625            TokenKind::RawString,
626            Span {
627                source: self.source.clone(),
628                line,
629                col,
630                start: usize_to_u32(start)?,
631                end: usize_to_u32(end)?.saturating_sub(1),
632            },
633        ))
634    }
635
    /// Lexes a double-quoted string; the cursor must be on the opening `"`.
    ///
    /// Escape sequences are checked against the JSON set (plus `\*` when
    /// `allow_slash_star_escape` is set), control characters below U+0020 are
    /// rejected, and the whole quoted literal is finally validated by
    /// `serde_json`. The returned span covers the text between the quotes and
    /// its column points at the first character after the opening quote.
    ///
    /// # Errors
    /// Fails on an invalid escape or character, a missing closing quote, a
    /// literal `serde_json` cannot parse, an exceeded column limit, or a
    /// failed memory-limit check.
    fn read_string(&mut self) -> Result<Token> {
        // Position of the opening quote, used for "unmatched" diagnostics.
        let (line, col) = (self.line, self.col);
        self.iter.next();
        self.advance_col(1)?;
        // Byte offset of the first character after the opening quote.
        let (start, _) = self.peek();
        loop {
            let (offset, ch) = self.peek();
            match ch {
                // Closing quote or end of input; told apart after the loop.
                '"' | '\x00' => {
                    break;
                }
                '\\' => {
                    self.iter.next();
                    let (_, escape_ch) = self.peek();
                    self.iter.next();
                    match escape_ch {
                        // json escape sequence
                        '"' | '\\' | '/' | 'b' | 'f' | 'n' | 'r' | 't' => (),
                        '*' if self.allow_slash_star_escape => (),
                        'u' => {
                            // \u must be followed by exactly four hex digits.
                            for _i in 0..4 {
                                let (hex_offset, hex_ch) = self.peek();
                                let rel = usize_to_u32(hex_offset.saturating_sub(start))?;
                                let cursor_col = self.col.saturating_add(rel);
                                if !hex_ch.is_ascii_hexdigit() {
                                    return Err(self.source.error(
                                        line,
                                        cursor_col,
                                        "invalid hex escape sequence",
                                    ));
                                }
                                self.iter.next();
                            }
                        }
                        _ => {
                            // Report at the backslash that started the escape.
                            let cursor_col = self
                                .col
                                .saturating_add(usize_to_u32(offset.saturating_sub(start))?);
                            return Err(self.source.error(
                                line,
                                cursor_col,
                                "invalid escape sequence",
                            ));
                        }
                    }
                }
                _ => {
                    // check for valid json chars
                    let cursor_col = self
                        .col
                        .saturating_add(usize_to_u32(offset.saturating_sub(start))?);
                    if !('\u{0020}'..='\u{10FFFF}').contains(&ch) {
                        return Err(self.source.error(
                            line,
                            cursor_col,
                            "invalid character in string",
                        ));
                    }
                    self.iter.next();
                }
            }
        }

        // The loop also stops at end of input; only a quote closes the string.
        if self.peek().1 != '"' {
            return Err(self.source.error(line, col, "unmatched \""));
        }

        self.iter.next();
        // `end` is the byte offset just past the closing quote.
        let end = self.peek().0;
        self.advance_col(usize_to_u32(end.saturating_sub(start))?)?;

        if start == 0 || end <= start {
            // Reject invalid spans before slicing/serde to avoid panic
            return Err(self.source.error(line, col, "invalid string span"));
        }

        // Slice including both quotes so serde sees a complete JSON string.
        let str_slice = self
            .source
            .contents()
            .get(start.saturating_sub(1)..end)
            .ok_or_else(|| self.source.error(line, col, "invalid string span"))?;

        // Ensure that the string is parsable in Rust.
        match serde_json::from_str::<String>(str_slice) {
            Ok(_) => (),
            Err(e) => {
                let serde_msg = &e.to_string();
                let msg = serde_msg;
                bail!(
                    "{} {}",
                    self.source
                        .error(self.line, col, "serde_json cannot parse string:"),
                    msg
                )
            }
        }

        check_memory_limit()?;

        // The span excludes both quotes: it starts after the opening quote
        // and ends at the closing quote.
        Ok(Token(
            TokenKind::String,
            Span {
                source: self.source.clone(),
                line,
                col: col.saturating_add(1),
                start: usize_to_u32(start)?,
                end: usize_to_u32(end)?.saturating_sub(1),
            },
        ))
    }
746
747    #[cfg(feature = "azure-rbac")]
748    fn read_single_quoted_string(&mut self) -> Result<Token> {
749        let (line, col) = (self.line, self.col);
750        self.iter.next();
751        self.advance_col(1)?;
752        let (start, _) = self.peek();
753        loop {
754            let (offset, ch) = self.peek();
755            let cursor_col = self
756                .col
757                .saturating_add(usize_to_u32(offset.saturating_sub(start))?);
758            match ch {
759                '\'' | '\x00' => {
760                    break;
761                }
762                '\\' => {
763                    self.iter.next();
764                    let (_, escape_ch) = self.peek();
765                    self.iter.next();
766                    match escape_ch {
767                        // Basic escape sequences for single-quoted strings
768                        '\'' | '\\' | 'n' | 'r' | 't' => (),
769                        _ => {
770                            return Err(self.source.error(
771                                line,
772                                cursor_col,
773                                "invalid escape sequence",
774                            ))
775                        }
776                    }
777                }
778                _ => {
779                    // check for valid chars
780                    let inner_cursor_col = self
781                        .col
782                        .saturating_add(usize_to_u32(offset.saturating_sub(start))?);
783                    if !('\u{0020}'..='\u{10FFFF}').contains(&ch) {
784                        return Err(self.source.error(
785                            line,
786                            inner_cursor_col,
787                            "invalid character in string",
788                        ));
789                    }
790                    self.iter.next();
791                }
792            }
793        }
794
795        if self.peek().1 != '\'' {
796            return Err(self.source.error(line, col, "unmatched '"));
797        }
798
799        self.iter.next();
800        let end = self.peek().0;
801        self.advance_col(usize_to_u32(end.saturating_sub(start))?)?;
802
803        check_memory_limit()?;
804
805        Ok(Token(
806            TokenKind::String,
807            Span {
808                source: self.source.clone(),
809                line,
810                col: col.saturating_add(1),
811                start: usize_to_u32(start)?,
812                end: usize_to_u32(end)?.saturating_sub(1),
813            },
814        ))
815    }
816
817    #[inline]
818    fn skip_past_newline(&mut self) -> Result<()> {
819        self.iter.next();
820        loop {
821            match self.peek().1 {
822                '\n' | '\x00' => break,
823                _ => self.iter.next(),
824            };
825        }
826        Ok(())
827    }
828
829    fn skip_ws(&mut self) -> Result<()> {
830        // Only the 4 json whitespace characters are recognized.
831        // https://www.crockford.com/mckeeman.html.
832        // Additionally, comments are also skipped.
833        // A tab is considered 4 space characters.
834        loop {
835            match self.peek().1 {
836                ' ' => self.advance_col(1)?,
837                '\t' => self.advance_col(4)?,
838                '\r' => {
839                    if self.peekahead(1).1 != '\n' {
840                        return Err(self.source.error(
841                            self.line,
842                            self.col,
843                            "\\r must be followed by \\n",
844                        ));
845                    }
846                }
847                '\n' => {
848                    self.col = 1;
849                    self.advance_line(1)?;
850                }
851                '#' if !self.comment_starts_with_double_slash => {
852                    self.skip_past_newline()?;
853                    continue;
854                }
855                '/' if self.comment_starts_with_double_slash && self.peekahead(1).1 == '/' => {
856                    self.skip_past_newline()?;
857                    continue;
858                }
859                _ => break,
860            }
861            self.iter.next();
862        }
863        Ok(())
864    }
865
866    pub fn next_token(&mut self) -> Result<Token> {
867        self.skip_ws()?;
868
869        let (start, chr) = self.peek();
870        let start_u32 = usize_to_u32(start)?;
871        let col = self.col;
872
873        let token = match chr {
874            // Special case for - followed by digit which is a
875            // negative json number.
876            // . followed by digit is invalid number.
877            '-' | '.' if self.peekahead(1).1.is_ascii_digit() => self.read_number()?,
878            // grouping characters
879            '{' | '}' | '[' | ']' | '(' | ')' |
880            // arith operator
881            '+' | '-' | '*' | '/' | '%' |
882            // separators
883            ',' | ';' | '.' => {
884                self.advance_col(1)?;
885                self.iter.next();
886                Token(TokenKind::Symbol, Span {
887                    source: self.source.clone(),
888                    line: self.line,
889                    col,
890                    start: start_u32,
891                    end: start_u32.saturating_add(1),
892                })
893            }
894            #[cfg(feature = "azure-rbac")]
895            // RBAC logical AND operator (&&)
896            '&' if self.enable_rbac_tokens && self.peekahead(1).1 == '&' => {
897                self.advance_col(2)?;
898                self.iter.next();
899                self.iter.next();
900                Token(TokenKind::AzureRbac(AzureRbacTokenKind::LogicalAnd), Span {
901                    source: self.source.clone(),
902                    line: self.line,
903                    col,
904                    start: start_u32,
905                    end: start_u32.saturating_add(2),
906                })
907            }
908            #[cfg(feature = "azure-rbac")]
909            // RBAC logical OR operator (||)
910            '|' if self.enable_rbac_tokens && self.peekahead(1).1 == '|' => {
911                self.advance_col(2)?;
912                self.iter.next();
913                self.iter.next();
914                Token(TokenKind::AzureRbac(AzureRbacTokenKind::LogicalOr), Span {
915                    source: self.source.clone(),
916                    line: self.line,
917                    col,
918                    start: start_u32,
919                    end: start_u32.saturating_add(2),
920                })
921            }
922            // Generic bin operators (when RBAC tokens not enabled or single & |)
923            '&' | '|' => {
924                self.advance_col(1)?;
925                self.iter.next();
926                Token(TokenKind::Symbol, Span {
927                    source: self.source.clone(),
928                    line: self.line,
929                    col,
930                    start: start_u32,
931                    end: start_u32.saturating_add(1),
932                })
933            }
934            ':' => {
935                self.advance_col(1)?;
936                self.iter.next();
937                let mut end = start_u32.saturating_add(1);
938                if self.peek().1 == '=' || (self.peek().1 == ':' && self.double_colon_token) {
939                    self.advance_col(1)?;
940                    self.iter.next();
941                    end = end.saturating_add(1);
942                }
943                Token(TokenKind::Symbol, Span {
944                    source: self.source.clone(),
945                    line: self.line,
946                    col,
947                    start: start_u32,
948                    end,
949                })
950            }
951            // < <= > >= = ==
952            '<' | '>' | '=' => {
953                self.advance_col(1)?;
954                self.iter.next();
955                if self.peek().1 == '=' {
956                    self.advance_col(1)?;
957                    self.iter.next();
958                };
959                Token(TokenKind::Symbol, Span {
960                    source: self.source.clone(),
961                    line: self.line,
962                    col,
963                    start: start_u32,
964                    end: usize_to_u32(self.peek().0)?,
965                })
966            }
967            '!' if self.peekahead(1).1 == '=' => {
968                self.advance_col(2)?;
969                self.iter.next();
970                self.iter.next();
971                Token(TokenKind::Symbol, Span {
972                    source: self.source.clone(),
973                    line: self.line,
974                    col,
975                    start: start_u32,
976                    end: usize_to_u32(self.peek().0)?,
977                })
978            }
979            #[cfg(feature = "azure-rbac")]
980            // RBAC @ token for attribute references
981            '@' if self.enable_rbac_tokens => {
982                self.advance_col(1)?;
983                self.iter.next();
984                Token(TokenKind::AzureRbac(AzureRbacTokenKind::At), Span {
985                    source: self.source.clone(),
986                    line: self.line,
987                    col,
988                    start: start_u32,
989                    end: start_u32.saturating_add(1),
990                })
991            }
992            '"' => self.read_string()?,
993            #[cfg(feature = "azure-rbac")]
994            '\'' if self.allow_single_quoted_strings => self.read_single_quoted_string()?,
995            '`' => self.read_raw_string()?,
996            '\x00' => Token(TokenKind::Eof, Span {
997                source: self.source.clone(),
998                line: self.line,
999                col,
1000                start: start_u32,
1001                end: start_u32,
1002            }),
1003            _ if chr.is_ascii_digit() => self.read_number()?,
1004            _ if chr.is_ascii_alphabetic() || chr == '_' => {
1005                let mut ident = self.read_ident()?;
1006                if ident.1.text() == "set" && self.peek().1 == '(' {
1007                    // set immediately followed by ( is treated as set( if
1008                    // the next token is ).
1009                    let state = (self.iter.clone(), self.line, self.col);
1010                    self.iter.next();
1011
1012                    // Check if the next token is ).
1013                    let next_tok = self.next_token()?;
1014                    let is_setp = next_tok.1.text() == ")";
1015
1016                    // Restore state
1017                    (self.iter, self.line, self.col) = state;
1018
1019                    if is_setp {
1020                        self.iter.next();
1021                        self.advance_col(1)?;
1022                        ident.1.end = ident.1.end.saturating_add(1);
1023                    }
1024                }
1025                ident
1026            }
1027            _ if self.unknown_char_is_symbol => {
1028                self.advance_col(1)?;
1029                self.iter.next();
1030                Token(TokenKind::Symbol, Span {
1031                    source: self.source.clone(),
1032                    line: self.line,
1033                    col,
1034                    start: start_u32,
1035                    end: start_u32.saturating_add(1),
1036                })
1037            }
1038            _ => return Err(self.source.error(self.line, self.col, "invalid character")),
1039        };
1040
1041        check_memory_limit()?;
1042        Ok(token)
1043    }
1044}