// nodedb_sql/parser/array_stmt/lexer.rs
1// SPDX-License-Identifier: Apache-2.0
2
3//! Tiny hand-written tokenizer for array DDL/DML.
4//!
5//! Scope is limited to the four `ARRAY` statements — identifiers,
6//! integer/float literals, single-quoted strings, parentheses, brackets,
7//! commas, dots, double-dots, and a handful of keywords. Keywords are
8//! left as `Ident` tokens; the parser handles them case-insensitively.
9
10use crate::error::SqlError;
11
/// Token kinds produced by [`tokenize`].
#[derive(Debug, Clone, PartialEq)]
pub enum Tok {
    /// Bare identifier or keyword — keywords are resolved by the parser,
    /// case-insensitively (see module docs).
    Ident(String),
    /// Decimal integer literal, optionally negative.
    Int(i64),
    /// Decimal float literal, optionally negative.
    Float(f64),
    /// Single-quoted string literal with `''` escapes already collapsed
    /// to a single `'`.
    Str(String),
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `..` — range separator, e.g. `[0..23]`.
    DotDot,
    /// `=` — used by `WITH (prefix_bits = N)`.
    Eq,
    /// `NULL` literal — kept distinct from `Ident` so insert parsing is
    /// unambiguous about a bare `NULL` value vs an identifier value.
    Null,
}
30
/// A token kind paired with where it came from in the source string.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// The token kind and its payload, if any.
    pub tok: Tok,
    /// Byte offset into the source string — used for error messages.
    pub pos: usize,
}
37
38/// Tokenize a SQL slice. Returns `Vec<Token>` on success; any unexpected
39/// character produces `SqlError::Parse`.
40pub fn tokenize(src: &str) -> Result<Vec<Token>, SqlError> {
41    let bytes = src.as_bytes();
42    let mut out = Vec::with_capacity(src.len() / 4);
43    let mut i = 0;
44    while i < bytes.len() {
45        let b = bytes[i];
46        // Whitespace.
47        if b.is_ascii_whitespace() {
48            i += 1;
49            continue;
50        }
51        // Single-line comment `-- ...`.
52        if b == b'-' && i + 1 < bytes.len() && bytes[i + 1] == b'-' {
53            while i < bytes.len() && bytes[i] != b'\n' {
54                i += 1;
55            }
56            continue;
57        }
58        // Punctuation.
59        match b {
60            b'(' => {
61                out.push(Token {
62                    tok: Tok::LParen,
63                    pos: i,
64                });
65                i += 1;
66                continue;
67            }
68            b')' => {
69                out.push(Token {
70                    tok: Tok::RParen,
71                    pos: i,
72                });
73                i += 1;
74                continue;
75            }
76            b'[' => {
77                out.push(Token {
78                    tok: Tok::LBracket,
79                    pos: i,
80                });
81                i += 1;
82                continue;
83            }
84            b']' => {
85                out.push(Token {
86                    tok: Tok::RBracket,
87                    pos: i,
88                });
89                i += 1;
90                continue;
91            }
92            b',' => {
93                out.push(Token {
94                    tok: Tok::Comma,
95                    pos: i,
96                });
97                i += 1;
98                continue;
99            }
100            b'.' if i + 1 < bytes.len() && bytes[i + 1] == b'.' => {
101                out.push(Token {
102                    tok: Tok::DotDot,
103                    pos: i,
104                });
105                i += 2;
106                continue;
107            }
108            b'=' => {
109                out.push(Token {
110                    tok: Tok::Eq,
111                    pos: i,
112                });
113                i += 1;
114                continue;
115            }
116            _ => {}
117        }
118        // Single-quoted string.
119        if b == b'\'' {
120            let start = i + 1;
121            let mut j = start;
122            let mut s = String::new();
123            while j < bytes.len() {
124                if bytes[j] == b'\'' {
125                    // Doubled `''` → literal single quote.
126                    if j + 1 < bytes.len() && bytes[j + 1] == b'\'' {
127                        s.push('\'');
128                        j += 2;
129                        continue;
130                    }
131                    break;
132                }
133                s.push(bytes[j] as char);
134                j += 1;
135            }
136            if j >= bytes.len() {
137                return Err(SqlError::Parse {
138                    detail: format!("unterminated string literal at offset {i}"),
139                });
140            }
141            out.push(Token {
142                tok: Tok::Str(s),
143                pos: i,
144            });
145            i = j + 1;
146            continue;
147        }
148        // Number (int or float, with optional leading `-`).
149        if b.is_ascii_digit() || (b == b'-' && i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit())
150        {
151            let start = i;
152            let mut j = i;
153            if bytes[j] == b'-' {
154                j += 1;
155            }
156            while j < bytes.len() && bytes[j].is_ascii_digit() {
157                j += 1;
158            }
159            // Float? Require a digit after the dot — `..` is a range token.
160            let is_float = j + 1 < bytes.len()
161                && bytes[j] == b'.'
162                && bytes[j + 1] != b'.'
163                && bytes[j + 1].is_ascii_digit();
164            if is_float {
165                j += 1;
166                while j < bytes.len() && bytes[j].is_ascii_digit() {
167                    j += 1;
168                }
169                let txt = &src[start..j];
170                let f: f64 = txt.parse().map_err(|_| SqlError::Parse {
171                    detail: format!("invalid float literal '{txt}'"),
172                })?;
173                out.push(Token {
174                    tok: Tok::Float(f),
175                    pos: start,
176                });
177            } else {
178                let txt = &src[start..j];
179                let n: i64 = txt.parse().map_err(|_| SqlError::Parse {
180                    detail: format!("invalid integer literal '{txt}'"),
181                })?;
182                out.push(Token {
183                    tok: Tok::Int(n),
184                    pos: start,
185                });
186            }
187            i = j;
188            continue;
189        }
190        // Identifier — letters / digits / underscore. Identifiers may
191        // start with `_` or letter.
192        if b == b'_' || b.is_ascii_alphabetic() {
193            let start = i;
194            let mut j = i;
195            while j < bytes.len() && (bytes[j] == b'_' || bytes[j].is_ascii_alphanumeric()) {
196                j += 1;
197            }
198            let txt = &src[start..j];
199            if txt.eq_ignore_ascii_case("NULL") {
200                out.push(Token {
201                    tok: Tok::Null,
202                    pos: start,
203                });
204            } else {
205                out.push(Token {
206                    tok: Tok::Ident(txt.to_string()),
207                    pos: start,
208                });
209            }
210            i = j;
211            continue;
212        }
213        return Err(SqlError::Parse {
214            detail: format!("unexpected character '{}' at offset {i}", b as char),
215        });
216    }
217    Ok(out)
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `src` and strip positions, leaving just the token kinds.
    fn kinds(src: &str) -> Vec<Tok> {
        tokenize(src).unwrap().into_iter().map(|t| t.tok).collect()
    }

    #[test]
    fn tokenize_simple() {
        let toks = kinds("CREATE ARRAY a (1, 2.5, 'x')");
        assert_eq!(toks[0], Tok::Ident("CREATE".to_string()));
        assert_eq!(toks[1], Tok::Ident("ARRAY".to_string()));
        assert_eq!(toks[3], Tok::LParen);
        assert_eq!(toks[4], Tok::Int(1));
        // 2.5 is exactly representable, so direct equality is safe.
        assert_eq!(toks[6], Tok::Float(2.5));
        assert_eq!(toks[8], Tok::Str("x".to_string()));
    }

    #[test]
    fn tokenize_dotdot_range() {
        let toks = kinds("[0..23]");
        assert_eq!(&toks[1..4], &[Tok::Int(0), Tok::DotDot, Tok::Int(23)]);
    }

    #[test]
    fn tokenize_negative() {
        assert_eq!(kinds("-7"), vec![Tok::Int(-7)]);
    }
}