Skip to main content

nxs/
lexer.rs

1use crate::error::{NxsError, Result};
2
/// A lexical token produced by [`Lexer::tokenize`].
///
/// Value-carrying variants correspond to a sigil-prefixed literal in the
/// source text; the remaining variants are structural punctuation,
/// identifiers and the end-of-input marker.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// Integer literal (`=` sigil).
    Int(i64),
    /// Floating-point literal (`~` sigil).
    Float(f64),
    /// Boolean literal (`?` sigil).
    Bool(bool),
    /// Keyword name (`$` sigil).
    Keyword(String),
    /// Double-quoted string with escapes already resolved.
    Str(String),
    /// Timestamp (`@` sigil followed by a digit), in unix nanoseconds.
    Time(i64),
    /// Binary literal (`<hex digits>`), decoded to bytes.
    Binary(Vec<u8>),
    /// Link offset (`&` sigil).
    Link(i32),
    /// Macro reference (`@name`) or macro expression (`!` sigil), kept as raw text.
    Macro(String),
    /// Null literal (`^` sigil).
    Null,

    /// Identifier, used for key names.
    Ident(String),
    /// `:`
    Colon,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `(`
    LParen,
    /// `)`
    RParen,

    /// End of input; always the last token of a successful tokenization.
    Eof,
}
30
/// Character-level scanner that turns source text into a list of tokens.
pub struct Lexer {
    /// The whole input, pre-split into `char`s so the cursor can index it directly.
    input: Vec<char>,
    /// Index into `input` of the next character to be read.
    pos: usize,
}
35
36impl Lexer {
37    pub fn new(input: &str) -> Self {
38        Lexer {
39            input: input.chars().collect(),
40            pos: 0,
41        }
42    }
43
44    fn peek(&self) -> Option<char> {
45        self.input.get(self.pos).copied()
46    }
47
48    fn advance(&mut self) -> Option<char> {
49        let c = self.input.get(self.pos).copied();
50        self.pos += 1;
51        c
52    }
53
54    fn skip_whitespace_and_comments(&mut self) {
55        while let Some(c) = self.peek() {
56            if c == '#' {
57                while let Some(c) = self.peek() {
58                    self.advance();
59                    if c == '\n' {
60                        break;
61                    }
62                }
63            } else if c.is_whitespace() {
64                self.advance();
65            } else {
66                break;
67            }
68        }
69    }
70
71    fn read_while<F: Fn(char) -> bool>(&mut self, pred: F) -> String {
72        let mut s = String::new();
73        while let Some(c) = self.peek() {
74            if pred(c) {
75                s.push(c);
76                self.advance();
77            } else {
78                break;
79            }
80        }
81        s
82    }
83
84    fn read_string(&mut self) -> Result<String> {
85        // opening `"` already consumed
86        let mut s = String::new();
87        loop {
88            match self.advance() {
89                None => return Err(NxsError::ParseError("unterminated string".into())),
90                Some('"') => break,
91                Some('\\') => match self.advance() {
92                    Some('\\') => s.push('\\'),
93                    Some('"') => s.push('"'),
94                    Some('n') => s.push('\n'),
95                    Some('r') => s.push('\r'),
96                    Some('t') => s.push('\t'),
97                    Some('0') => s.push('\0'),
98                    Some('u') => {
99                        let hex: String = (0..4).filter_map(|_| self.advance()).collect();
100                        let code = u32::from_str_radix(&hex, 16)
101                            .map_err(|_| NxsError::ParseError(format!("bad \\u escape: {hex}")))?;
102                        let ch = char::from_u32(code).ok_or_else(|| {
103                            NxsError::ParseError(format!("invalid unicode: {code}"))
104                        })?;
105                        s.push(ch);
106                    }
107                    Some('U') => {
108                        let hex: String = (0..8).filter_map(|_| self.advance()).collect();
109                        let code = u32::from_str_radix(&hex, 16)
110                            .map_err(|_| NxsError::ParseError(format!("bad \\U escape: {hex}")))?;
111                        let ch = char::from_u32(code).ok_or_else(|| {
112                            NxsError::ParseError(format!("invalid unicode: {code}"))
113                        })?;
114                        s.push(ch);
115                    }
116                    Some(c) => return Err(NxsError::BadEscape(c)),
117                    None => return Err(NxsError::ParseError("unterminated escape".into())),
118                },
119                Some(c) => s.push(c),
120            }
121        }
122        Ok(s)
123    }
124
125    fn read_binary(&mut self) -> Result<Vec<u8>> {
126        // opening `<` already consumed; expect hex digits until `>`
127        let mut hex = String::new();
128        loop {
129            match self.advance() {
130                Some('>') => break,
131                Some(c) if c.is_ascii_hexdigit() || c.is_whitespace() => {
132                    if c.is_ascii_hexdigit() {
133                        hex.push(c);
134                    }
135                }
136                Some(c) => {
137                    return Err(NxsError::ParseError(format!(
138                        "unexpected char in binary: '{c}'"
139                    )));
140                }
141                None => return Err(NxsError::ParseError("unterminated binary literal".into())),
142            }
143        }
144        if hex.len() % 2 != 0 {
145            return Err(NxsError::ParseError(
146                "binary hex must have even number of digits".into(),
147            ));
148        }
149        (0..hex.len())
150            .step_by(2)
151            .map(|i| {
152                u8::from_str_radix(&hex[i..i + 2], 16)
153                    .map_err(|_| NxsError::ParseError(format!("bad hex byte: {}", &hex[i..i + 2])))
154            })
155            .collect()
156    }
157
158    fn read_macro_expr(&mut self) -> String {
159        // consume to end of line or comma or closing brace
160        let mut s = String::new();
161        while let Some(c) = self.peek() {
162            if c == '\n' || c == ',' || c == '}' {
163                break;
164            }
165            s.push(c);
166            self.advance();
167        }
168        s.trim().to_string()
169    }
170
171    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
172        let mut tokens = Vec::new();
173        loop {
174            self.skip_whitespace_and_comments();
175            match self.peek() {
176                None => {
177                    tokens.push(Token::Eof);
178                    break;
179                }
180                Some(c) => {
181                    self.advance();
182                    let tok = match c {
183                        '{' => Token::LBrace,
184                        '}' => Token::RBrace,
185                        '[' => Token::LBracket,
186                        ']' => Token::RBracket,
187                        '(' => Token::LParen,
188                        ')' => Token::RParen,
189                        ':' => Token::Colon,
190                        ',' => Token::Comma,
191
192                        // Sigils
193                        '=' => {
194                            let neg = if self.peek() == Some('-') {
195                                self.advance();
196                                true
197                            } else {
198                                false
199                            };
200                            let s = self.read_while(|c| c.is_ascii_digit());
201                            let n: i64 = s
202                                .parse()
203                                .map_err(|_| NxsError::ParseError(format!("bad int: {s}")))?;
204                            Token::Int(if neg { -n } else { n })
205                        }
206                        '~' => {
207                            let neg = if self.peek() == Some('-') {
208                                self.advance();
209                                true
210                            } else {
211                                false
212                            };
213                            let s = self.read_while(|c| {
214                                c.is_ascii_digit()
215                                    || c == '.'
216                                    || c == 'e'
217                                    || c == 'E'
218                                    || c == '+'
219                                    || c == '-'
220                            });
221                            let f: f64 = s
222                                .parse()
223                                .map_err(|_| NxsError::ParseError(format!("bad float: {s}")))?;
224                            Token::Float(if neg { -f } else { f })
225                        }
226                        '?' => {
227                            let s = self.read_while(|c| c.is_alphabetic());
228                            match s.as_str() {
229                                "true" => Token::Bool(true),
230                                "false" => Token::Bool(false),
231                                _ => return Err(NxsError::ParseError(format!("bad bool: {s}"))),
232                            }
233                        }
234                        '$' => {
235                            let s = self.read_while(|c| c.is_alphanumeric() || c == '_');
236                            Token::Keyword(s)
237                        }
238                        '"' => Token::Str(self.read_string()?),
239                        '@' => {
240                            // peek: if digit, it's a timestamp; otherwise it's a macro ref (handled in parser)
241                            if self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
242                                // parse ISO-8601 date as nanoseconds: YYYY-MM-DD or full RFC3339
243                                let s = self.read_while(|c| {
244                                    !c.is_whitespace() && c != ',' && c != '}' && c != ']'
245                                });
246                                let ns = parse_temporal(&s)?;
247                                Token::Time(ns)
248                            } else {
249                                // macro ref — return @ + ident as a raw string for the macro parser
250                                let ident = self.read_while(|c| c.is_alphanumeric() || c == '_');
251                                Token::Macro(format!("@{ident}"))
252                            }
253                        }
254                        '<' => Token::Binary(self.read_binary()?),
255                        '&' => {
256                            let neg = if self.peek() == Some('-') {
257                                self.advance();
258                                true
259                            } else {
260                                false
261                            };
262                            let s = self.read_while(|c| c.is_ascii_digit());
263                            let n: i32 = s.parse().map_err(|_| {
264                                NxsError::ParseError(format!("bad link offset: {s}"))
265                            })?;
266                            Token::Link(if neg { -n } else { n })
267                        }
268                        '!' => Token::Macro(self.read_macro_expr()),
269                        '^' => Token::Null,
270
271                        // Identifier (key name)
272                        c if c.is_alphabetic() || c == '_' => {
273                            let mut s = c.to_string();
274                            s.push_str(
275                                &self.read_while(|c| c.is_alphanumeric() || c == '_' || c == '-'),
276                            );
277                            Token::Ident(s)
278                        }
279
280                        other => return Err(NxsError::UnknownSigil(other)),
281                    };
282                    tokens.push(tok);
283                }
284            }
285        }
286        Ok(tokens)
287    }
288}
289
290fn parse_temporal(s: &str) -> Result<i64> {
291    // Support YYYY-MM-DD
292    if s.len() == 10 && s.chars().nth(4) == Some('-') {
293        let year: i64 = s[0..4]
294            .parse()
295            .map_err(|_| NxsError::ParseError(format!("bad date: {s}")))?;
296        let month: i64 = s[5..7]
297            .parse()
298            .map_err(|_| NxsError::ParseError(format!("bad date: {s}")))?;
299        let day: i64 = s[8..10]
300            .parse()
301            .map_err(|_| NxsError::ParseError(format!("bad date: {s}")))?;
302        // Days since epoch (very simplified, good enough for POC)
303        let days = days_since_epoch(year, month, day);
304        return days
305            .checked_mul(86_400_000_000_000i64)
306            .ok_or_else(|| NxsError::ParseError(format!("temporal overflow: {s}")))
307            .map(Some)
308            .map(|v| v.unwrap());
309    }
310    // Support YYYY-MM-DDTHH:MM:SS[.fraction] without timezone.
311    if s.len() >= 19 && s.as_bytes().get(4) == Some(&b'-') && s.as_bytes().get(10) == Some(&b'T') {
312        let date_ns = parse_temporal(&s[..10])?;
313        let hour: i64 = s[11..13]
314            .parse()
315            .map_err(|_| NxsError::ParseError(format!("bad temporal: {s}")))?;
316        let minute: i64 = s[14..16]
317            .parse()
318            .map_err(|_| NxsError::ParseError(format!("bad temporal: {s}")))?;
319        let second: i64 = s[17..19]
320            .parse()
321            .map_err(|_| NxsError::ParseError(format!("bad temporal: {s}")))?;
322        if hour > 23 || minute > 59 || second > 59 {
323            return Err(NxsError::ParseError(format!("bad temporal: {s}")));
324        }
325        let frac_ns = if let Some(frac) = s.get(19..).and_then(|rest| rest.strip_prefix('.')) {
326            if frac.is_empty() || frac.len() > 9 || !frac.bytes().all(|b| b.is_ascii_digit()) {
327                return Err(NxsError::ParseError(format!("bad temporal: {s}")));
328            }
329            let mut padded = frac.to_string();
330            while padded.len() < 9 {
331                padded.push('0');
332            }
333            padded
334                .parse::<i64>()
335                .map_err(|_| NxsError::ParseError(format!("bad temporal: {s}")))?
336        } else if s.len() == 19 {
337            0
338        } else {
339            return Err(NxsError::ParseError(format!("bad temporal: {s}")));
340        };
341        return date_ns
342            .checked_add(hour * 3_600_000_000_000)
343            .and_then(|v| v.checked_add(minute * 60_000_000_000))
344            .and_then(|v| v.checked_add(second * 1_000_000_000))
345            .and_then(|v| v.checked_add(frac_ns))
346            .ok_or_else(|| NxsError::ParseError(format!("temporal overflow: {s}")));
347    }
348    // Support raw nanosecond integer
349    s.parse::<i64>()
350        .map_err(|_| NxsError::ParseError(format!("bad temporal: {s}")))
351}
352
/// Converts a calendar date to the number of days since 1970-01-01.
///
/// The date is first turned into a Julian Day Number, from which the Julian
/// Day Number of the Unix epoch is subtracted. Dates before the epoch give a
/// negative result. `month` and `day` are not range-checked here.
fn days_since_epoch(year: i64, month: i64, day: i64) -> i64 {
    // JDN of 1970-01-01
    const UNIX_EPOCH_JDN: i64 = 2_440_588;

    // For months 1..=12 this is 1 for January/February and 0 otherwise: the
    // formula counts years from March so the leap day falls at year end.
    let shift = (14 - month) / 12;
    let shifted_year = year + 4800 - shift;
    let shifted_month = month + 12 * shift - 3;

    let days_before_month = (153 * shifted_month + 2) / 5;
    let leap_days = shifted_year / 4 - shifted_year / 100 + shifted_year / 400;

    let jdn = day + days_before_month + 365 * shifted_year + leap_days - 32045;
    jdn - UNIX_EPOCH_JDN
}