// tealeaf/lexer.rs

//! Lexer for TeaLeaf text format

use crate::{Error, Result};

#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals
    Word(String),
    String(String),
    Bytes(Vec<u8>),
    Int(i64),
    UInt(u64),
    Float(f64),
    Bool(bool),
    Null,
    Timestamp(i64, i16),  // Unix milliseconds, timezone offset in minutes
    JsonNumber(String),  // Arbitrary-precision number (raw decimal string)

    // Punctuation
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    LParen,
    RParen,
    Colon,
    Comma,
    Eq,
    Question,  // For nullable types (e.g., string?)

    // Special
    Directive(String),
    Tag(String),
    Ref(String),

    Eof,
}

#[derive(Debug, Clone)]
pub struct Token {
    pub kind: TokenKind,
    pub line: usize,
    pub col: usize,
}

impl Token {
    pub fn new(kind: TokenKind, line: usize, col: usize) -> Self {
        Self { kind, line, col }
    }
}

pub struct Lexer<'a> {
    input: &'a str,
    pos: usize,
    line: usize,
    col: usize,
}

impl<'a> Lexer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            pos: 0,
            line: 1,
            col: 1,
        }
    }

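    /// Tokenize the whole input, always ending with an `Eof` token.
    ///
    /// Illustrative sketch of the output shape (added here for clarity, not
    /// part of the original docs; token kinds shown informally):
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new("count: 42");
    /// let tokens = lexer.tokenize().unwrap();
    /// // kinds: [Word("count"), Colon, Int(42), Eof]
    /// ```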
    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
        let mut tokens = Vec::new();
        loop {
            let tok = self.next_token()?;
            let is_eof = matches!(tok.kind, TokenKind::Eof);
            tokens.push(tok);
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }

    fn next_token(&mut self) -> Result<Token> {
        loop {
            self.skip_whitespace_and_comments();

            let line = self.line;
            let col = self.col;

            if self.pos >= self.input.len() {
                return Ok(Token::new(TokenKind::Eof, line, col));
            }

            let c = match self.current_char() {
                Some(c) => c,
                None => return Ok(Token::new(TokenKind::Eof, line, col)),
            };

            // Simple single-char tokens
            let simple = match c {
                '{' => Some(TokenKind::LBrace),
                '}' => Some(TokenKind::RBrace),
                '[' => Some(TokenKind::LBracket),
                ']' => Some(TokenKind::RBracket),
                '(' => Some(TokenKind::LParen),
                ')' => Some(TokenKind::RParen),
                ',' => Some(TokenKind::Comma),
                '=' => Some(TokenKind::Eq),
                '~' => Some(TokenKind::Null),
                '?' => Some(TokenKind::Question),
                _ => None,
            };

            if let Some(kind) = simple {
                self.advance();
                return Ok(Token::new(kind, line, col));
            }

            // Colon - might be a tag
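            // e.g. ":Circle" lexes as Tag("Circle"), while ": 5" yields a bare Colon.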
            if c == ':' {
                self.advance();
                if self.current_char().map(|c| c.is_alphabetic() || c == '_').unwrap_or(false) {
                    let word = self.read_word();
                    return Ok(Token::new(TokenKind::Tag(word), line, col));
                }
                return Ok(Token::new(TokenKind::Colon, line, col));
            }

            // Directive
            if c == '@' {
                self.advance();
                let word = self.read_word();
                return Ok(Token::new(TokenKind::Directive(word), line, col));
            }

            // Reference
            if c == '!' {
                self.advance();
                let word = self.read_word();
                return Ok(Token::new(TokenKind::Ref(word), line, col));
            }

            // Bytes literal: b"hex..."
            if c == 'b' && self.peek_char(1) == Some('"') {
                return self.read_bytes_literal(line, col);
            }

            // String
            if c == '"' {
                return self.read_string(line, col);
            }

            // Timestamp (must check before number - pattern: YYYY-MM-DD...)
            // Validate full date pattern with ASCII digits to prevent
            // parse_iso8601 from slicing into multi-byte characters.
            // Strictly 4-digit years per spec: date = digit{4} "-" digit{2} "-" digit{2}
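            // e.g. "2024-01-15" takes the timestamp path, while "20240115"
            // fails the dash checks below and lexes as a plain integer.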
            if c.is_ascii_digit() {
                let remaining = self.input[self.pos..].as_bytes();
                if remaining.len() >= 10
                   && remaining[0].is_ascii_digit()
                   && remaining[1].is_ascii_digit()
                   && remaining[2].is_ascii_digit()
                   && remaining[3].is_ascii_digit()
                   && remaining[4] == b'-'
                   && remaining[5].is_ascii_digit()
                   && remaining[6].is_ascii_digit()
                   && remaining[7] == b'-'
                   && remaining[8].is_ascii_digit()
                   && remaining[9].is_ascii_digit()
                {
                    return self.read_timestamp(line, col);
                }
            }

            // Negative infinity: -inf
            if c == '-' && self.input[self.pos..].starts_with("-inf") {
                // Make sure it's not a prefix of a longer word like "-info"
                let after = self.input.get(self.pos + 4..self.pos + 5)
                    .and_then(|s| s.chars().next());
                if after.map_or(true, |c| !c.is_alphanumeric() && c != '_') {
                    self.pos += 4;
                    self.col += 4;
                    return Ok(Token::new(TokenKind::Float(f64::NEG_INFINITY), line, col));
                }
            }

            // Number
            if c.is_ascii_digit() || (c == '-' && self.peek_char(1).map(|c| c.is_ascii_digit()).unwrap_or(false)) {
                return self.read_number(line, col);
            }

            // Word or keyword
            if c.is_alphabetic() || c == '_' {
                let word = self.read_word();
                let kind = match word.as_str() {
                    "true" => TokenKind::Bool(true),
                    "false" => TokenKind::Bool(false),
                    "NaN" => TokenKind::Float(f64::NAN),
                    "inf" => TokenKind::Float(f64::INFINITY),
                    _ => TokenKind::Word(word),
                };
                return Ok(Token::new(kind, line, col));
            }

            // Skip unknown character and loop to try next
            self.advance();
        }
    }

    fn current_char(&self) -> Option<char> {
        self.input[self.pos..].chars().next()
    }

    fn peek_char(&self, offset: usize) -> Option<char> {
        self.input[self.pos..].chars().nth(offset)
    }

    fn advance(&mut self) {
        if let Some(c) = self.current_char() {
            self.pos += c.len_utf8();
            if c == '\n' {
                self.line += 1;
                self.col = 1;
            } else {
                self.col += 1;
            }
        }
    }

    fn skip_whitespace_and_comments(&mut self) {
        while let Some(c) = self.current_char() {
            if c.is_whitespace() {
                self.advance();
            } else if c == '#' {
                // Skip comment to end of line
                while let Some(c) = self.current_char() {
                    if c == '\n' {
                        break;
                    }
                    self.advance();
                }
            } else {
                break;
            }
        }
    }

    fn read_word(&mut self) -> String {
        let start = self.pos;
        while let Some(c) = self.current_char() {
            if c.is_alphanumeric() || c == '_' || c == '-' || c == '.' {
                self.advance();
            } else {
                break;
            }
        }
        self.input[start..self.pos].to_string()
    }

    fn read_string(&mut self, line: usize, col: usize) -> Result<Token> {
        self.advance(); // Skip opening quote

        // Check for multiline
        if self.input[self.pos..].starts_with("\"\"") {
            self.advance();
            self.advance();
            return self.read_multiline_string(line, col);
        }

        let mut value = String::new();
        while let Some(c) = self.current_char() {
            if c == '"' {
                self.advance();
                return Ok(Token::new(TokenKind::String(value), line, col));
            } else if c == '\\' {
                self.advance();
                if let Some(escaped) = self.current_char() {
                    match escaped {
                        'n' => { value.push('\n'); self.advance(); }
                        't' => { value.push('\t'); self.advance(); }
                        'r' => { value.push('\r'); self.advance(); }
                        'b' => { value.push('\u{0008}'); self.advance(); }
                        'f' => { value.push('\u{000C}'); self.advance(); }
                        '"' => { value.push('"'); self.advance(); }
                        '\\' => { value.push('\\'); self.advance(); }
                        'u' => {
                            self.advance(); // skip 'u'
                            let start = self.pos;
                            let mut count = 0;
                            while count < 4 {
                                match self.current_char() {
                                    Some(c) if c.is_ascii_hexdigit() => {
                                        self.advance();
                                        count += 1;
                                    }
                                    _ => break,
                                }
                            }
                            if count != 4 {
                                return Err(Error::ParseError(
                                    "Invalid unicode escape: expected 4 hex digits after \\u".to_string()
                                ));
                            }
                            let hex = &self.input[start..self.pos];
                            let code = u32::from_str_radix(hex, 16).map_err(|_| {
                                Error::ParseError(format!("Invalid unicode escape: \\u{}", hex))
                            })?;
                            let ch = char::from_u32(code).ok_or_else(|| {
                                Error::ParseError(format!("Invalid unicode codepoint: U+{:04X}", code))
                            })?;
                            value.push(ch);
                        }
                        _ => {
                            return Err(Error::ParseError(
                                format!("Invalid escape sequence: \\{}", escaped)
                            ));
                        }
                    }
                }
            } else {
                value.push(c);
                self.advance();
            }
        }
        Err(Error::ParseError("Unterminated string".to_string()))
    }

    fn read_bytes_literal(&mut self, line: usize, col: usize) -> Result<Token> {
        self.advance(); // skip 'b'
        self.advance(); // skip '"'

        let mut hex = String::new();
        while let Some(c) = self.current_char() {
            if c == '"' {
                self.advance();
                if hex.len() % 2 != 0 {
                    return Err(Error::ParseError(
                        format!("Bytes literal has odd number of hex digits ({})", hex.len())
                    ));
                }
                let bytes = (0..hex.len())
                    .step_by(2)
                    .map(|i| u8::from_str_radix(&hex[i..i + 2], 16).map_err(|_|
                        Error::ParseError(format!("Invalid hex pair '{}' in bytes literal", &hex[i..i + 2]))
                    ))
                    .collect::<Result<Vec<u8>>>()?;
                return Ok(Token::new(TokenKind::Bytes(bytes), line, col));
            } else if c.is_ascii_hexdigit() {
                hex.push(c);
                self.advance();
            } else {
                return Err(Error::ParseError(
                    format!("Invalid character '{}' in bytes literal (expected hex digit or '\"')", c)
                ));
            }
        }
        Err(Error::ParseError("Unterminated bytes literal".to_string()))
    }

    fn read_multiline_string(&mut self, line: usize, col: usize) -> Result<Token> {
        let start = self.pos;
        while self.pos < self.input.len() {
            if self.input[self.pos..].starts_with("\"\"\"") {
                let raw = &self.input[start..self.pos];
                self.advance();
                self.advance();
                self.advance();

                // Dedent
                let lines: Vec<&str> = raw.lines().collect();
                let lines: Vec<&str> = if lines.len() > 1 && lines.first().map(|l| l.trim().is_empty()).unwrap_or(false) {
                    lines[1..].to_vec()
                } else {
                    lines
                };
                let lines: Vec<&str> = if lines.len() > 1 && lines.last().map(|l| l.trim().is_empty()).unwrap_or(false) {
                    lines[..lines.len() - 1].to_vec()
                } else {
                    lines
                };

                // Count indent in characters (not bytes) to safely handle
                // multi-byte whitespace like U+0085 (NEXT LINE, 2 bytes).
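                // e.g. lines ["  a", "    b"] have min_indent 2 and dedent to ["a", "  b"].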
                let min_indent = lines
                    .iter()
                    .filter(|l| !l.trim().is_empty())
                    .map(|l| l.chars().take_while(|c| c.is_whitespace()).count())
                    .min()
                    .unwrap_or(0);

                let dedented: Vec<&str> = lines
                    .iter()
                    .map(|l| {
                        // Find the byte offset after skipping min_indent characters
                        let byte_off: usize = l.chars().take(min_indent).map(|c| c.len_utf8()).sum();
                        if byte_off <= l.len() { &l[byte_off..] } else { *l }
                    })
                    .collect();

                return Ok(Token::new(TokenKind::String(dedented.join("\n")), line, col));
            }
            self.advance();
        }
        Err(Error::ParseError("Unterminated multiline string".to_string()))
    }

    fn read_timestamp(&mut self, line: usize, col: usize) -> Result<Token> {
        let start = self.pos;

        // Read YYYY-MM-DD (exactly 10 characters)
        for _ in 0..10 {
            self.advance();
        }

        // Check for time part: THH:MM:SS
        if self.current_char() == Some('T') {
            self.advance();
            // Read HH:MM:SS
            while let Some(c) = self.current_char() {
                if c.is_ascii_digit() || c == ':' {
                    self.advance();
                } else {
                    break;
                }
            }
            // Optional milliseconds .sss
            if self.current_char() == Some('.') {
                self.advance();
                while let Some(c) = self.current_char() {
                    if c.is_ascii_digit() {
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
            // Timezone: Z or +HH:MM or -HH:MM
            if self.current_char() == Some('Z') {
                self.advance();
            } else if self.current_char() == Some('+') || self.current_char() == Some('-') {
                self.advance();
                // Read HH:MM
                while let Some(c) = self.current_char() {
                    if c.is_ascii_digit() || c == ':' {
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
        }

        let timestamp_str = &self.input[start..self.pos];
        let (millis, tz_offset) = parse_iso8601(timestamp_str)
            .map_err(|_| Error::ParseError(format!("Invalid timestamp: {}", timestamp_str)))?;

        Ok(Token::new(TokenKind::Timestamp(millis, tz_offset), line, col))
    }

    fn read_number(&mut self, line: usize, col: usize) -> Result<Token> {
        let start = self.pos;

        // Handle negative
        if self.current_char() == Some('-') {
            self.advance();
        }

        // Hex
        if self.input[self.pos..].starts_with("0x") || self.input[self.pos..].starts_with("0X") {
            self.advance();
            self.advance();
            while let Some(c) = self.current_char() {
                if c.is_ascii_hexdigit() {
                    self.advance();
                } else {
                    break;
                }
            }
            let s = &self.input[start..self.pos];
            let val = if s.starts_with('-') {
                -(i64::from_str_radix(&s[3..], 16).map_err(|_| Error::ParseError(format!("Invalid hex: {}", s)))?)
            } else {
                i64::from_str_radix(&s[2..], 16).map_err(|_| Error::ParseError(format!("Invalid hex: {}", s)))?
            };
            return Ok(Token::new(TokenKind::Int(val), line, col));
        }

        // Binary
        if self.input[self.pos..].starts_with("0b") || self.input[self.pos..].starts_with("0B") {
            self.advance();
            self.advance();
            while let Some(c) = self.current_char() {
                if c == '0' || c == '1' {
                    self.advance();
                } else {
                    break;
                }
            }
            let s = &self.input[start..self.pos];
            let val = if s.starts_with('-') {
                -(i64::from_str_radix(&s[3..], 2).map_err(|_| Error::ParseError(format!("Invalid binary: {}", s)))?)
            } else {
                i64::from_str_radix(&s[2..], 2).map_err(|_| Error::ParseError(format!("Invalid binary: {}", s)))?
            };
            return Ok(Token::new(TokenKind::Int(val), line, col));
        }

        // Regular number
        let mut has_dot = false;
        let mut has_exp = false;
        while let Some(c) = self.current_char() {
            if c.is_ascii_digit() {
                self.advance();
            } else if c == '.' && !has_dot && !has_exp {
                has_dot = true;
                self.advance();
            } else if (c == 'e' || c == 'E') && !has_exp {
                has_exp = true;
                self.advance();
                if self.current_char() == Some('+') || self.current_char() == Some('-') {
                    self.advance();
                }
            } else {
                break;
            }
        }

        let s = &self.input[start..self.pos];
        if has_dot || has_exp {
            let val: f64 = s.parse().map_err(|_| Error::ParseError(format!("Invalid float: {}", s)))?;
            if val.is_finite() {
                Ok(Token::new(TokenKind::Float(val), line, col))
            } else {
                Ok(Token::new(TokenKind::JsonNumber(s.to_string()), line, col))
            }
        } else {
            // Try i64 first, then u64, then preserve as JsonNumber
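            // e.g. "42" → Int, "18446744073709551615" (u64::MAX) → UInt,
            // "99999999999999999999" (> u64::MAX) → JsonNumber.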
            match s.parse::<i64>() {
                Ok(val) => Ok(Token::new(TokenKind::Int(val), line, col)),
                Err(_) => match s.parse::<u64>() {
                    Ok(val) => Ok(Token::new(TokenKind::UInt(val), line, col)),
                    Err(_) => Ok(Token::new(TokenKind::JsonNumber(s.to_string()), line, col)),
                }
            }
        }
    }
}

/// Parse an ISO 8601 timestamp string to Unix milliseconds and timezone offset.
/// Strictly 4-digit years per spec: YYYY-MM-DD[THH:MM[:SS[.sss]][Z|+HH:MM|-HH:MM]]
/// Returns (unix_millis, tz_offset_minutes).
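///
/// Illustrative examples (added; the values follow from the epoch math below):
///
/// ```ignore
/// assert_eq!(parse_iso8601("1970-01-01"), Ok((0, 0))); // Unix epoch, UTC
/// assert_eq!(parse_iso8601("2024-01-15T10:30:00+05:30").unwrap().1, 330);
/// ```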
fn parse_iso8601(s: &str) -> std::result::Result<(i64, i16), ()> {
    // Safety: reject any non-ASCII input up front so that byte-position
    // slicing cannot split multi-byte characters.
    if !s.is_ascii() {
        return Err(());
    }

    if s.len() < 10 {
        return Err(());
    }

    let year: i64 = s[0..4].parse().map_err(|_| ())?;
    let month: u32 = s[5..7].parse().map_err(|_| ())?;
    let day: u32 = s[8..10].parse().map_err(|_| ())?;
    if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
        return Err(());
    }

    let time_start = 10;
    let (hour, minute, second, millis, tz_offset_minutes) = if s.len() > time_start && s.as_bytes()[time_start] == b'T' {
        let time_part = &s[time_start + 1..];
        let hour: u32 = time_part.get(0..2).ok_or(())?.parse().map_err(|_| ())?;
        let minute: u32 = time_part.get(3..5).ok_or(())?.parse().map_err(|_| ())?;

        // Determine whether seconds are present or timezone follows directly.
        // After HH:MM (positions 0-4), position 5 tells us:
        //   ':' → seconds at 6..8, rest starts at 8
        //   '+'/'-'/'Z' → no seconds, timezone starts at 5
        //   end of string → no seconds, no timezone
        let (second, rest_start) = if time_part.len() > 5 {
            match time_part.as_bytes()[5] {
                b':' => {
                    let sec: u32 = time_part.get(6..8).ok_or(())?.parse().map_err(|_| ())?;
                    (sec, 8usize)
                }
                b'+' | b'-' | b'Z' => (0u32, 5usize),
                _ => (0u32, time_part.len()),
            }
        } else {
            (0u32, time_part.len())
        };

        // Validate time component ranges
        if hour > 23 || minute > 59 || second > 59 {
            return Err(());
        }

        let mut millis = 0i64;
        let mut rest = &time_part[rest_start.min(time_part.len())..];

        // Parse milliseconds (only first 3 fractional digits matter)
        if rest.starts_with('.') && rest.len() > 1 {
            let end = rest[1..].find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len() - 1);
            if end == 0 {
                return Err(());
            }
            // Cap to 3 digits — we only need millisecond precision and
            // longer strings can overflow i64::pow (e.g. 22 digits → 10^19).
            let frac_digits = end.min(3);
            let ms_str = &rest[1..1 + frac_digits];
            millis = ms_str.parse::<i64>().unwrap_or(0);
            let digits = ms_str.len();
            if digits < 3 {
                millis *= 10i64.pow(3 - digits as u32);
            }
            rest = &rest[end + 1..];
        } else if rest.starts_with('.') {
            // Just a trailing dot with no digits — skip it
            rest = &rest[1..];
        }

        // Parse timezone
        let tz_offset = if rest.starts_with('Z') {
            0i32
        } else if rest.starts_with('+') || rest.starts_with('-') {
            let sign: i32 = if rest.starts_with('+') { 1 } else { -1 };
            let tz = &rest[1..];
            let tz_hour: i32 = tz.get(0..2).ok_or(())?.parse().map_err(|_| ())?;
            // Accept +HH:MM, +HHMM, or +HH (minutes default to 00)
            let tz_min: i32 = if tz.len() >= 4 && tz.as_bytes()[2] == b':' {
                tz.get(3..5).unwrap_or("00").parse().unwrap_or(0)   // +HH:MM
            } else if tz.len() >= 4 && tz.as_bytes()[2] != b':' {
                tz.get(2..4).unwrap_or("00").parse().unwrap_or(0)   // +HHMM
            } else {
                0                                                     // +HH
            };
            if tz_hour > 23 || tz_min > 59 {
                return Err(());
            }
            sign * (tz_hour * 60 + tz_min)
        } else {
            0 // Assume UTC if no timezone
        };

        (hour, minute, second, millis, tz_offset)
    } else {
        (0, 0, 0, 0, 0)
    };

    // Calculate Unix timestamp
    // Days from epoch (1970-01-01)
    let days = days_from_epoch(year, month, day);
    let seconds = days * 86400
        + hour as i64 * 3600
        + minute as i64 * 60
        + second as i64
        - tz_offset_minutes as i64 * 60;

    Ok((seconds * 1000 + millis, tz_offset_minutes as i16))
}

/// Calculate days from Unix epoch (1970-01-01)
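///
/// This mirrors the well-known "days from civil" algorithm (cf. Howard
/// Hinnant's date algorithms): the year is shifted so it starts in March and
/// leap days fall at the end, and 719468 is the day count from 0000-03-01 to
/// 1970-01-01.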
fn days_from_epoch(year: i64, month: u32, day: u32) -> i64 {
    let y = if month <= 2 { year - 1 } else { year };
    let m = if month <= 2 { month + 12 } else { month };
    let era = if y >= 0 { y } else { y - 399 } / 400;
    let yoe = (y - era * 400) as u32;
    let doy = (153 * (m - 3) + 2) / 5 + day - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    era * 146097 + doe as i64 - 719468
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokens() {
        let mut lexer = Lexer::new("{ } [ ] ( ) : , ~");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::LBrace));
        assert!(matches!(tokens[1].kind, TokenKind::RBrace));
        assert!(matches!(tokens[2].kind, TokenKind::LBracket));
        assert!(matches!(tokens[8].kind, TokenKind::Null));
    }

    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 -17 3.14 0xFF 0b1010");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(42)));
        assert!(matches!(tokens[1].kind, TokenKind::Int(-17)));
        assert!(matches!(tokens[2].kind, TokenKind::Float(f) if (f - 3.14).abs() < 0.001));
        assert!(matches!(tokens[3].kind, TokenKind::Int(255)));
        assert!(matches!(tokens[4].kind, TokenKind::Int(10)));
    }

    #[test]
    fn test_strings() {
        let mut lexer = Lexer::new(r#""hello" "world\n""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "hello"));
        assert!(matches!(&tokens[1].kind, TokenKind::String(s) if s == "world\n"));
    }

    #[test]
    fn test_directives() {
        let mut lexer = Lexer::new("@struct @table");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Directive(s) if s == "struct"));
        assert!(matches!(&tokens[1].kind, TokenKind::Directive(s) if s == "table"));
    }

    #[test]
    fn test_references() {
        let mut lexer = Lexer::new("!myref !another_ref");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Ref(s) if s == "myref"));
        assert!(matches!(&tokens[1].kind, TokenKind::Ref(s) if s == "another_ref"));
    }

    #[test]
    fn test_comments_and_references() {
        // # is always a comment
        let mut lexer = Lexer::new("value1 # this is a comment\nvalue2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(s) if s == "value1"));
        assert!(matches!(&tokens[1].kind, TokenKind::Word(s) if s == "value2"));
        assert!(matches!(tokens[2].kind, TokenKind::Eof));

        // ! is a reference
        let mut lexer = Lexer::new("value1 !ref value2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(s) if s == "value1"));
        assert!(matches!(&tokens[1].kind, TokenKind::Ref(s) if s == "ref"));
        assert!(matches!(&tokens[2].kind, TokenKind::Word(s) if s == "value2"));
    }

    // -------------------------------------------------------------------------
    // String escape sequences
    // -------------------------------------------------------------------------

    #[test]
    fn test_string_escape_tab() {
        let mut lexer = Lexer::new(r#""\t""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\t"));
    }

    #[test]
    fn test_string_escape_cr() {
        let mut lexer = Lexer::new(r#""\r""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\r"));
    }

    #[test]
    fn test_string_escape_backspace() {
        let mut lexer = Lexer::new(r#""\b""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\u{0008}"));
    }

    #[test]
    fn test_string_escape_formfeed() {
        let mut lexer = Lexer::new(r#""\f""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\u{000C}"));
    }

    #[test]
    fn test_string_escape_backslash() {
        let mut lexer = Lexer::new(r#""\\""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\\"));
    }

    #[test]
    fn test_string_escape_quote() {
        let mut lexer = Lexer::new(r#""\"hello\"""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\"hello\""));
    }

    #[test]
    fn test_string_escape_unicode() {
        let mut lexer = Lexer::new(r#""\u0041""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "A"));
    }

    #[test]
    fn test_string_escape_unicode_emoji_range() {
        // Heart suit: U+2665
        let mut lexer = Lexer::new(r#""\u2665""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\u{2665}"));
    }

    #[test]
    fn test_string_invalid_escape() {
        let mut lexer = Lexer::new(r#""\x""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Invalid escape sequence"));
    }

    #[test]
    fn test_string_invalid_unicode_short() {
        let mut lexer = Lexer::new(r#""\u00""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Invalid unicode escape"));
    }

    #[test]
    fn test_unterminated_string() {
        let mut lexer = Lexer::new(r#""hello"#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Unterminated string"));
    }

    // -------------------------------------------------------------------------
    // Multiline strings
    // -------------------------------------------------------------------------

    #[test]
    fn test_multiline_string() {
        let input = "\"\"\"
    hello
    world
\"\"\"";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s.contains("hello") && s.contains("world")));
    }

    #[test]
    fn test_unterminated_multiline_string() {
        let input = "\"\"\"
    hello world";
        let mut lexer = Lexer::new(input);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Unterminated multiline string"));
    }

    // -------------------------------------------------------------------------
    // Timestamps
    // -------------------------------------------------------------------------

    #[test]
    fn test_timestamp_basic() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00Z");
        let tokens = lexer.tokenize().unwrap();
        match &tokens[0].kind {
            TokenKind::Timestamp(ts, _tz) => {
                // 2024-01-15T10:30:00Z should be a valid timestamp
                assert!(*ts > 0);
            }
            other => panic!("Expected Timestamp, got {:?}", other),
        }
    }

    #[test]
    fn test_timestamp_with_millis() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00.123Z");
        let tokens = lexer.tokenize().unwrap();
        match &tokens[0].kind {
            TokenKind::Timestamp(ts, _tz) => {
                assert_eq!(*ts % 1000, 123); // milliseconds preserved
            }
            other => panic!("Expected Timestamp, got {:?}", other),
        }
    }

    #[test]
    fn test_timestamp_date_only() {
        let mut lexer = Lexer::new("2024-01-15");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Timestamp(_, _)));
    }

    #[test]
    fn test_timestamp_with_offset() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00+05:30");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind { assert_eq!(tz, 330); }
        else { panic!("expected timestamp"); }
    }

    #[test]
    fn test_timestamp_with_negative_offset() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00-08:00");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind { assert_eq!(tz, -480); }
        else { panic!("expected timestamp"); }
    }

    #[test]
    fn test_timestamp_offset_formats() {
        // +HH:MM (standard)
        let mut lexer = Lexer::new("2024-01-15T10:30:00+05:30");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind { assert_eq!(tz, 330); }
        else { panic!("expected timestamp"); }

        // +HHMM (compact, no colon)
        let mut lexer = Lexer::new("2024-01-15T10:30:00+0530");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind { assert_eq!(tz, 330); }
        else { panic!("expected timestamp for +HHMM"); }

        // +HH (hour-only, minutes default to 00)
        let mut lexer = Lexer::new("2024-01-15T10:30:00+05");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind { assert_eq!(tz, 300); }
        else { panic!("expected timestamp for +HH"); }
    }
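
    // Illustrative sanity check added alongside the originals: 1970-01-01 is
    // Unix day zero, so it must lex to Timestamp(0, 0).
    #[test]
    fn test_timestamp_unix_epoch() {
        let mut lexer = Lexer::new("1970-01-01");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(ms, tz) = tokens[0].kind { assert_eq!((ms, tz), (0, 0)); }
        else { panic!("expected timestamp"); }
    }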

    // -------------------------------------------------------------------------
    // Number edge cases
    // -------------------------------------------------------------------------

    #[test]
    fn test_scientific_notation() {
        let mut lexer = Lexer::new("1.5e10 2.3E-5 1e+3");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Float(f) if (f - 1.5e10).abs() < 1.0));
        assert!(matches!(tokens[1].kind, TokenKind::Float(f) if (f - 2.3e-5).abs() < 1e-10));
        assert!(matches!(tokens[2].kind, TokenKind::Float(f) if (f - 1e3).abs() < 1.0));
    }

    #[test]
    fn test_binary_literal() {
        let mut lexer = Lexer::new("0b1100 0B1010");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(12)));
        assert!(matches!(tokens[1].kind, TokenKind::Int(10)));
    }

    #[test]
    fn test_hex_uppercase() {
        let mut lexer = Lexer::new("0XDEAD");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(0xDEAD)));
    }

    #[test]
    fn test_negative_number() {
        let mut lexer = Lexer::new("-42 -3.14");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(-42)));
        assert!(matches!(tokens[1].kind, TokenKind::Float(f) if (f - (-3.14)).abs() < 0.001));
    }

    // -------------------------------------------------------------------------
    // Tags and special tokens
    // -------------------------------------------------------------------------

    #[test]
    fn test_tag_token() {
        let mut lexer = Lexer::new(":Circle {radius: 5.0}");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Tag(s) if s == "Circle"));
    }

    #[test]
    fn test_colon_without_word() {
        let mut lexer = Lexer::new(": 5");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Colon));
    }

    #[test]
    fn test_question_mark() {
        let mut lexer = Lexer::new("string?");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(s) if s == "string"));
        assert!(matches!(tokens[1].kind, TokenKind::Question));
    }

    #[test]
    fn test_equals_token() {
        let mut lexer = Lexer::new("x = 5");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::Eq));
    }

    #[test]
    fn test_bool_keywords() {
        let mut lexer = Lexer::new("true false");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Bool(true)));
        assert!(matches!(tokens[1].kind, TokenKind::Bool(false)));
    }

    #[test]
    fn test_empty_input() {
        let mut lexer = Lexer::new("");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].kind, TokenKind::Eof));
    }

    #[test]
    fn test_whitespace_only() {
        let mut lexer = Lexer::new("   \n\t  ");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].kind, TokenKind::Eof));
    }

    #[test]
    fn test_token_positions() {
        let mut lexer = Lexer::new("hello: 42");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[0].col, 1);
    }

    #[test]
    fn test_all_brackets() {
        let mut lexer = Lexer::new("() {} []");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::LParen));
        assert!(matches!(tokens[1].kind, TokenKind::RParen));
        assert!(matches!(tokens[2].kind, TokenKind::LBrace));
        assert!(matches!(tokens[3].kind, TokenKind::RBrace));
        assert!(matches!(tokens[4].kind, TokenKind::LBracket));
        assert!(matches!(tokens[5].kind, TokenKind::RBracket));
    }

    // -------------------------------------------------------------------------
    // Bytes literals
    // -------------------------------------------------------------------------

    #[test]
    fn test_bytes_literal_basic() {
        let mut lexer = Lexer::new(r#"b"48656c6c6f""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b == &[0x48, 0x65, 0x6c, 0x6c, 0x6f]));
    }

    #[test]
    fn test_bytes_literal_empty() {
        let mut lexer = Lexer::new(r#"b"""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b.is_empty()));
    }

    #[test]
    fn test_bytes_literal_uppercase() {
        let mut lexer = Lexer::new(r#"b"CAFEF00D""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b == &[0xca, 0xfe, 0xf0, 0x0d]));
    }

    #[test]
    fn test_bytes_literal_mixed_case() {
        let mut lexer = Lexer::new(r#"b"CaFe""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b == &[0xca, 0xfe]));
    }

    #[test]
    fn test_bytes_literal_odd_length_error() {
        let mut lexer = Lexer::new(r#"b"abc""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("odd number of hex digits"), "Error: {}", err);
    }

    #[test]
    fn test_bytes_literal_invalid_char_error() {
        let mut lexer = Lexer::new(r#"b"xyz""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Invalid character"), "Error: {}", err);
    }

    #[test]
    fn test_bytes_literal_unterminated_error() {
        let mut lexer = Lexer::new(r#"b"cafe"#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Unterminated bytes literal"), "Error: {}", err);
    }

    #[test]
    fn test_bytes_literal_does_not_conflict_with_word() {
        // "bar" should parse as a word, not a bytes literal
        let mut lexer = Lexer::new("bar baz");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(w) if w == "bar"));
        assert!(matches!(&tokens[1].kind, TokenKind::Word(w) if w == "baz"));
    }

    // -------------------------------------------------------------------------
    // Fuzz regression tests
    // -------------------------------------------------------------------------

    #[test]
    fn test_fuzz_crash_unknown_chars_no_stack_overflow() {
        // Regression: fuzz_parse crash-e42e7ae2f5127519e7e60e87d1cbfbc2a5bf878d
        // Many consecutive unknown Unicode characters caused stack overflow
        // via recursive next_token() calls.
        let input = "\"0B\u{10}\u{3}#\"0BP\u{07FE}-----\u{061D}\u{07FE}\u{07FE}-----\u{061D}\u{3}#\"0B\u{10}\u{3}#\"0BP\u{07FE}-----\u{061D}\u{07FE}\u{07FE}-----\u{061D}\u{07FE}";
        let mut lexer = Lexer::new(input);
        // Should not stack overflow — may return Ok or Err, but must not crash
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_crash_timestamp_non_ascii_date() {
        // Regression: fuzz_parse crash-e5a60511db30059b55e7d7215b710fc36ec75dfb
        // Input "3313-32-$Ң..." matched timestamp heuristic at positions 4,7
        // but non-ASCII chars at positions 8-9 caused parse_iso8601 to panic
        // on byte slice `s[8..10]` cutting through multi-byte character Ң.
        let input = "02)3313-32-$\u{04A2}\u{1}\0\05";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_crash_backslash_timestamp_non_ascii() {
        // Regression: fuzz_parse crash-785c8b3fbc203fc7279523e1eb5c57b2341de7ea
        // Backslashes + date pattern with non-ASCII Ԭ chars in date positions
        let input = "\\\\\u{1}\0\0\n\\\\\\\\\\\\)3313-32-\\\u{052D}\u{052D}:{Y:{Y\\\\\\\\\\\\\\\\\\\\\\3m\u{00AC}m\u{00C2}5\0\05";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_crash_large_repeated_date_pattern() {
        // Regression: fuzz_parse crash-8684aafa13348eaeacbbd9a69ae6e02a57bc681e
        // 645-byte input with repeated date-like "3313-333-3332)" patterns
        // and non-ASCII chars interspersed. Must not panic.
        let input = "\"18]\")\"\"\" ]\t;=1] ]  3333-333-3332)3313-33--33331333-333313T33302)3313-333-3333)3313-333-333-3332)33-133-3-333313;-3333)3333313T33302)3313-333-3333)3313-33332)33-3333)3333313T33302)3313-333-3333)3313-333-333-323)33-\t\n\t313T33302)3333-333-3332)3313-33--33331333-333313T33302)";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_parse_iso8601_non_ascii_rejected() {
        // Verify parse_iso8601 rejects non-ASCII input gracefully
        assert!(parse_iso8601("2024-01-15T10:30:00Z").is_ok());
        assert!(parse_iso8601("3313-32-$\u{04A2}").is_err());
        assert!(parse_iso8601("2024-01-\u{052D}5").is_err());
        assert!(parse_iso8601("").is_err());
        assert!(parse_iso8601("short").is_err());
        // Month/day zero must be rejected (day-1 underflows u32 in days_from_epoch)
        assert!(parse_iso8601("2024-00-15T10:30:00Z").is_err());
        assert!(parse_iso8601("2024-01-00T10:30:00Z").is_err());
        assert!(parse_iso8601("2024-13-15T10:30:00Z").is_err());
        assert!(parse_iso8601("2024-00-00T10:30:00Z").is_err());
    }

    #[test]
    fn test_fuzz_timestamp_trailing_dot() {
        // Timestamp ending with just a dot and no fractional digits
        // Should return an error (not panic) since ".Z" has no digits after dot
        let mut lexer = Lexer::new("2024-01-15T10:30:00.Z");
        let result = lexer.tokenize();
        assert!(result.is_err());
    }

    #[test]
    fn test_fuzz_crash_timestamp_long_fractional_no_overflow() {
        // Regression: fuzz_parse crash-bc25426e70a60ec5649726a4aa65e9f6776c90fb
        // Timestamp with 22 fractional digits caused 10i64.pow(19) overflow.
        // parse_iso8601 now caps fractional parsing to 3 digits.
        // Bogus dates must not panic (month 32 is simply rejected by range validation):
        let _ = parse_iso8601("3230-32-33T33016656.6563311111111111111112");
        // Valid timestamp with many fractional digits should not overflow
        let result = parse_iso8601("2024-01-15T10:30:00.123456789012345678901234567890Z");
        assert!(result.is_ok());
        // Should parse as 123 ms (first 3 digits only)
        assert_eq!(result.unwrap().0 % 1000, 123);
    }

    #[test]
    fn test_fuzz_crash_bc25426e_full_parse_no_panic() {
        // Regression: crash-bc25426e — must not panic through TeaLeaf::parse
        let input = "\x00\x00\x00\x00\x00\x00\x00O\x00\x00\x00\x00\x00\x00\x00\x00\x0030-3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x003232,\x00\x00\x001\x00\x00O\x00\x00\x00\x00\x00\x00\x00\x00\x0030-3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x003232,\x00\x00\x00111111112\x00\n\x00\x00\x00\x00\x00\x003,3230-32-33T33016656.6563311111111111111112\x00\n\x00\x00\x00\x00\x00\x003,3230-32-33T33016656.65633111111111113323!:g";
        let _ = crate::TeaLeaf::parse(input); // Must not panic
    }

    #[test]
    fn test_fuzz_crash_multiline_multibyte_whitespace_dedent() {
        // Regression: fuzz_parse crash-834ac7a271d94cf87372e9a91a9137e81ff9316a
        // Multiline string with mixed whitespace: \u{0B} (1 byte) and \u{0085} (2 bytes).
        // Old byte-based dedent sliced at byte offset 1 into the 2-byte U+0085,
        // panicking on invalid character boundary.
        let input = "*\0\"\"\"\u{0B}J\n\n\n\u{0085}\u{0B}J\n\n\n\n\n\n\n\n\"\"\" \0\n\n\n\n\n\"\"\" \0\0";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize(); // Must not panic
    }

    #[test]
    fn test_multiline_string_multibyte_indent() {
        // Verify dedent works correctly with multi-byte whitespace characters
        // Both lines have 1 whitespace character of indent, but different byte widths
        let input = "\"\"\"\n\u{0085}A\n\u{0B}B\n\"\"\"";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize().unwrap();
        match &tokens[0].kind {
            TokenKind::String(s) => {
                assert_eq!(s, "A\nB", "Both lines should be dedented by 1 character");
            }
            other => panic!("Expected String, got {:?}", other),
        }
    }

    #[test]
    fn test_many_unknown_chars_no_stack_overflow() {
        // Thousands of consecutive unknown characters should not stack overflow
        let input: String = std::iter::repeat('\u{07FE}').take(10_000).collect();
        let mut lexer = Lexer::new(&input);
        let tokens = lexer.tokenize().unwrap();
        // All unknown chars skipped, only Eof remains
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].kind, TokenKind::Eof));
1218    }
1219}