use crate::{Error, Result};

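/// The kinds of tokens the lexer produces.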
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    Word(String),
    String(String),
    Bytes(Vec<u8>),
    Int(i64),
    UInt(u64),
    Float(f64),
    Bool(bool),
    Null,
    /// Milliseconds since the Unix epoch, with the timezone offset in minutes.
    Timestamp(i64, i16),
    /// A number that fits neither `i64`, `u64`, nor a finite `f64`; kept as source text.
    JsonNumber(String),
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    LParen,
    RParen,
    Colon,
    Comma,
    Eq,
    Question,
    /// `@name`
    Directive(String),
    /// `!name`
    Ref(String),

    Eof,
}

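/// A token plus the 1-based line and column where it starts.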
#[derive(Debug, Clone)]
pub struct Token {
    pub kind: TokenKind,
    pub line: usize,
    pub col: usize,
}

impl Token {
    pub fn new(kind: TokenKind, line: usize, col: usize) -> Self {
        Self { kind, line, col }
    }
}

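/// A lexer over an input string, tracking byte offset and line/column.
///
/// ```ignore
/// // Sketch of typical use (assumes this module is in scope):
/// let mut lexer = Lexer::new("point: {x: 1, y: 2}");
/// let tokens = lexer.tokenize()?;
/// ```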
pub struct Lexer<'a> {
    input: &'a str,
    pos: usize,
    line: usize,
    col: usize,
}

impl<'a> Lexer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            pos: 0,
            line: 1,
            col: 1,
        }
    }

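    /// Tokenizes the entire input, ending with a single `Eof` token.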
    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
        let mut tokens = Vec::new();
        loop {
            let tok = self.next_token()?;
            let is_eof = matches!(tok.kind, TokenKind::Eof);
            tokens.push(tok);
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }

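    /// Scans one token, skipping whitespace, `#` comments, and any
    /// characters it does not recognize.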
    fn next_token(&mut self) -> Result<Token> {
        loop {
            self.skip_whitespace_and_comments();

            let line = self.line;
            let col = self.col;

            if self.pos >= self.input.len() {
                return Ok(Token::new(TokenKind::Eof, line, col));
            }

            let c = match self.current_char() {
                Some(c) => c,
                None => return Ok(Token::new(TokenKind::Eof, line, col)),
            };

            let simple = match c {
                '{' => Some(TokenKind::LBrace),
                '}' => Some(TokenKind::RBrace),
                '[' => Some(TokenKind::LBracket),
                ']' => Some(TokenKind::RBracket),
                '(' => Some(TokenKind::LParen),
                ')' => Some(TokenKind::RParen),
                ',' => Some(TokenKind::Comma),
                '=' => Some(TokenKind::Eq),
                '~' => Some(TokenKind::Null),
                '?' => Some(TokenKind::Question),
                ':' => Some(TokenKind::Colon),
                _ => None,
            };

            if let Some(kind) = simple {
                self.advance();
                return Ok(Token::new(kind, line, col));
            }

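            // `@name` lexes as a directive.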
            if c == '@' {
                self.advance();
                let word = self.read_word();
                return Ok(Token::new(TokenKind::Directive(word), line, col));
            }

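            // `!name` lexes as a reference.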
            if c == '!' {
                self.advance();
                let word = self.read_word();
                return Ok(Token::new(TokenKind::Ref(word), line, col));
            }

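            // `b"…"` is a bytes literal; a bare `b` still lexes as a word.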
            if c == 'b' && self.peek_char(1) == Some('"') {
                return self.read_bytes_literal(line, col);
            }

            if c == '"' {
                return self.read_string(line, col);
            }

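            // A leading YYYY-MM-DD shape means a timestamp, not a number.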
            if c.is_ascii_digit() {
                let remaining = self.input[self.pos..].as_bytes();
                if remaining.len() >= 10
                    && remaining[0].is_ascii_digit()
                    && remaining[1].is_ascii_digit()
                    && remaining[2].is_ascii_digit()
                    && remaining[3].is_ascii_digit()
                    && remaining[4] == b'-'
                    && remaining[5].is_ascii_digit()
                    && remaining[6].is_ascii_digit()
                    && remaining[7] == b'-'
                    && remaining[8].is_ascii_digit()
                    && remaining[9].is_ascii_digit()
                {
                    return self.read_timestamp(line, col);
                }
            }

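            // `-inf` is negative infinity unless an identifier character follows.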
            if c == '-' && self.input[self.pos..].starts_with("-inf") {
                // `-inf` is ASCII, so `pos + 4` is always a char boundary.
                let after = self.input[self.pos + 4..].chars().next();
                if after.map_or(true, |c| !c.is_alphanumeric() && c != '_') {
                    self.pos += 4;
                    self.col += 4;
                    return Ok(Token::new(TokenKind::Float(f64::NEG_INFINITY), line, col));
                }
            }

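            // Decimal, hex, binary, and float literals.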
            if c.is_ascii_digit()
                || (c == '-' && self.peek_char(1).map(|c| c.is_ascii_digit()).unwrap_or(false))
            {
                return self.read_number(line, col);
            }

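            // Bare words; `true`, `false`, `NaN`, and `inf` are keywords.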
            if c.is_alphabetic() || c == '_' {
                let word = self.read_word();
                let kind = match word.as_str() {
                    "true" => TokenKind::Bool(true),
                    "false" => TokenKind::Bool(false),
                    "NaN" => TokenKind::Float(f64::NAN),
                    "inf" => TokenKind::Float(f64::INFINITY),
                    _ => TokenKind::Word(word),
                };
                return Ok(Token::new(kind, line, col));
            }

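            // Unknown character: skip it and rescan rather than erroring.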
            self.advance();
        }
    }

    fn current_char(&self) -> Option<char> {
        self.input[self.pos..].chars().next()
    }

    fn peek_char(&self, offset: usize) -> Option<char> {
        self.input[self.pos..].chars().nth(offset)
    }

    fn advance(&mut self) {
        if let Some(c) = self.current_char() {
            self.pos += c.len_utf8();
            if c == '\n' {
                self.line += 1;
                self.col = 1;
            } else {
                self.col += 1;
            }
        }
    }

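    /// Skips whitespace and `#` line comments.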
    fn skip_whitespace_and_comments(&mut self) {
        while let Some(c) = self.current_char() {
            if c.is_whitespace() {
                self.advance();
            } else if c == '#' {
                while let Some(c) = self.current_char() {
                    if c == '\n' {
                        break;
                    }
                    self.advance();
                }
            } else {
                break;
            }
        }
    }

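    /// Reads a word: alphanumerics plus `_`, `-`, and `.`.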
    fn read_word(&mut self) -> String {
        let start = self.pos;
        while let Some(c) = self.current_char() {
            if c.is_alphanumeric() || c == '_' || c == '-' || c == '.' {
                self.advance();
            } else {
                break;
            }
        }
        self.input[start..self.pos].to_string()
    }

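    /// Reads a quoted string; the opening `"` has not been consumed yet.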
    fn read_string(&mut self, line: usize, col: usize) -> Result<Token> {
        self.advance(); // consume the opening quote
        if self.input[self.pos..].starts_with("\"\"") {
            self.advance();
            self.advance();
            return self.read_multiline_string(line, col);
        }

        let mut value = String::new();
        while let Some(c) = self.current_char() {
            if c == '"' {
                self.advance();
                return Ok(Token::new(TokenKind::String(value), line, col));
            } else if c == '\\' {
                self.advance();
                if let Some(escaped) = self.current_char() {
                    match escaped {
                        'n' => { value.push('\n'); self.advance(); }
                        't' => { value.push('\t'); self.advance(); }
                        'r' => { value.push('\r'); self.advance(); }
                        'b' => { value.push('\u{0008}'); self.advance(); }
                        'f' => { value.push('\u{000C}'); self.advance(); }
                        '"' => { value.push('"'); self.advance(); }
                        '\\' => { value.push('\\'); self.advance(); }
                        'u' => {
                            self.advance(); // consume 'u'
                            let start = self.pos;
                            let mut count = 0;
                            while count < 4 {
                                match self.current_char() {
                                    Some(c) if c.is_ascii_hexdigit() => {
                                        self.advance();
                                        count += 1;
                                    }
                                    _ => break,
                                }
                            }
                            if count != 4 {
                                return Err(Error::ParseError(
                                    "Invalid unicode escape: expected 4 hex digits after \\u".to_string()
                                ));
                            }
                            let hex = &self.input[start..self.pos];
                            let code = u32::from_str_radix(hex, 16).map_err(|_| {
                                Error::ParseError(format!("Invalid unicode escape: \\u{}", hex))
                            })?;
                            let ch = char::from_u32(code).ok_or_else(|| {
                                Error::ParseError(format!("Invalid unicode codepoint: U+{:04X}", code))
                            })?;
                            value.push(ch);
                        }
                        _ => {
                            return Err(Error::ParseError(
                                format!("Invalid escape sequence: \\{}", escaped)
                            ));
                        }
                    }
                }
            } else {
                value.push(c);
                self.advance();
            }
        }
        Err(Error::ParseError("Unterminated string".to_string()))
    }

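    /// Reads a `b"…"` hex literal; the leading `b` has not been consumed yet.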
    fn read_bytes_literal(&mut self, line: usize, col: usize) -> Result<Token> {
        self.advance(); // consume 'b'
        self.advance(); // consume the opening quote
        let mut hex = String::new();
        while let Some(c) = self.current_char() {
            if c == '"' {
                self.advance();
                if hex.len() % 2 != 0 {
                    return Err(Error::ParseError(
                        format!("Bytes literal has odd number of hex digits ({})", hex.len())
                    ));
                }
                let bytes = (0..hex.len())
                    .step_by(2)
                    .map(|i| u8::from_str_radix(&hex[i..i + 2], 16).map_err(|_|
                        Error::ParseError(format!("Invalid hex pair '{}' in bytes literal", &hex[i..i + 2]))
                    ))
                    .collect::<Result<Vec<u8>>>()?;
                return Ok(Token::new(TokenKind::Bytes(bytes), line, col));
            } else if c.is_ascii_hexdigit() {
                hex.push(c);
                self.advance();
            } else {
                return Err(Error::ParseError(
                    format!("Invalid character '{}' in bytes literal (expected hex digit or '\"')", c)
                ));
            }
        }
        Err(Error::ParseError("Unterminated bytes literal".to_string()))
    }

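    /// Reads the body of a `"""…"""` string, dropping delimiter-adjacent
    /// blank lines and stripping the common indentation.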
    fn read_multiline_string(&mut self, line: usize, col: usize) -> Result<Token> {
        let start = self.pos;
        while self.pos < self.input.len() {
            if self.input[self.pos..].starts_with("\"\"\"") {
                let raw = &self.input[start..self.pos];
                self.advance();
                self.advance();
                self.advance();

                // Drop a blank first or last line adjacent to the delimiters.
                let lines: Vec<&str> = raw.lines().collect();
                let lines: Vec<&str> = if lines.len() > 1 && lines.first().map(|l| l.trim().is_empty()).unwrap_or(false) {
                    lines[1..].to_vec()
                } else {
                    lines
                };
                let lines: Vec<&str> = if lines.len() > 1 && lines.last().map(|l| l.trim().is_empty()).unwrap_or(false) {
                    lines[..lines.len() - 1].to_vec()
                } else {
                    lines
                };

                // Smallest indent across non-blank lines, counted in chars.
                let min_indent = lines
                    .iter()
                    .filter(|l| !l.trim().is_empty())
                    .map(|l| l.chars().take_while(|c| c.is_whitespace()).count())
                    .min()
                    .unwrap_or(0);

                let dedented: Vec<&str> = lines
                    .iter()
                    .map(|l| {
                        // Convert the char count to a byte offset so multibyte
                        // whitespace cannot split a UTF-8 boundary.
                        let byte_off: usize = l.chars().take(min_indent).map(|c| c.len_utf8()).sum();
                        if byte_off <= l.len() { &l[byte_off..] } else { *l }
                    })
                    .collect();

                return Ok(Token::new(TokenKind::String(dedented.join("\n")), line, col));
            }
            self.advance();
        }
        Err(Error::ParseError("Unterminated multiline string".to_string()))
    }

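    /// Reads a timestamp starting at a `YYYY-MM-DD` date prefix.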
    fn read_timestamp(&mut self, line: usize, col: usize) -> Result<Token> {
        let start = self.pos;

        // Consume the 10-character date: YYYY-MM-DD.
        for _ in 0..10 {
            self.advance();
        }

        // Optional time part: Thh:mm[:ss][.fff][Z|±hh[:mm]].
        if self.current_char() == Some('T') {
            self.advance();
            while let Some(c) = self.current_char() {
                if c.is_ascii_digit() || c == ':' {
                    self.advance();
                } else {
                    break;
                }
            }
            if self.current_char() == Some('.') {
                self.advance();
                while let Some(c) = self.current_char() {
                    if c.is_ascii_digit() {
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
            if self.current_char() == Some('Z') {
                self.advance();
            } else if self.current_char() == Some('+') || self.current_char() == Some('-') {
                self.advance();
                while let Some(c) = self.current_char() {
                    if c.is_ascii_digit() || c == ':' {
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
        }

        let timestamp_str = &self.input[start..self.pos];
        let (millis, tz_offset) = parse_iso8601(timestamp_str)
            .map_err(|_| Error::ParseError(format!("Invalid timestamp: {}", timestamp_str)))?;

        Ok(Token::new(TokenKind::Timestamp(millis, tz_offset), line, col))
    }

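    /// Reads a decimal, hex (`0x`), or binary (`0b`) integer, or a float.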
    fn read_number(&mut self, line: usize, col: usize) -> Result<Token> {
        let start = self.pos;

        if self.current_char() == Some('-') {
            self.advance();
        }

        if self.input[self.pos..].starts_with("0x") || self.input[self.pos..].starts_with("0X") {
            self.advance();
            self.advance();
            while let Some(c) = self.current_char() {
                if c.is_ascii_hexdigit() {
                    self.advance();
                } else {
                    break;
                }
            }
            let s = &self.input[start..self.pos];
            let val = if s.starts_with('-') {
                -(i64::from_str_radix(&s[3..], 16).map_err(|_| Error::ParseError(format!("Invalid hex: {}", s)))?)
            } else {
                i64::from_str_radix(&s[2..], 16).map_err(|_| Error::ParseError(format!("Invalid hex: {}", s)))?
            };
            return Ok(Token::new(TokenKind::Int(val), line, col));
        }

        if self.input[self.pos..].starts_with("0b") || self.input[self.pos..].starts_with("0B") {
            self.advance();
            self.advance();
            while let Some(c) = self.current_char() {
                if c == '0' || c == '1' {
                    self.advance();
                } else {
                    break;
                }
            }
            let s = &self.input[start..self.pos];
            let val = if s.starts_with('-') {
                -(i64::from_str_radix(&s[3..], 2).map_err(|_| Error::ParseError(format!("Invalid binary: {}", s)))?)
            } else {
                i64::from_str_radix(&s[2..], 2).map_err(|_| Error::ParseError(format!("Invalid binary: {}", s)))?
            };
            return Ok(Token::new(TokenKind::Int(val), line, col));
        }

        let mut has_dot = false;
        let mut has_exp = false;
        while let Some(c) = self.current_char() {
            if c.is_ascii_digit() {
                self.advance();
            } else if c == '.' && !has_dot && !has_exp {
                has_dot = true;
                self.advance();
            } else if (c == 'e' || c == 'E') && !has_exp {
                has_exp = true;
                self.advance();
                if self.current_char() == Some('+') || self.current_char() == Some('-') {
                    self.advance();
                }
            } else {
                break;
            }
        }

        let s = &self.input[start..self.pos];
        if has_dot || has_exp {
            let val: f64 = s.parse().map_err(|_| Error::ParseError(format!("Invalid float: {}", s)))?;
            if val.is_finite() {
                Ok(Token::new(TokenKind::Float(val), line, col))
            } else {
                Ok(Token::new(TokenKind::JsonNumber(s.to_string()), line, col))
            }
        } else {
            // Prefer i64, fall back to u64, then to the raw text.
            match s.parse::<i64>() {
                Ok(val) => Ok(Token::new(TokenKind::Int(val), line, col)),
                Err(_) => match s.parse::<u64>() {
                    Ok(val) => Ok(Token::new(TokenKind::UInt(val), line, col)),
                    Err(_) => Ok(Token::new(TokenKind::JsonNumber(s.to_string()), line, col)),
                },
            }
        }
    }
}

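/// Parses a subset of ISO 8601 into `(millis_since_epoch, tz_offset_minutes)`
/// without pulling in a date/time crate. Accepts `YYYY-MM-DD`, optionally
/// followed by `Thh:mm[:ss][.fff]` and a `Z` or `±hh[:mm]` offset.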
fn parse_iso8601(s: &str) -> std::result::Result<(i64, i16), ()> {
    // Byte indexing below is only safe on ASCII input (fuzz regression).
    if !s.is_ascii() {
        return Err(());
    }

    if s.len() < 10 {
        return Err(());
    }

    if s.as_bytes()[4] != b'-' || s.as_bytes()[7] != b'-' {
        return Err(());
    }
    let year: i64 = s[0..4].parse().map_err(|_| ())?;
    let month: u32 = s[5..7].parse().map_err(|_| ())?;
    let day: u32 = s[8..10].parse().map_err(|_| ())?;
    if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
        return Err(());
    }

    let time_start = 10;
    let (hour, minute, second, millis, tz_offset_minutes) = if s.len() > time_start && s.as_bytes()[time_start] == b'T' {
        let time_part = &s[time_start + 1..];
        let hour: u32 = time_part.get(0..2).ok_or(())?.parse().map_err(|_| ())?;
        if time_part.as_bytes().get(2) != Some(&b':') {
            return Err(());
        }
        let minute: u32 = time_part.get(3..5).ok_or(())?.parse().map_err(|_| ())?;

        // Seconds are optional; the time may end at minutes or go straight
        // to a timezone designator.
        let (second, rest_start) = if time_part.len() > 5 {
            match time_part.as_bytes()[5] {
                b':' => {
                    let sec: u32 = time_part.get(6..8).ok_or(())?.parse().map_err(|_| ())?;
                    (sec, 8usize)
                }
                b'+' | b'-' | b'Z' => (0u32, 5usize),
                _ => (0u32, time_part.len()),
            }
        } else {
            (0u32, time_part.len())
        };

        if hour > 23 || minute > 59 || second > 59 {
            return Err(());
        }

        let mut millis = 0i64;
        let mut rest = &time_part[rest_start.min(time_part.len())..];

        // Fractional seconds: keep at most millisecond precision.
        if rest.starts_with('.') && rest.len() > 1 {
            let end = rest[1..].find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len() - 1);
            if end == 0 {
                return Err(());
            }
            let frac_digits = end.min(3);
            let ms_str = &rest[1..1 + frac_digits];
            millis = ms_str.parse::<i64>().unwrap_or(0);
            let digits = ms_str.len();
            if digits < 3 {
                millis *= 10i64.pow(3 - digits as u32);
            }
            rest = &rest[end + 1..];
        } else if rest.starts_with('.') {
            rest = &rest[1..];
        }

        // Timezone: `Z`, `±hh:mm`, `±hhmm`, or `±hh`; missing means UTC.
        let tz_offset = if rest.starts_with('Z') {
            0i32
        } else if rest.starts_with('+') || rest.starts_with('-') {
            let sign: i32 = if rest.starts_with('+') { 1 } else { -1 };
            let tz = &rest[1..];
            let tz_hour: i32 = tz.get(0..2).ok_or(())?.parse().map_err(|_| ())?;
            let tz_min: i32 = if tz.len() >= 4 && tz.as_bytes()[2] == b':' {
                tz.get(3..5).unwrap_or("00").parse().unwrap_or(0)
            } else if tz.len() >= 4 && tz.as_bytes()[2] != b':' {
                tz.get(2..4).unwrap_or("00").parse().unwrap_or(0)
            } else {
                0
            };
            if tz_hour > 23 || tz_min > 59 {
                return Err(());
            }
            sign * (tz_hour * 60 + tz_min)
        } else {
            0
        };

        (hour, minute, second, millis, tz_offset)
    } else {
        (0, 0, 0, 0, 0)
    };

    let days = days_from_epoch(year, month, day);
    let seconds = days * 86400
        + hour as i64 * 3600
        + minute as i64 * 60
        + second as i64
        - tz_offset_minutes as i64 * 60;

    Ok((seconds * 1000 + millis, tz_offset_minutes as i16))
}

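/// Days since 1970-01-01 for a civil date, via Howard Hinnant's
/// `days_from_civil` algorithm.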
fn days_from_epoch(year: i64, month: u32, day: u32) -> i64 {
    // Treat the year as starting in March so the leap day falls last.
    let y = if month <= 2 { year - 1 } else { year };
    let m = if month <= 2 { month + 12 } else { month };
    let era = if y >= 0 { y } else { y - 399 } / 400;
    let yoe = (y - era * 400) as u32;
    let doy = (153 * (m - 3) + 2) / 5 + day - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    era * 146097 + doe as i64 - 719468
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokens() {
        let mut lexer = Lexer::new("{ } [ ] ( ) : , ~");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::LBrace));
        assert!(matches!(tokens[1].kind, TokenKind::RBrace));
        assert!(matches!(tokens[2].kind, TokenKind::LBracket));
        assert!(matches!(tokens[8].kind, TokenKind::Null));
    }

    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 -17 3.14 0xFF 0b1010");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(42)));
        assert!(matches!(tokens[1].kind, TokenKind::Int(-17)));
        assert!(matches!(tokens[2].kind, TokenKind::Float(f) if (f - 3.14).abs() < 0.001));
        assert!(matches!(tokens[3].kind, TokenKind::Int(255)));
        assert!(matches!(tokens[4].kind, TokenKind::Int(10)));
    }

    #[test]
    fn test_strings() {
        let mut lexer = Lexer::new(r#""hello" "world\n""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "hello"));
        assert!(matches!(&tokens[1].kind, TokenKind::String(s) if s == "world\n"));
    }

    #[test]
    fn test_directives() {
        let mut lexer = Lexer::new("@struct @table");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Directive(s) if s == "struct"));
        assert!(matches!(&tokens[1].kind, TokenKind::Directive(s) if s == "table"));
    }

    #[test]
    fn test_references() {
        let mut lexer = Lexer::new("!myref !another_ref");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Ref(s) if s == "myref"));
        assert!(matches!(&tokens[1].kind, TokenKind::Ref(s) if s == "another_ref"));
    }

    #[test]
    fn test_comments_and_references() {
        let mut lexer = Lexer::new("value1 # this is a comment\nvalue2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(s) if s == "value1"));
        assert!(matches!(&tokens[1].kind, TokenKind::Word(s) if s == "value2"));
        assert!(matches!(tokens[2].kind, TokenKind::Eof));

        let mut lexer = Lexer::new("value1 !ref value2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(s) if s == "value1"));
        assert!(matches!(&tokens[1].kind, TokenKind::Ref(s) if s == "ref"));
        assert!(matches!(&tokens[2].kind, TokenKind::Word(s) if s == "value2"));
    }

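    // String escape handling.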
    #[test]
    fn test_string_escape_tab() {
        let mut lexer = Lexer::new(r#""\t""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\t"));
    }

    #[test]
    fn test_string_escape_cr() {
        let mut lexer = Lexer::new(r#""\r""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\r"));
    }

    #[test]
    fn test_string_escape_backspace() {
        let mut lexer = Lexer::new(r#""\b""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\u{0008}"));
    }

    #[test]
    fn test_string_escape_formfeed() {
        let mut lexer = Lexer::new(r#""\f""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\u{000C}"));
    }

    #[test]
    fn test_string_escape_backslash() {
        let mut lexer = Lexer::new(r#""\\""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\\"));
    }

    #[test]
    fn test_string_escape_quote() {
        let mut lexer = Lexer::new(r#""\"hello\"""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\"hello\""));
    }

    #[test]
    fn test_string_escape_unicode() {
        let mut lexer = Lexer::new(r#""\u0041""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "A"));
    }

    #[test]
    fn test_string_escape_unicode_emoji_range() {
        let mut lexer = Lexer::new(r#""\u2665""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\u{2665}"));
    }

    #[test]
    fn test_string_invalid_escape() {
        let mut lexer = Lexer::new(r#""\x""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Invalid escape sequence"));
    }

    #[test]
    fn test_string_invalid_unicode_short() {
        let mut lexer = Lexer::new(r#""\u00""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Invalid unicode escape"));
    }

    #[test]
    fn test_unterminated_string() {
        let mut lexer = Lexer::new(r#""hello"#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Unterminated string"));
    }

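    // Multiline (`"""`) strings.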
    #[test]
    fn test_multiline_string() {
        let input = "\"\"\"
    hello
    world
\"\"\"";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s.contains("hello") && s.contains("world")));
    }

    #[test]
    fn test_unterminated_multiline_string() {
        let input = "\"\"\"
    hello world";
        let mut lexer = Lexer::new(input);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Unterminated multiline string"));
    }

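    // Timestamps.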
    #[test]
    fn test_timestamp_basic() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00Z");
        let tokens = lexer.tokenize().unwrap();
        match &tokens[0].kind {
            TokenKind::Timestamp(ts, _tz) => {
                assert!(*ts > 0);
            }
            other => panic!("Expected Timestamp, got {:?}", other),
        }
    }

    #[test]
    fn test_timestamp_with_millis() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00.123Z");
        let tokens = lexer.tokenize().unwrap();
        match &tokens[0].kind {
            TokenKind::Timestamp(ts, _tz) => {
                assert_eq!(*ts % 1000, 123);
            }
            other => panic!("Expected Timestamp, got {:?}", other),
        }
    }

    #[test]
    fn test_timestamp_date_only() {
        let mut lexer = Lexer::new("2024-01-15");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Timestamp(_, _)));
    }

    #[test]
    fn test_timestamp_with_offset() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00+05:30");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind {
            assert_eq!(tz, 330);
        } else {
            panic!("expected timestamp");
        }
    }

    #[test]
    fn test_timestamp_with_negative_offset() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00-08:00");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind {
            assert_eq!(tz, -480);
        } else {
            panic!("expected timestamp");
        }
    }

    #[test]
    fn test_timestamp_offset_formats() {
        // +hh:mm
        let mut lexer = Lexer::new("2024-01-15T10:30:00+05:30");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind {
            assert_eq!(tz, 330);
        } else {
            panic!("expected timestamp");
        }

        // +hhmm
        let mut lexer = Lexer::new("2024-01-15T10:30:00+0530");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind {
            assert_eq!(tz, 330);
        } else {
            panic!("expected timestamp for +HHMM");
        }

        // +hh
        let mut lexer = Lexer::new("2024-01-15T10:30:00+05");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind {
            assert_eq!(tz, 300);
        } else {
            panic!("expected timestamp for +HH");
        }
    }

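    // Numeric literal formats.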
    #[test]
    fn test_scientific_notation() {
        let mut lexer = Lexer::new("1.5e10 2.3E-5 1e+3");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Float(f) if (f - 1.5e10).abs() < 1.0));
        assert!(matches!(tokens[1].kind, TokenKind::Float(f) if (f - 2.3e-5).abs() < 1e-10));
        assert!(matches!(tokens[2].kind, TokenKind::Float(f) if (f - 1e3).abs() < 1.0));
    }

    #[test]
    fn test_binary_literal() {
        let mut lexer = Lexer::new("0b1100 0B1010");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(12)));
        assert!(matches!(tokens[1].kind, TokenKind::Int(10)));
    }

    #[test]
    fn test_hex_uppercase() {
        let mut lexer = Lexer::new("0XDEAD");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(0xDEAD)));
    }

    #[test]
    fn test_negative_number() {
        let mut lexer = Lexer::new("-42 -3.14");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(-42)));
        assert!(matches!(tokens[1].kind, TokenKind::Float(f) if (f - (-3.14)).abs() < 0.001));
    }

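    // Punctuation, keywords, and positions.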
    #[test]
    fn test_colon_then_word() {
        let mut lexer = Lexer::new(":Circle {radius: 5.0}");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Colon));
        assert!(matches!(&tokens[1].kind, TokenKind::Word(s) if s == "Circle"));
    }

    #[test]
    fn test_colon_without_word() {
        let mut lexer = Lexer::new(": 5");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Colon));
    }

    #[test]
    fn test_question_mark() {
        let mut lexer = Lexer::new("string?");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(s) if s == "string"));
        assert!(matches!(tokens[1].kind, TokenKind::Question));
    }

    #[test]
    fn test_equals_token() {
        let mut lexer = Lexer::new("x = 5");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::Eq));
    }

    #[test]
    fn test_bool_keywords() {
        let mut lexer = Lexer::new("true false");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Bool(true)));
        assert!(matches!(tokens[1].kind, TokenKind::Bool(false)));
    }

    #[test]
    fn test_empty_input() {
        let mut lexer = Lexer::new("");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].kind, TokenKind::Eof));
    }

    #[test]
    fn test_whitespace_only() {
        let mut lexer = Lexer::new(" \n\t ");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].kind, TokenKind::Eof));
    }

    #[test]
    fn test_token_positions() {
        let mut lexer = Lexer::new("hello: 42");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[0].col, 1);
    }

    #[test]
    fn test_all_brackets() {
        let mut lexer = Lexer::new("() {} []");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::LParen));
        assert!(matches!(tokens[1].kind, TokenKind::RParen));
        assert!(matches!(tokens[2].kind, TokenKind::LBrace));
        assert!(matches!(tokens[3].kind, TokenKind::RBrace));
        assert!(matches!(tokens[4].kind, TokenKind::LBracket));
        assert!(matches!(tokens[5].kind, TokenKind::RBracket));
    }

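    // Bytes literals.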
    #[test]
    fn test_bytes_literal_basic() {
        let mut lexer = Lexer::new(r#"b"48656c6c6f""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b == &[0x48, 0x65, 0x6c, 0x6c, 0x6f]));
    }

    #[test]
    fn test_bytes_literal_empty() {
        let mut lexer = Lexer::new(r#"b"""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b.is_empty()));
    }

    #[test]
    fn test_bytes_literal_uppercase() {
        let mut lexer = Lexer::new(r#"b"CAFEF00D""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b == &[0xca, 0xfe, 0xf0, 0x0d]));
    }

    #[test]
    fn test_bytes_literal_mixed_case() {
        let mut lexer = Lexer::new(r#"b"CaFe""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b == &[0xca, 0xfe]));
    }

    #[test]
    fn test_bytes_literal_odd_length_error() {
        let mut lexer = Lexer::new(r#"b"abc""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("odd number of hex digits"), "Error: {}", err);
    }

    #[test]
    fn test_bytes_literal_invalid_char_error() {
        let mut lexer = Lexer::new(r#"b"xyz""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Invalid character"), "Error: {}", err);
    }

    #[test]
    fn test_bytes_literal_unterminated_error() {
        let mut lexer = Lexer::new(r#"b"cafe"#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Unterminated bytes literal"), "Error: {}", err);
    }

    #[test]
    fn test_bytes_literal_does_not_conflict_with_word() {
        let mut lexer = Lexer::new("bar baz");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(w) if w == "bar"));
        assert!(matches!(&tokens[1].kind, TokenKind::Word(w) if w == "baz"));
    }

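    // Fuzz regressions: each input previously caused a panic, overflow, or
    // stack issue; lexing must finish cleanly (an `Err` result is fine).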
    #[test]
    fn test_fuzz_crash_unknown_chars_no_stack_overflow() {
        let input = "\"0B\u{10}\u{3}#\"0BP\u{07FE}-----\u{061D}\u{07FE}\u{07FE}-----\u{061D}\u{3}#\"0B\u{10}\u{3}#\"0BP\u{07FE}-----\u{061D}\u{07FE}\u{07FE}-----\u{061D}\u{07FE}";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_crash_timestamp_non_ascii_date() {
        let input = "02)3313-32-$\u{04A2}\u{1}\0\05";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_crash_backslash_timestamp_non_ascii() {
        let input = "\\\\\u{1}\0\0\n\\\\\\\\\\\\)3313-32-\\\u{052D}\u{052D}:{Y:{Y\\\\\\\\\\\\\\\\\\\\\\3m\u{00AC}m\u{00C2}5\0\05";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_crash_large_repeated_date_pattern() {
        let input = "\"18]\")\"\"\" ]\t;=1] ] 3333-333-3332)3313-33--33331333-333313T33302)3313-333-3333)3313-333-333-3332)33-133-3-333313;-3333)3333313T33302)3313-333-3333)3313-33332)33-3333)3333313T33302)3313-333-3333)3313-333-333-323)33-\t\n\t313T33302)3333-333-3332)3313-33--33331333-333313T33302)";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_parse_iso8601_non_ascii_rejected() {
        assert!(parse_iso8601("2024-01-15T10:30:00Z").is_ok());
        assert!(parse_iso8601("3313-32-$\u{04A2}").is_err());
        assert!(parse_iso8601("2024-01-\u{052D}5").is_err());
        assert!(parse_iso8601("").is_err());
        assert!(parse_iso8601("short").is_err());
        assert!(parse_iso8601("2024-00-15T10:30:00Z").is_err());
        assert!(parse_iso8601("2024-01-00T10:30:00Z").is_err());
        assert!(parse_iso8601("2024-13-15T10:30:00Z").is_err());
        assert!(parse_iso8601("2024-00-00T10:30:00Z").is_err());
    }

    #[test]
    fn test_fuzz_timestamp_trailing_dot() {
        // A fractional part with no digits before the timezone is invalid.
        let mut lexer = Lexer::new("2024-01-15T10:30:00.Z");
        let result = lexer.tokenize();
        assert!(result.is_err());
    }

    #[test]
    fn test_fuzz_crash_timestamp_long_fractional_no_overflow() {
        let _ = parse_iso8601("3230-32-33T33016656.6563311111111111111112");
        // Extra fractional digits are truncated to millisecond precision.
        let result = parse_iso8601("2024-01-15T10:30:00.123456789012345678901234567890Z");
        assert!(result.is_ok());
        assert_eq!(result.unwrap().0 % 1000, 123);
    }

    #[test]
    fn test_fuzz_crash_bc25426e_full_parse_no_panic() {
        let input = "\x00\x00\x00\x00\x00\x00\x00O\x00\x00\x00\x00\x00\x00\x00\x00\x0030-3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x003232,\x00\x00\x001\x00\x00O\x00\x00\x00\x00\x00\x00\x00\x00\x0030-3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x003232,\x00\x00\x00111111112\x00\n\x00\x00\x00\x00\x00\x003,3230-32-33T33016656.6563311111111111111112\x00\n\x00\x00\x00\x00\x00\x003,3230-32-33T33016656.65633111111111113323!:g";
        let _ = crate::TeaLeaf::parse(input);
    }

    #[test]
    fn test_fuzz_crash_multiline_multibyte_whitespace_dedent() {
        let input = "*\0\"\"\"\u{0B}J\n\n\n\u{0085}\u{0B}J\n\n\n\n\n\n\n\n\"\"\" \0\n\n\n\n\n\"\"\" \0\0";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_multiline_string_multibyte_indent() {
        // U+0085 (NEL) is two bytes in UTF-8, so dedenting by one char must
        // not slice inside a UTF-8 sequence.
        let input = "\"\"\"\n\u{0085}A\n\u{0B}B\n\"\"\"";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize().unwrap();
        match &tokens[0].kind {
            TokenKind::String(s) => {
                assert_eq!(s, "A\nB", "Both lines should be dedented by 1 character");
            }
            other => panic!("Expected String, got {:?}", other),
        }
    }

    #[test]
    fn test_many_unknown_chars_no_stack_overflow() {
        let input: String = std::iter::repeat('\u{07FE}').take(10_000).collect();
        let mut lexer = Lexer::new(&input);
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].kind, TokenKind::Eof));
    }
}