use crate::{Error, Result};

#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    Word(String),
    String(String),
    Bytes(Vec<u8>),
    Int(i64),
    UInt(u64),
    Float(f64),
    Bool(bool),
    Null,
    /// Milliseconds since the Unix epoch, plus the timezone offset in minutes.
    Timestamp(i64, i16),
    /// A numeric literal that fits neither `i64`/`u64` nor a finite `f64`,
    /// preserved verbatim.
    JsonNumber(String),
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    LParen,
    RParen,
    Colon,
    Comma,
    Eq,
    Question,
    Directive(String),
    Tag(String),
    Ref(String),

    Eof,
}

#[derive(Debug, Clone)]
pub struct Token {
    pub kind: TokenKind,
    pub line: usize,
    pub col: usize,
}

impl Token {
    pub fn new(kind: TokenKind, line: usize, col: usize) -> Self {
        Self { kind, line, col }
    }
}

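/// A hand-written lexer that produces a flat `Vec<Token>` from source text.
///
/// A minimal usage sketch (assuming the crate's public re-exports):
///
/// ```ignore
/// let mut lexer = Lexer::new("x = 5");
/// let tokens = lexer.tokenize().unwrap();
/// assert!(matches!(tokens[0].kind, TokenKind::Word(_)));
/// ```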
pub struct Lexer<'a> {
    input: &'a str,
    pos: usize,
    line: usize,
    col: usize,
}

impl<'a> Lexer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            pos: 0,
            line: 1,
            col: 1,
        }
    }

    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
        let mut tokens = Vec::new();
        loop {
            let tok = self.next_token()?;
            let is_eof = matches!(tok.kind, TokenKind::Eof);
            tokens.push(tok);
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }

    fn next_token(&mut self) -> Result<Token> {
        loop {
            self.skip_whitespace_and_comments();

            let line = self.line;
            let col = self.col;

            if self.pos >= self.input.len() {
                return Ok(Token::new(TokenKind::Eof, line, col));
            }

            let c = match self.current_char() {
                Some(c) => c,
                None => return Ok(Token::new(TokenKind::Eof, line, col)),
            };

            // Single-character tokens.
            let simple = match c {
                '{' => Some(TokenKind::LBrace),
                '}' => Some(TokenKind::RBrace),
                '[' => Some(TokenKind::LBracket),
                ']' => Some(TokenKind::RBracket),
                '(' => Some(TokenKind::LParen),
                ')' => Some(TokenKind::RParen),
                ',' => Some(TokenKind::Comma),
                '=' => Some(TokenKind::Eq),
                '~' => Some(TokenKind::Null),
                '?' => Some(TokenKind::Question),
                _ => None,
            };

            if let Some(kind) = simple {
                self.advance();
                return Ok(Token::new(kind, line, col));
            }

            // A colon followed by a word is a tag; a bare colon is a separator.
            if c == ':' {
                self.advance();
                if self.current_char().map(|c| c.is_alphabetic() || c == '_').unwrap_or(false) {
                    let word = self.read_word();
                    return Ok(Token::new(TokenKind::Tag(word), line, col));
                }
                return Ok(Token::new(TokenKind::Colon, line, col));
            }

            // @name introduces a directive.
            if c == '@' {
                self.advance();
                let word = self.read_word();
                return Ok(Token::new(TokenKind::Directive(word), line, col));
            }

            // !name is a reference.
            if c == '!' {
                self.advance();
                let word = self.read_word();
                return Ok(Token::new(TokenKind::Ref(word), line, col));
            }

            // b"..." is a bytes literal; a bare `b` still lexes as a word.
            if c == 'b' && self.peek_char(1) == Some('"') {
                return self.read_bytes_literal(line, col);
            }

            if c == '"' {
                return self.read_string(line, col);
            }

            if c.is_ascii_digit() {
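                // A digit may start either a number or an ISO-8601 timestamp.
                // Look ahead for the `DDDD-DD-DD` date shape before committing
                // to a number.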
                let remaining = self.input[self.pos..].as_bytes();
                if remaining.len() >= 10
                    && remaining[0].is_ascii_digit()
                    && remaining[1].is_ascii_digit()
                    && remaining[2].is_ascii_digit()
                    && remaining[3].is_ascii_digit()
                    && remaining[4] == b'-'
                    && remaining[5].is_ascii_digit()
                    && remaining[6].is_ascii_digit()
                    && remaining[7] == b'-'
                    && remaining[8].is_ascii_digit()
                    && remaining[9].is_ascii_digit()
                {
                    return self.read_timestamp(line, col);
                }
            }

            // `-inf` is a float literal only when not followed by a word character.
            if c == '-' && self.input[self.pos..].starts_with("-inf") {
                let after = self.input[self.pos + 4..].chars().next();
                if after.map_or(true, |c| !c.is_alphanumeric() && c != '_') {
                    self.pos += 4;
                    self.col += 4;
                    return Ok(Token::new(TokenKind::Float(f64::NEG_INFINITY), line, col));
                }
            }

            if c.is_ascii_digit() || (c == '-' && self.peek_char(1).map(|c| c.is_ascii_digit()).unwrap_or(false)) {
                return self.read_number(line, col);
            }

            // Bare words and keyword literals.
            if c.is_alphabetic() || c == '_' {
                let word = self.read_word();
                let kind = match word.as_str() {
                    "true" => TokenKind::Bool(true),
                    "false" => TokenKind::Bool(false),
                    "NaN" => TokenKind::Float(f64::NAN),
                    "inf" => TokenKind::Float(f64::INFINITY),
                    _ => TokenKind::Word(word),
                };
                return Ok(Token::new(kind, line, col));
            }

            // Unknown character: skip it and keep scanning.
            self.advance();
        }
    }

    fn current_char(&self) -> Option<char> {
        self.input[self.pos..].chars().next()
    }

    fn peek_char(&self, offset: usize) -> Option<char> {
        self.input[self.pos..].chars().nth(offset)
    }

    fn advance(&mut self) {
        if let Some(c) = self.current_char() {
            self.pos += c.len_utf8();
            if c == '\n' {
                self.line += 1;
                self.col = 1;
            } else {
                self.col += 1;
            }
        }
    }

    fn skip_whitespace_and_comments(&mut self) {
        while let Some(c) = self.current_char() {
            if c.is_whitespace() {
                self.advance();
            } else if c == '#' {
                while let Some(c) = self.current_char() {
                    if c == '\n' {
                        break;
                    }
                    self.advance();
                }
            } else {
                break;
            }
        }
    }

    fn read_word(&mut self) -> String {
        let start = self.pos;
        while let Some(c) = self.current_char() {
            if c.is_alphanumeric() || c == '_' || c == '-' || c == '.' {
                self.advance();
            } else {
                break;
            }
        }
        self.input[start..self.pos].to_string()
    }

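    /// Reads a double-quoted string starting at the opening quote.
    /// Handles `\n`, `\t`, `\r`, `\b`, `\f`, `\"`, `\\`, and `\uXXXX`
    /// escapes, and defers `"""` blocks to `read_multiline_string`.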
    fn read_string(&mut self, line: usize, col: usize) -> Result<Token> {
        self.advance(); // consume the opening quote

        // Two more quotes mean this is a `"""` multiline block.
        if self.input[self.pos..].starts_with("\"\"") {
            self.advance();
            self.advance();
            return self.read_multiline_string(line, col);
        }

        let mut value = String::new();
        while let Some(c) = self.current_char() {
            if c == '"' {
                self.advance();
                return Ok(Token::new(TokenKind::String(value), line, col));
            } else if c == '\\' {
                self.advance();
                if let Some(escaped) = self.current_char() {
                    match escaped {
                        'n' => { value.push('\n'); self.advance(); }
                        't' => { value.push('\t'); self.advance(); }
                        'r' => { value.push('\r'); self.advance(); }
                        'b' => { value.push('\u{0008}'); self.advance(); }
                        'f' => { value.push('\u{000C}'); self.advance(); }
                        '"' => { value.push('"'); self.advance(); }
                        '\\' => { value.push('\\'); self.advance(); }
                        'u' => {
                            self.advance(); // consume the 'u'
                            let start = self.pos;
                            let mut count = 0;
                            while count < 4 {
                                match self.current_char() {
                                    Some(c) if c.is_ascii_hexdigit() => {
                                        self.advance();
                                        count += 1;
                                    }
                                    _ => break,
                                }
                            }
                            if count != 4 {
                                return Err(Error::ParseError(
                                    "Invalid unicode escape: expected 4 hex digits after \\u".to_string()
                                ));
                            }
                            let hex = &self.input[start..self.pos];
                            let code = u32::from_str_radix(hex, 16).map_err(|_| {
                                Error::ParseError(format!("Invalid unicode escape: \\u{}", hex))
                            })?;
                            let ch = char::from_u32(code).ok_or_else(|| {
                                Error::ParseError(format!("Invalid unicode codepoint: U+{:04X}", code))
                            })?;
                            value.push(ch);
                        }
                        _ => {
                            return Err(Error::ParseError(
                                format!("Invalid escape sequence: \\{}", escaped)
                            ));
                        }
                    }
                }
            } else {
                value.push(c);
                self.advance();
            }
        }
        Err(Error::ParseError("Unterminated string".to_string()))
    }

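    /// Reads a `b"..."` bytes literal: an even number of hex digits between
    /// the quotes, decoded pairwise into bytes.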
    fn read_bytes_literal(&mut self, line: usize, col: usize) -> Result<Token> {
        self.advance(); // consume the 'b'
        self.advance(); // consume the opening quote

        let mut hex = String::new();
        while let Some(c) = self.current_char() {
            if c == '"' {
                self.advance();
                if hex.len() % 2 != 0 {
                    return Err(Error::ParseError(
                        format!("Bytes literal has odd number of hex digits ({})", hex.len())
                    ));
                }
                let bytes = (0..hex.len())
                    .step_by(2)
                    .map(|i| u8::from_str_radix(&hex[i..i + 2], 16).map_err(|_|
                        Error::ParseError(format!("Invalid hex pair '{}' in bytes literal", &hex[i..i + 2]))
                    ))
                    .collect::<Result<Vec<u8>>>()?;
                return Ok(Token::new(TokenKind::Bytes(bytes), line, col));
            } else if c.is_ascii_hexdigit() {
                hex.push(c);
                self.advance();
            } else {
                return Err(Error::ParseError(
                    format!("Invalid character '{}' in bytes literal (expected hex digit or '\"')", c)
                ));
            }
        }
        Err(Error::ParseError("Unterminated bytes literal".to_string()))
    }

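    /// Reads the body of a `"""..."""` block (the opening quotes are already
    /// consumed). A blank first/last line adjacent to the delimiters is
    /// dropped, then the common leading indentation of the non-blank lines,
    /// counted in characters, is stripped from every line.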
    fn read_multiline_string(&mut self, line: usize, col: usize) -> Result<Token> {
        let start = self.pos;
        while self.pos < self.input.len() {
            if self.input[self.pos..].starts_with("\"\"\"") {
                let raw = &self.input[start..self.pos];
                self.advance();
                self.advance();
                self.advance();

                let lines: Vec<&str> = raw.lines().collect();
                // Drop a blank line directly after the opening delimiter...
                let lines: Vec<&str> = if lines.len() > 1 && lines.first().map(|l| l.trim().is_empty()).unwrap_or(false) {
                    lines[1..].to_vec()
                } else {
                    lines
                };
                // ...and directly before the closing delimiter.
                let lines: Vec<&str> = if lines.len() > 1 && lines.last().map(|l| l.trim().is_empty()).unwrap_or(false) {
                    lines[..lines.len() - 1].to_vec()
                } else {
                    lines
                };

                // Common indentation across non-blank lines, counted in
                // characters rather than bytes.
                let min_indent = lines
                    .iter()
                    .filter(|l| !l.trim().is_empty())
                    .map(|l| l.chars().take_while(|c| c.is_whitespace()).count())
                    .min()
                    .unwrap_or(0);

                let dedented: Vec<&str> = lines
                    .iter()
                    .map(|l| {
                        // Convert the character count into a byte offset before slicing.
                        let byte_off: usize = l.chars().take(min_indent).map(|c| c.len_utf8()).sum();
                        if byte_off <= l.len() { &l[byte_off..] } else { *l }
                    })
                    .collect();

                return Ok(Token::new(TokenKind::String(dedented.join("\n")), line, col));
            }
            self.advance();
        }
        Err(Error::ParseError("Unterminated multiline string".to_string()))
    }

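    /// Reads an ISO-8601 timestamp: `YYYY-MM-DD`, optionally followed by a
    /// `T` time with fractional seconds and a `Z` or signed offset. The
    /// consumed text is validated by `parse_iso8601`.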
    fn read_timestamp(&mut self, line: usize, col: usize) -> Result<Token> {
        let start = self.pos;

        // Consume the date: YYYY-MM-DD (shape already checked by the caller).
        for _ in 0..10 {
            self.advance();
        }

        // Optional time part.
        if self.current_char() == Some('T') {
            self.advance();
            while let Some(c) = self.current_char() {
                if c.is_ascii_digit() || c == ':' {
                    self.advance();
                } else {
                    break;
                }
            }
            // Optional fractional seconds.
            if self.current_char() == Some('.') {
                self.advance();
                while let Some(c) = self.current_char() {
                    if c.is_ascii_digit() {
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
            // Optional timezone: Z or a signed offset.
            if self.current_char() == Some('Z') {
                self.advance();
            } else if self.current_char() == Some('+') || self.current_char() == Some('-') {
                self.advance();
                while let Some(c) = self.current_char() {
                    if c.is_ascii_digit() || c == ':' {
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
        }

        let timestamp_str = &self.input[start..self.pos];
        let (millis, tz_offset) = parse_iso8601(timestamp_str)
            .map_err(|_| Error::ParseError(format!("Invalid timestamp: {}", timestamp_str)))?;

        Ok(Token::new(TokenKind::Timestamp(millis, tz_offset), line, col))
    }

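    /// Reads a numeric literal: decimal integers, `0x`/`0b` radix integers,
    /// and floats with an optional fraction and exponent. Integers that do
    /// not fit `i64` fall back to `u64`, then to `JsonNumber`.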
    fn read_number(&mut self, line: usize, col: usize) -> Result<Token> {
        let start = self.pos;

        if self.current_char() == Some('-') {
            self.advance();
        }

        // Hex literal: 0x / 0X.
        if self.input[self.pos..].starts_with("0x") || self.input[self.pos..].starts_with("0X") {
            self.advance();
            self.advance();
            while let Some(c) = self.current_char() {
                if c.is_ascii_hexdigit() {
                    self.advance();
                } else {
                    break;
                }
            }
            let s = &self.input[start..self.pos];
            let val = if s.starts_with('-') {
                // Skip the leading "-0x" before parsing, then negate.
                -(i64::from_str_radix(&s[3..], 16).map_err(|_| Error::ParseError(format!("Invalid hex: {}", s)))?)
            } else {
                i64::from_str_radix(&s[2..], 16).map_err(|_| Error::ParseError(format!("Invalid hex: {}", s)))?
            };
            return Ok(Token::new(TokenKind::Int(val), line, col));
        }

        // Binary literal: 0b / 0B.
        if self.input[self.pos..].starts_with("0b") || self.input[self.pos..].starts_with("0B") {
            self.advance();
            self.advance();
            while let Some(c) = self.current_char() {
                if c == '0' || c == '1' {
                    self.advance();
                } else {
                    break;
                }
            }
            let s = &self.input[start..self.pos];
            let val = if s.starts_with('-') {
                -(i64::from_str_radix(&s[3..], 2).map_err(|_| Error::ParseError(format!("Invalid binary: {}", s)))?)
            } else {
                i64::from_str_radix(&s[2..], 2).map_err(|_| Error::ParseError(format!("Invalid binary: {}", s)))?
            };
            return Ok(Token::new(TokenKind::Int(val), line, col));
        }

        // Decimal integer or float.
        let mut has_dot = false;
        let mut has_exp = false;
        while let Some(c) = self.current_char() {
            if c.is_ascii_digit() {
                self.advance();
            } else if c == '.' && !has_dot && !has_exp {
                has_dot = true;
                self.advance();
            } else if (c == 'e' || c == 'E') && !has_exp {
                has_exp = true;
                self.advance();
                if self.current_char() == Some('+') || self.current_char() == Some('-') {
                    self.advance();
                }
            } else {
                break;
            }
        }

        let s = &self.input[start..self.pos];
        if has_dot || has_exp {
            let val: f64 = s.parse().map_err(|_| Error::ParseError(format!("Invalid float: {}", s)))?;
            if val.is_finite() {
                Ok(Token::new(TokenKind::Float(val), line, col))
            } else {
                Ok(Token::new(TokenKind::JsonNumber(s.to_string()), line, col))
            }
        } else {
            // Try i64 first, then u64, then keep the digits verbatim.
            match s.parse::<i64>() {
                Ok(val) => Ok(Token::new(TokenKind::Int(val), line, col)),
                Err(_) => match s.parse::<u64>() {
                    Ok(val) => Ok(Token::new(TokenKind::UInt(val), line, col)),
                    Err(_) => Ok(Token::new(TokenKind::JsonNumber(s.to_string()), line, col)),
                }
            }
        }
    }
}

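/// A small hand-rolled ISO-8601 parser. Returns `(milliseconds since the
/// Unix epoch, timezone offset in minutes)`, or `Err(())` on malformed input.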
fn parse_iso8601(s: &str) -> std::result::Result<(i64, i16), ()> {
    // The byte-indexed slicing below assumes ASCII throughout.
    if !s.is_ascii() {
        return Err(());
    }

    if s.len() < 10 {
        return Err(());
    }

    let year: i64 = s[0..4].parse().map_err(|_| ())?;
    let month: u32 = s[5..7].parse().map_err(|_| ())?;
    let day: u32 = s[8..10].parse().map_err(|_| ())?;
    if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
        return Err(());
    }

    let time_start = 10;
    let (hour, minute, second, millis, tz_offset_minutes) = if s.len() > time_start && s.as_bytes()[time_start] == b'T' {
        let time_part = &s[time_start + 1..];
        let hour: u32 = time_part.get(0..2).ok_or(())?.parse().map_err(|_| ())?;
        let minute: u32 = time_part.get(3..5).ok_or(())?.parse().map_err(|_| ())?;

        // After HH:MM the next byte disambiguates: ':' begins the seconds,
        // '+', '-', or 'Z' begins a timezone, anything else ends the time.
        let (second, rest_start) = if time_part.len() > 5 {
            match time_part.as_bytes()[5] {
                b':' => {
                    let sec: u32 = time_part.get(6..8).ok_or(())?.parse().map_err(|_| ())?;
                    (sec, 8usize)
                }
                b'+' | b'-' | b'Z' => (0u32, 5usize),
                _ => (0u32, time_part.len()),
            }
        } else {
            (0u32, time_part.len())
        };

        if hour > 23 || minute > 59 || second > 59 {
            return Err(());
        }

        let mut millis = 0i64;
        let mut rest = &time_part[rest_start.min(time_part.len())..];

        // Fractional seconds: keep at most three digits (milliseconds).
        if rest.starts_with('.') && rest.len() > 1 {
            let end = rest[1..].find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len() - 1);
            if end == 0 {
                return Err(());
            }
            let frac_digits = end.min(3);
            let ms_str = &rest[1..1 + frac_digits];
            millis = ms_str.parse::<i64>().unwrap_or(0);
            // Scale up when fewer than three digits were given (".5" -> 500 ms).
            let digits = ms_str.len();
            if digits < 3 {
                millis *= 10i64.pow(3 - digits as u32);
            }
            rest = &rest[end + 1..];
        } else if rest.starts_with('.') {
            // A lone trailing dot: skip it.
            rest = &rest[1..];
        }

        // Timezone: Z, ±HH, ±HH:MM, or ±HHMM.
        let tz_offset = if rest.starts_with('Z') {
            0i32
        } else if rest.starts_with('+') || rest.starts_with('-') {
            let sign: i32 = if rest.starts_with('+') { 1 } else { -1 };
            let tz = &rest[1..];
            let tz_hour: i32 = tz.get(0..2).ok_or(())?.parse().map_err(|_| ())?;
            let tz_min: i32 = if tz.len() >= 4 && tz.as_bytes()[2] == b':' {
                tz.get(3..5).unwrap_or("00").parse().unwrap_or(0)
            } else if tz.len() >= 4 && tz.as_bytes()[2] != b':' {
                tz.get(2..4).unwrap_or("00").parse().unwrap_or(0)
            } else {
                0
            };
            if tz_hour > 23 || tz_min > 59 {
                return Err(());
            }
            sign * (tz_hour * 60 + tz_min)
        } else {
            0
        };

        (hour, minute, second, millis, tz_offset)
    } else {
        (0, 0, 0, 0, 0)
    };

    // Days since the epoch, then seconds, shifted by the timezone offset.
    let days = days_from_epoch(year, month, day);
    let seconds = days * 86400
        + hour as i64 * 3600
        + minute as i64 * 60
        + second as i64
        - tz_offset_minutes as i64 * 60;

    Ok((seconds * 1000 + millis, tz_offset_minutes as i16))
}

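// Civil-date-to-day-count conversion. This appears to follow the well-known
// "days from civil" algorithm (Howard Hinnant's date algorithms), which
// shifts the year to start in March so leap days fall at the end.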
fn days_from_epoch(year: i64, month: u32, day: u32) -> i64 {
    let y = if month <= 2 { year - 1 } else { year };
    let m = if month <= 2 { month + 12 } else { month };
    let era = if y >= 0 { y } else { y - 399 } / 400;
    let yoe = (y - era * 400) as u32;
    let doy = (153 * (m - 3) + 2) / 5 + day - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    era * 146097 + doe as i64 - 719468
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokens() {
        let mut lexer = Lexer::new("{ } [ ] ( ) : , ~");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::LBrace));
        assert!(matches!(tokens[1].kind, TokenKind::RBrace));
        assert!(matches!(tokens[2].kind, TokenKind::LBracket));
        assert!(matches!(tokens[8].kind, TokenKind::Null));
    }

    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 -17 3.14 0xFF 0b1010");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(42)));
        assert!(matches!(tokens[1].kind, TokenKind::Int(-17)));
        assert!(matches!(tokens[2].kind, TokenKind::Float(f) if (f - 3.14).abs() < 0.001));
        assert!(matches!(tokens[3].kind, TokenKind::Int(255)));
        assert!(matches!(tokens[4].kind, TokenKind::Int(10)));
    }

    #[test]
    fn test_strings() {
        let mut lexer = Lexer::new(r#""hello" "world\n""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "hello"));
        assert!(matches!(&tokens[1].kind, TokenKind::String(s) if s == "world\n"));
    }

    #[test]
    fn test_directives() {
        let mut lexer = Lexer::new("@struct @table");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Directive(s) if s == "struct"));
        assert!(matches!(&tokens[1].kind, TokenKind::Directive(s) if s == "table"));
    }

    #[test]
    fn test_references() {
        let mut lexer = Lexer::new("!myref !another_ref");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Ref(s) if s == "myref"));
        assert!(matches!(&tokens[1].kind, TokenKind::Ref(s) if s == "another_ref"));
    }

    #[test]
    fn test_comments_and_references() {
        let mut lexer = Lexer::new("value1 # this is a comment\nvalue2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(s) if s == "value1"));
        assert!(matches!(&tokens[1].kind, TokenKind::Word(s) if s == "value2"));
        assert!(matches!(tokens[2].kind, TokenKind::Eof));

        let mut lexer = Lexer::new("value1 !ref value2");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(s) if s == "value1"));
        assert!(matches!(&tokens[1].kind, TokenKind::Ref(s) if s == "ref"));
        assert!(matches!(&tokens[2].kind, TokenKind::Word(s) if s == "value2"));
    }

    // ----- string escapes -----

    #[test]
    fn test_string_escape_tab() {
        let mut lexer = Lexer::new(r#""\t""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\t"));
    }

    #[test]
    fn test_string_escape_cr() {
        let mut lexer = Lexer::new(r#""\r""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\r"));
    }

    #[test]
    fn test_string_escape_backspace() {
        let mut lexer = Lexer::new(r#""\b""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\u{0008}"));
    }

    #[test]
    fn test_string_escape_formfeed() {
        let mut lexer = Lexer::new(r#""\f""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\u{000C}"));
    }

    #[test]
    fn test_string_escape_backslash() {
        let mut lexer = Lexer::new(r#""\\""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\\"));
    }

    #[test]
    fn test_string_escape_quote() {
        let mut lexer = Lexer::new(r#""\"hello\"""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\"hello\""));
    }

    #[test]
    fn test_string_escape_unicode() {
        let mut lexer = Lexer::new(r#""\u0041""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "A"));
    }

    #[test]
    fn test_string_escape_unicode_emoji_range() {
        let mut lexer = Lexer::new(r#""\u2665""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "\u{2665}"));
    }

    #[test]
    fn test_string_invalid_escape() {
        let mut lexer = Lexer::new(r#""\x""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Invalid escape sequence"));
    }

    #[test]
    fn test_string_invalid_unicode_short() {
        let mut lexer = Lexer::new(r#""\u00""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Invalid unicode escape"));
    }

    #[test]
    fn test_unterminated_string() {
        let mut lexer = Lexer::new(r#""hello"#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Unterminated string"));
    }

    // ----- multiline strings -----

    #[test]
    fn test_multiline_string() {
        let input = "\"\"\"
    hello
    world
\"\"\"";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s.contains("hello") && s.contains("world")));
    }
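
    // A focused check of the common-indent stripping in read_multiline_string.
    #[test]
    fn test_multiline_string_dedent() {
        let input = "\"\"\"\n    hello\n    world\n\"\"\"";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "hello\nworld"));
    }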

    #[test]
    fn test_unterminated_multiline_string() {
        let input = "\"\"\"
    hello world";
        let mut lexer = Lexer::new(input);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Unterminated multiline string"));
    }

    // ----- timestamps -----

    #[test]
    fn test_timestamp_basic() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00Z");
        let tokens = lexer.tokenize().unwrap();
        match &tokens[0].kind {
            TokenKind::Timestamp(ts, _tz) => {
                assert!(*ts > 0);
            }
            other => panic!("Expected Timestamp, got {:?}", other),
        }
    }

    #[test]
    fn test_timestamp_with_millis() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00.123Z");
        let tokens = lexer.tokenize().unwrap();
        match &tokens[0].kind {
            TokenKind::Timestamp(ts, _tz) => {
                assert_eq!(*ts % 1000, 123); // .123 -> 123 milliseconds
            }
            other => panic!("Expected Timestamp, got {:?}", other),
        }
    }

    #[test]
    fn test_timestamp_date_only() {
        let mut lexer = Lexer::new("2024-01-15");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Timestamp(_, _)));
    }

    #[test]
    fn test_timestamp_with_offset() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00+05:30");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind {
            assert_eq!(tz, 330);
        } else {
            panic!("expected timestamp");
        }
    }

    #[test]
    fn test_timestamp_with_negative_offset() {
        let mut lexer = Lexer::new("2024-01-15T10:30:00-08:00");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind {
            assert_eq!(tz, -480);
        } else {
            panic!("expected timestamp");
        }
    }

    #[test]
    fn test_timestamp_offset_formats() {
        // +HH:MM
        let mut lexer = Lexer::new("2024-01-15T10:30:00+05:30");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind {
            assert_eq!(tz, 330);
        } else {
            panic!("expected timestamp");
        }

        // +HHMM (no colon)
        let mut lexer = Lexer::new("2024-01-15T10:30:00+0530");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind {
            assert_eq!(tz, 330);
        } else {
            panic!("expected timestamp for +HHMM");
        }

        // +HH only
        let mut lexer = Lexer::new("2024-01-15T10:30:00+05");
        let tokens = lexer.tokenize().unwrap();
        if let TokenKind::Timestamp(_, tz) = tokens[0].kind {
            assert_eq!(tz, 300);
        } else {
            panic!("expected timestamp for +HH");
        }
    }

    // ----- numbers -----

    #[test]
    fn test_scientific_notation() {
        let mut lexer = Lexer::new("1.5e10 2.3E-5 1e+3");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Float(f) if (f - 1.5e10).abs() < 1.0));
        assert!(matches!(tokens[1].kind, TokenKind::Float(f) if (f - 2.3e-5).abs() < 1e-10));
        assert!(matches!(tokens[2].kind, TokenKind::Float(f) if (f - 1e3).abs() < 1.0));
    }

    #[test]
    fn test_binary_literal() {
        let mut lexer = Lexer::new("0b1100 0B1010");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(12)));
        assert!(matches!(tokens[1].kind, TokenKind::Int(10)));
    }

    #[test]
    fn test_hex_uppercase() {
        let mut lexer = Lexer::new("0XDEAD");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(0xDEAD)));
    }
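
    // Overflow fallbacks in read_number: i64, then u64, then JsonNumber.
    #[test]
    fn test_int_overflow_fallbacks() {
        let mut lexer = Lexer::new("18446744073709551615 99999999999999999999");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::UInt(u64::MAX)));
        assert!(matches!(&tokens[1].kind, TokenKind::JsonNumber(s) if s == "99999999999999999999"));
    }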

    #[test]
    fn test_negative_number() {
        let mut lexer = Lexer::new("-42 -3.14");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Int(-42)));
        assert!(matches!(tokens[1].kind, TokenKind::Float(f) if (f - (-3.14)).abs() < 0.001));
    }
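
    // Keyword float literals handled in next_token.
    #[test]
    fn test_special_float_keywords() {
        let mut lexer = Lexer::new("inf -inf NaN");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Float(f) if f == f64::INFINITY));
        assert!(matches!(tokens[1].kind, TokenKind::Float(f) if f == f64::NEG_INFINITY));
        assert!(matches!(tokens[2].kind, TokenKind::Float(f) if f.is_nan()));
    }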

    // ----- punctuation and keywords -----

    #[test]
    fn test_tag_token() {
        let mut lexer = Lexer::new(":Circle {radius: 5.0}");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Tag(s) if s == "Circle"));
    }

    #[test]
    fn test_colon_without_word() {
        let mut lexer = Lexer::new(": 5");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Colon));
    }

    #[test]
    fn test_question_mark() {
        let mut lexer = Lexer::new("string?");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(s) if s == "string"));
        assert!(matches!(tokens[1].kind, TokenKind::Question));
    }

    #[test]
    fn test_equals_token() {
        let mut lexer = Lexer::new("x = 5");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[1].kind, TokenKind::Eq));
    }

    #[test]
    fn test_bool_keywords() {
        let mut lexer = Lexer::new("true false");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Bool(true)));
        assert!(matches!(tokens[1].kind, TokenKind::Bool(false)));
    }

    #[test]
    fn test_empty_input() {
        let mut lexer = Lexer::new("");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].kind, TokenKind::Eof));
    }

    #[test]
    fn test_whitespace_only() {
        let mut lexer = Lexer::new(" \n\t ");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].kind, TokenKind::Eof));
    }

    #[test]
    fn test_token_positions() {
        let mut lexer = Lexer::new("hello: 42");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[0].col, 1);
    }

    #[test]
    fn test_all_brackets() {
        let mut lexer = Lexer::new("() {} []");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::LParen));
        assert!(matches!(tokens[1].kind, TokenKind::RParen));
        assert!(matches!(tokens[2].kind, TokenKind::LBrace));
        assert!(matches!(tokens[3].kind, TokenKind::RBrace));
        assert!(matches!(tokens[4].kind, TokenKind::LBracket));
        assert!(matches!(tokens[5].kind, TokenKind::RBracket));
    }

    // ----- bytes literals -----

    #[test]
    fn test_bytes_literal_basic() {
        let mut lexer = Lexer::new(r#"b"48656c6c6f""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b == &[0x48, 0x65, 0x6c, 0x6c, 0x6f]));
    }

    #[test]
    fn test_bytes_literal_empty() {
        let mut lexer = Lexer::new(r#"b"""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b.is_empty()));
    }

    #[test]
    fn test_bytes_literal_uppercase() {
        let mut lexer = Lexer::new(r#"b"CAFEF00D""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b == &[0xca, 0xfe, 0xf0, 0x0d]));
    }

    #[test]
    fn test_bytes_literal_mixed_case() {
        let mut lexer = Lexer::new(r#"b"CaFe""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Bytes(b) if b == &[0xca, 0xfe]));
    }

    #[test]
    fn test_bytes_literal_odd_length_error() {
        let mut lexer = Lexer::new(r#"b"abc""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("odd number of hex digits"), "Error: {}", err);
    }

    #[test]
    fn test_bytes_literal_invalid_char_error() {
        let mut lexer = Lexer::new(r#"b"xyz""#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Invalid character"), "Error: {}", err);
    }

    #[test]
    fn test_bytes_literal_unterminated_error() {
        let mut lexer = Lexer::new(r#"b"cafe"#);
        let err = lexer.tokenize().unwrap_err();
        assert!(err.to_string().contains("Unterminated bytes literal"), "Error: {}", err);
    }

    #[test]
    fn test_bytes_literal_does_not_conflict_with_word() {
        let mut lexer = Lexer::new("bar baz");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Word(w) if w == "bar"));
        assert!(matches!(&tokens[1].kind, TokenKind::Word(w) if w == "baz"));
    }

    // ----- fuzzer regressions -----

    #[test]
    fn test_fuzz_crash_unknown_chars_no_stack_overflow() {
        // Fuzz input mixing unknown characters, comments, and quotes;
        // lexing must terminate without panicking.
        let input = "\"0B\u{10}\u{3}#\"0BP\u{07FE}-----\u{061D}\u{07FE}\u{07FE}-----\u{061D}\u{3}#\"0B\u{10}\u{3}#\"0BP\u{07FE}-----\u{061D}\u{07FE}\u{07FE}-----\u{061D}\u{07FE}";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_crash_timestamp_non_ascii_date() {
        // Non-ASCII bytes in something shaped like a date must not panic.
        let input = "02)3313-32-$\u{04A2}\u{1}\0\05";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_crash_backslash_timestamp_non_ascii() {
        let input = "\\\\\u{1}\0\0\n\\\\\\\\\\\\)3313-32-\\\u{052D}\u{052D}:{Y:{Y\\\\\\\\\\\\\\\\\\\\\\3m\u{00AC}m\u{00C2}5\0\05";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_crash_large_repeated_date_pattern() {
        let input = "\"18]\")\"\"\" ]\t;=1] ] 3333-333-3332)3313-33--33331333-333313T33302)3313-333-3333)3313-333-333-3332)33-133-3-333313;-3333)3333313T33302)3313-333-3333)3313-33332)33-3333)3333313T33302)3313-333-3333)3313-333-333-323)33-\t\n\t313T33302)3333-333-3332)3313-33--33331333-333313T33302)";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_fuzz_parse_iso8601_non_ascii_rejected() {
        assert!(parse_iso8601("2024-01-15T10:30:00Z").is_ok());
        assert!(parse_iso8601("3313-32-$\u{04A2}").is_err());
        assert!(parse_iso8601("2024-01-\u{052D}5").is_err());
        assert!(parse_iso8601("").is_err());
        assert!(parse_iso8601("short").is_err());
        // Month and day must be in range.
        assert!(parse_iso8601("2024-00-15T10:30:00Z").is_err());
        assert!(parse_iso8601("2024-01-00T10:30:00Z").is_err());
        assert!(parse_iso8601("2024-13-15T10:30:00Z").is_err());
        assert!(parse_iso8601("2024-00-00T10:30:00Z").is_err());
    }

    #[test]
    fn test_fuzz_timestamp_trailing_dot() {
        // A dot with no fractional digits before the timezone is invalid.
        let mut lexer = Lexer::new("2024-01-15T10:30:00.Z");
        let result = lexer.tokenize();
        assert!(result.is_err());
    }

    #[test]
    fn test_fuzz_crash_timestamp_long_fractional_no_overflow() {
        // An absurdly long fraction must not overflow the millisecond math.
        let _ = parse_iso8601("3230-32-33T33016656.6563311111111111111112");

        // Only the first three fractional digits contribute to the millis.
        let result = parse_iso8601("2024-01-15T10:30:00.123456789012345678901234567890Z");
        assert!(result.is_ok());
        assert_eq!(result.unwrap().0 % 1000, 123);
    }

    #[test]
    fn test_fuzz_crash_bc25426e_full_parse_no_panic() {
        let input = "\x00\x00\x00\x00\x00\x00\x00O\x00\x00\x00\x00\x00\x00\x00\x00\x0030-3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x003232,\x00\x00\x001\x00\x00O\x00\x00\x00\x00\x00\x00\x00\x00\x0030-3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x003232,\x00\x00\x00111111112\x00\n\x00\x00\x00\x00\x00\x003,3230-32-33T33016656.6563311111111111111112\x00\n\x00\x00\x00\x00\x00\x003,3230-32-33T33016656.65633111111111113323!:g";
        // Full-pipeline regression: parsing must not panic on this input.
        let _ = crate::TeaLeaf::parse(input);
    }

    #[test]
    fn test_fuzz_crash_multiline_multibyte_whitespace_dedent() {
        // Fuzz regression for the multiline dedent path with multibyte
        // whitespace (U+0085, U+000B); must not panic.
        let input = "*\0\"\"\"\u{0B}J\n\n\n\u{0085}\u{0B}J\n\n\n\n\n\n\n\n\"\"\" \0\n\n\n\n\n\"\"\" \0\0";
        let mut lexer = Lexer::new(input);
        let _ = lexer.tokenize();
    }

    #[test]
    fn test_multiline_string_multibyte_indent() {
        // Each line is indented by one whitespace character: U+0085 is two
        // bytes in UTF-8, U+000B is one.
        let input = "\"\"\"\n\u{0085}A\n\u{0B}B\n\"\"\"";
        let mut lexer = Lexer::new(input);
        let tokens = lexer.tokenize().unwrap();
        match &tokens[0].kind {
            TokenKind::String(s) => {
                assert_eq!(s, "A\nB", "Both lines should be dedented by 1 character");
            }
            other => panic!("Expected String, got {:?}", other),
        }
    }

    #[test]
    fn test_many_unknown_chars_no_stack_overflow() {
        let input: String = std::iter::repeat('\u{07FE}').take(10_000).collect();
        let mut lexer = Lexer::new(&input);
        let tokens = lexer.tokenize().unwrap();
        // Unknown characters are skipped; only Eof remains.
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].kind, TokenKind::Eof));
    }
}