1use crate::compat::{fmt, String, ToString, Vec};
3
/// A location in the tokenizer's input.
///
/// `tokenize` starts counting at line 1, column 1, offset 0. The column
/// advances by one per `char`, while the offset advances by each char's
/// UTF-8 length, so `offset` is a byte index into the input.
///
/// The type is three plain `usize` fields, so it is `Copy` and can be used
/// as a hash key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SourcePos {
    /// Line number of the position.
    pub line: usize,
    /// Column within the line, counted in `char`s.
    pub column: usize,
    /// Offset from the start of the input, counted in UTF-8 bytes.
    pub offset: usize,
}

impl SourcePos {
    /// Creates a position from its line, column and byte offset.
    pub const fn new(line: usize, column: usize, offset: usize) -> Self {
        Self {
            line,
            column,
            offset,
        }
    }
}
21
22#[derive(Debug, Clone, PartialEq)]
24pub struct Token {
25 pub kind: TokenKind,
26 pub pos: SourcePos,
27 pub end_pos: SourcePos,
28}
29
30impl Token {
31 pub fn new(kind: TokenKind, pos: SourcePos, end_pos: SourcePos) -> Self {
32 Self { kind, pos, end_pos }
33 }
34
35 }
45
/// The kinds of token produced by `tokenize`, with their payloads.
///
/// Numeric literals other than `Number` keep their source text rather than a
/// parsed value.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    /// A literal containing `.`, `e` or `E` that parsed as an `f64`.
    Number(f64),
    /// A run of decimal digits with an optional leading sign, kept as text.
    Integer(String),
    /// An integer literal written with an `n` suffix; the suffix is not stored.
    BigInt(String),
    /// `numerator/denominator`, each kept as text.
    Rational(String, String),
    /// Real and imaginary parts of an `i`-suffixed literal whose parts are
    /// both integers. The imaginary part keeps its sign when one was written.
    #[cfg(feature = "complex_numbers")]
    GaussianInt(String, String),
    /// Real and imaginary parts of an `i`-suffixed literal whose parts parse
    /// as floats. The imaginary part keeps its sign when one was written.
    #[cfg(feature = "complex_numbers")]
    Complex(String, String),
    /// Any other run of non-delimiter characters.
    Atom(String),
    /// Contents of a double-quoted string, with escapes already processed.
    String(String),
    /// The words `true` / `false`.
    Boolean(bool),
    /// The word `null`.
    Null,
    /// `[`
    LeftBracket,
    /// `#[`
    ArrayLeftBracket,
    /// `]`
    RightBracket,
    /// `'`
    Quote,
    /// `|`
    Pipe,
}

impl fmt::Display for TokenKind {
    /// Writes the token roughly as it would appear in source text.
    ///
    /// String contents are wrapped in double quotes but are not re-escaped.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            TokenKind::Number(n) => write!(f, "{}", n),
            TokenKind::Integer(text) => write!(f, "{}", text),
            TokenKind::BigInt(digits) => write!(f, "{}n", digits),
            TokenKind::Rational(num, den) => write!(f, "{}/{}", num, den),
            #[cfg(feature = "complex_numbers")]
            TokenKind::GaussianInt(re, im) | TokenKind::Complex(re, im) => {
                // The tokenizer keeps the sign on the imaginary part ("+4",
                // "-4") when the literal had a real part, so only insert a
                // `+` when the stored text carries no sign of its own.
                // Otherwise `3+4i` would print as `3++4i`.
                if im.starts_with('+') || im.starts_with('-') {
                    write!(f, "{}{}i", re, im)
                } else {
                    write!(f, "{}+{}i", re, im)
                }
            }
            TokenKind::Atom(name) => write!(f, "{}", name),
            TokenKind::String(contents) => write!(f, "\"{}\"", contents),
            TokenKind::Boolean(flag) => write!(f, "{}", if *flag { "true" } else { "false" }),
            TokenKind::Null => f.write_str("null"),
            TokenKind::LeftBracket => f.write_str("["),
            TokenKind::ArrayLeftBracket => f.write_str("#["),
            TokenKind::RightBracket => f.write_str("]"),
            TokenKind::Quote => f.write_str("'"),
            TokenKind::Pipe => f.write_str("|"),
        }
    }
}
90
91impl fmt::Display for Token {
92 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
93 write!(f, "{}", self.kind)
94 }
95}
96
97pub fn tokenize(input: &str) -> Result<Vec<Token>, String> {
98 let mut tokens = Vec::new();
99 let mut chars = input.chars().peekable();
100 let mut line = 1;
101 let mut column = 1;
102 let mut offset = 0;
103
104 fn advance_pos(ch: char, line: &mut usize, column: &mut usize, offset: &mut usize) {
106 if ch == '\n' {
107 *line += 1;
108 *column = 1;
109 } else {
110 *column += 1;
111 }
112 *offset += ch.len_utf8();
113 }
114
115 fn classify_atom(s: String) -> TokenKind {
117 if s.ends_with('n') && s.len() > 1 {
119 let num_part = &s[..s.len() - 1];
120 let is_integer = if let Some(stripped) = num_part.strip_prefix('-') {
122 num_part.len() > 1 && stripped.chars().all(|c| c.is_ascii_digit())
123 } else {
124 num_part.chars().all(|c| c.is_ascii_digit())
125 };
126
127 if is_integer {
128 return TokenKind::BigInt(num_part.to_string());
129 }
130 }
131
132 if s.contains('/') {
134 let parts: Vec<&str> = s.split('/').collect();
135 if parts.len() == 2
136 && let (Ok(_), Ok(_)) = (parts[0].parse::<i64>(), parts[1].parse::<i64>())
137 {
138 return TokenKind::Rational(parts[0].to_string(), parts[1].to_string());
139 }
140 }
141
142 #[cfg(feature = "complex_numbers")]
144 if s.ends_with('i') && s.len() > 1 {
145 let num_part = &s[..s.len() - 1];
146 if let Some(op_pos) = num_part.char_indices().skip(1).find(|(_, c)| *c == '+' || *c == '-').map(|(pos, _)| pos) {
148 let real_part = &num_part[..op_pos];
149 let imag_part = &num_part[op_pos..];
150
151 if let (Ok(_), Ok(_)) = (real_part.parse::<i64>(), imag_part.parse::<i64>()) {
153 return TokenKind::GaussianInt(real_part.to_string(), imag_part.to_string());
154 }
155
156 if real_part.parse::<f64>().is_ok() && imag_part.parse::<f64>().is_ok() {
158 return TokenKind::Complex(real_part.to_string(), imag_part.to_string());
159 }
160 } else {
161 if num_part.parse::<i64>().is_ok() {
164 return TokenKind::GaussianInt("0".to_string(), num_part.to_string());
165 }
166 if num_part.parse::<f64>().is_ok() {
168 return TokenKind::Complex("0".to_string(), num_part.to_string());
169 }
170 }
171 }
172
173 TokenKind::Atom(s)
175 }
176
177 while let Some(&ch) = chars.peek() {
178 let start_line = line;
179 let start_column = column;
180 let start_offset = offset;
181
182 match ch {
183 ' ' | '\t' | '\n' | '\r' => {
184 let consumed = chars.next().unwrap();
185 advance_pos(consumed, &mut line, &mut column, &mut offset);
186 }
187
188 '[' => {
189 let consumed = chars.next().unwrap();
190 advance_pos(consumed, &mut line, &mut column, &mut offset);
191 tokens.push(Token::new(
192 TokenKind::LeftBracket,
193 SourcePos::new(start_line, start_column, start_offset),
194 SourcePos::new(line, column, offset),
195 ));
196 }
197
198 '#' => {
199 let consumed = chars.next().unwrap();
200 advance_pos(consumed, &mut line, &mut column, &mut offset);
201
202 match chars.peek() {
203 Some(&'[') => {
204 let consumed_bracket = chars.next().unwrap();
205 advance_pos(consumed_bracket, &mut line, &mut column, &mut offset);
206 tokens.push(Token::new(
207 TokenKind::ArrayLeftBracket,
208 SourcePos::new(start_line, start_column, start_offset),
209 SourcePos::new(line, column, offset),
210 ));
211 }
212 Some(_) => {
213 let mut atom = String::from("#");
214 while let Some(&ch) = chars.peek() {
215 if ch.is_whitespace() || "[]|\'\"\\\\".contains(ch) {
216 break;
217 }
218 atom.push(ch);
219 let consumed = chars.next().unwrap();
220 advance_pos(consumed, &mut line, &mut column, &mut offset);
221 }
222 tokens.push(Token::new(
223 classify_atom(atom),
224 SourcePos::new(start_line, start_column, start_offset),
225 SourcePos::new(line, column, offset),
226 ));
227 }
228 None => {
229 tokens.push(Token::new(
230 TokenKind::Atom("#".to_string()),
231 SourcePos::new(start_line, start_column, start_offset),
232 SourcePos::new(line, column, offset),
233 ));
234 }
235 }
236 }
237
238 ']' => {
239 let consumed = chars.next().unwrap();
240 advance_pos(consumed, &mut line, &mut column, &mut offset);
241 tokens.push(Token::new(
242 TokenKind::RightBracket,
243 SourcePos::new(start_line, start_column, start_offset),
244 SourcePos::new(line, column, offset),
245 ));
246 }
247
248 '\'' => {
249 let consumed = chars.next().unwrap();
250 advance_pos(consumed, &mut line, &mut column, &mut offset);
251 tokens.push(Token::new(
252 TokenKind::Quote,
253 SourcePos::new(start_line, start_column, start_offset),
254 SourcePos::new(line, column, offset),
255 ));
256 }
257
258 '\\' => {
259 let consumed = chars.next().unwrap(); advance_pos(consumed, &mut line, &mut column, &mut offset);
262 for ch in chars.by_ref() {
263 advance_pos(ch, &mut line, &mut column, &mut offset);
264 if ch == '\n' {
265 break;
266 }
267 }
268 }
270
271 '|' => {
272 let consumed = chars.next().unwrap();
273 advance_pos(consumed, &mut line, &mut column, &mut offset);
274 tokens.push(Token::new(
275 TokenKind::Pipe,
276 SourcePos::new(start_line, start_column, start_offset),
277 SourcePos::new(line, column, offset),
278 ));
279 }
280
281 '"' => {
282 let consumed = chars.next().unwrap();
283 advance_pos(consumed, &mut line, &mut column, &mut offset);
284 let mut string = String::new();
285 let mut escaped = false;
286
287 for ch in chars.by_ref() {
288 advance_pos(ch, &mut line, &mut column, &mut offset);
289 if escaped {
290 match ch {
291 'n' => string.push('\n'),
292 't' => string.push('\t'),
293 '\\' => string.push('\\'),
294 '"' => string.push('"'),
295 _ => {
296 string.push('\\');
297 string.push(ch);
298 }
299 }
300 escaped = false;
301 } else if ch == '\\' {
302 escaped = true;
303 } else if ch == '"' {
304 break;
305 } else {
306 string.push(ch);
307 }
308 }
309
310 tokens.push(Token::new(
311 TokenKind::String(string),
312 SourcePos::new(start_line, start_column, start_offset),
313 SourcePos::new(line, column, offset),
314 ));
315 }
316
317 '+' | '-' if chars.clone().nth(1).is_some_and(|c| c.is_ascii_digit()) => {
318 let mut num_str = String::new();
320 num_str.push(ch);
321 let consumed = chars.next().unwrap();
322 advance_pos(consumed, &mut line, &mut column, &mut offset);
323
324 while let Some(&ch) = chars.peek() {
325 if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' {
326 num_str.push(ch);
327 let consumed = chars.next().unwrap();
328 advance_pos(consumed, &mut line, &mut column, &mut offset);
329 if (ch == 'e' || ch == 'E')
331 && chars.peek().is_some_and(|&c| c == '+' || c == '-')
332 {
333 let sign = chars.next().unwrap();
334 num_str.push(sign);
335 advance_pos(sign, &mut line, &mut column, &mut offset);
336 }
337 } else {
338 break;
339 }
340 }
341
342 let has_suffix = chars.peek().is_some_and(|&c| {
344 c == 'n' || c == 'i' || c == '/' || c == '+' || c == '-' });
349
350 if has_suffix {
351 while let Some(&ch) = chars.peek() {
354 if ch.is_whitespace() || "[]|\'\"\\\\".contains(ch) {
355 break;
356 }
357 num_str.push(ch);
358 let consumed = chars.next().unwrap();
359 advance_pos(consumed, &mut line, &mut column, &mut offset);
360 }
361 tokens.push(Token::new(
362 classify_atom(num_str),
363 SourcePos::new(start_line, start_column, start_offset),
364 SourcePos::new(line, column, offset),
365 ));
366 } else {
367 let has_decimal = num_str.contains('.') || num_str.contains('e') || num_str.contains('E');
369
370 if !has_decimal {
371 tokens.push(Token::new(
373 TokenKind::Integer(num_str),
374 SourcePos::new(start_line, start_column, start_offset),
375 SourcePos::new(line, column, offset),
376 ));
377 } else {
378 match num_str.parse::<f64>() {
380 Ok(num) => tokens.push(Token::new(
381 TokenKind::Number(num),
382 SourcePos::new(start_line, start_column, start_offset),
383 SourcePos::new(line, column, offset),
384 )),
385 Err(_) => {
386 while let Some(&ch) = chars.peek() {
389 if ch.is_whitespace() || "[]|\'\"\\\\".contains(ch) {
390 break;
391 }
392 num_str.push(ch);
393 let consumed = chars.next().unwrap();
394 advance_pos(consumed, &mut line, &mut column, &mut offset);
395 }
396 tokens.push(Token::new(
397 TokenKind::Atom(num_str),
398 SourcePos::new(start_line, start_column, start_offset),
399 SourcePos::new(line, column, offset),
400 ));
401 }
402 }
403 }
404 }
405 }
406
407 '0'..='9' => {
408 let mut num_str = String::new();
409
410 while let Some(&ch) = chars.peek() {
411 if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' {
412 num_str.push(ch);
413 let consumed = chars.next().unwrap();
414 advance_pos(consumed, &mut line, &mut column, &mut offset);
415 if (ch == 'e' || ch == 'E')
417 && chars.peek().is_some_and(|&c| c == '+' || c == '-')
418 {
419 let sign = chars.next().unwrap();
420 num_str.push(sign);
421 advance_pos(sign, &mut line, &mut column, &mut offset);
422 }
423 } else {
424 break;
425 }
426 }
427
428 let has_suffix = chars.peek().is_some_and(|&c| {
432 c == 'n' || c == 'i' || c == '/' || c == '+' || c == '-' });
437
438 if has_suffix {
439 while let Some(&ch) = chars.peek() {
442 if ch.is_whitespace() || "[]|\'\"\\\\".contains(ch) {
443 break;
444 }
445 num_str.push(ch);
446 let consumed = chars.next().unwrap();
447 advance_pos(consumed, &mut line, &mut column, &mut offset);
448 }
449 tokens.push(Token::new(
450 classify_atom(num_str),
451 SourcePos::new(start_line, start_column, start_offset),
452 SourcePos::new(line, column, offset),
453 ));
454 } else {
455 let has_decimal = num_str.contains('.') || num_str.contains('e') || num_str.contains('E');
458
459 if !has_decimal {
460 tokens.push(Token::new(
462 TokenKind::Integer(num_str),
463 SourcePos::new(start_line, start_column, start_offset),
464 SourcePos::new(line, column, offset),
465 ));
466 } else {
467 match num_str.parse::<f64>() {
469 Ok(num) => tokens.push(Token::new(
470 TokenKind::Number(num),
471 SourcePos::new(start_line, start_column, start_offset),
472 SourcePos::new(line, column, offset),
473 )),
474 Err(_) => {
475 while let Some(&ch) = chars.peek() {
478 if ch.is_whitespace() || "[]|\'\"\\\\".contains(ch) {
479 break;
480 }
481 num_str.push(ch);
482 let consumed = chars.next().unwrap();
483 advance_pos(consumed, &mut line, &mut column, &mut offset);
484 }
485 tokens.push(Token::new(
486 TokenKind::Atom(num_str),
487 SourcePos::new(start_line, start_column, start_offset),
488 SourcePos::new(line, column, offset),
489 ));
490 }
491 }
492 }
493 }
494 }
495
496 _ => {
497 let mut atom = String::new();
498
499 while let Some(&ch) = chars.peek() {
500 if ch.is_whitespace() || "[]|\\'\"\\\\".contains(ch) {
501 break;
502 }
503 atom.push(ch);
504 let consumed = chars.next().unwrap();
505 advance_pos(consumed, &mut line, &mut column, &mut offset);
506 }
507
508 if !atom.is_empty() {
509 let token_kind = match atom.as_str() {
512 "true" => TokenKind::Boolean(true),
513 "false" => TokenKind::Boolean(false),
514 "null" => TokenKind::Null,
515 _ => TokenKind::Atom(atom),
516 };
517 tokens.push(Token::new(
518 token_kind,
519 SourcePos::new(start_line, start_column, start_offset),
520 SourcePos::new(line, column, offset),
521 ));
522 }
523 }
524 }
525 }
526
527 Ok(tokens)
528}
529
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `src` and returns only the token kinds, in order.
    fn kinds(src: &str) -> Vec<TokenKind> {
        tokenize(src)
            .unwrap()
            .into_iter()
            .map(|token| token.kind)
            .collect()
    }

    fn integer(text: &str) -> TokenKind {
        TokenKind::Integer(text.to_string())
    }

    fn atom(text: &str) -> TokenKind {
        TokenKind::Atom(text.to_string())
    }

    #[test]
    fn test_tokenize_numbers() {
        let tokens = tokenize("42").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, integer("42"));
        assert_eq!(tokens[0].pos.line, 1);
        assert_eq!(tokens[0].pos.column, 1);

        // 2.5 is exactly representable, so comparing for equality is sound,
        // and it avoids Clippy's `approx_constant` lint that `3.14` triggers.
        assert_eq!(kinds("2.5"), [TokenKind::Number(2.5)]);

        assert_eq!(kinds("-17"), [integer("-17")]);
    }

    #[test]
    fn test_tokenize_atoms() {
        assert_eq!(kinds("hello"), [atom("hello")]);
        assert_eq!(kinds("+"), [atom("+")]);
    }

    #[test]
    fn test_tokenize_strings() {
        assert_eq!(
            kinds("\"hello world\""),
            [TokenKind::String("hello world".to_string())]
        );
    }

    #[test]
    fn test_tokenize_brackets() {
        assert_eq!(
            kinds("[1 2 3]"),
            [
                TokenKind::LeftBracket,
                integer("1"),
                integer("2"),
                integer("3"),
                TokenKind::RightBracket,
            ]
        );
    }

    #[test]
    fn test_tokenize_array_literals() {
        assert_eq!(
            kinds("#[1 2]"),
            [
                TokenKind::ArrayLeftBracket,
                integer("1"),
                integer("2"),
                TokenKind::RightBracket,
            ]
        );
    }

    #[test]
    fn test_tokenize_position_tracking() {
        let tokens = tokenize("hello\nworld").unwrap();
        assert_eq!(tokens.len(), 2);

        assert_eq!(tokens[0].kind, atom("hello"));
        assert_eq!((tokens[0].pos.line, tokens[0].pos.column), (1, 1));

        // A newline resets the column and bumps the line.
        assert_eq!(tokens[1].kind, atom("world"));
        assert_eq!((tokens[1].pos.line, tokens[1].pos.column), (2, 1));
    }

    #[test]
    fn test_tokenize_booleans_and_null() {
        assert_eq!(
            kinds("true false null"),
            [
                TokenKind::Boolean(true),
                TokenKind::Boolean(false),
                TokenKind::Null,
            ]
        );
    }

    #[test]
    fn test_simple_token_helper() {
        let token = Token::new(
            TokenKind::Number(42.0),
            SourcePos::new(1, 1, 0),
            SourcePos::new(1, 1, 0),
        );
        assert_eq!(token.kind, TokenKind::Number(42.0));
        assert_eq!(token.pos.line, 1);
        assert_eq!(token.pos.column, 1);
    }
}