1use logicaffeine_base::Interner;
38use crate::lexicon::{self, Aspect, Definiteness, Lexicon, Time};
39use crate::token::{BlockType, CalendarUnit, FocusKind, MeasureKind, Span, Token, TokenType};
40
/// A line-level token produced by [`LineLexer`]: indentation structure
/// (off-side rule) plus the raw content of each non-blank line.
#[derive(Debug, Clone, PartialEq)]
pub enum LineToken {
    /// Indentation increased relative to the enclosing level.
    Indent,
    /// One indentation level was closed.
    Dedent,
    /// End of a logical line.
    Newline,
    /// A non-blank line: its right-trimmed text and byte range in the source.
    Content { text: String, start: usize, end: usize },
}
59
/// Splits source text into [`LineToken`]s, tracking indentation levels with a
/// stack so nested blocks produce balanced Indent/Dedent pairs.
pub struct LineLexer<'a> {
    source: &'a str,
    /// Byte view of `source` for cheap ASCII scanning.
    bytes: &'a [u8],
    /// Active indentation widths; the base level 0 is always at the bottom.
    indent_stack: Vec<usize>,
    /// Dedent tokens still owed to the caller.
    pending_dedents: usize,
    /// Current byte offset into `source`.
    position: usize,
    /// True while a Content token is buffered but not yet emitted.
    has_pending_content: bool,
    // Byte range and text of the buffered Content token.
    pending_content_start: usize,
    pending_content_end: usize,
    pending_content_text: String,
    /// Set once the end of input has been reached.
    finished_lines: bool,
    /// Set when the most recent line opened a new indentation level.
    emitted_indent: bool,
    /// Byte ranges of escape-block bodies, treated as opaque spans.
    escape_body_ranges: Vec<(usize, usize)>,
}
80
81impl<'a> LineLexer<'a> {
82 pub fn new(source: &'a str) -> Self {
83 Self {
84 source,
85 bytes: source.as_bytes(),
86 indent_stack: vec![0],
87 pending_dedents: 0,
88 position: 0,
89 has_pending_content: false,
90 pending_content_start: 0,
91 pending_content_end: 0,
92 pending_content_text: String::new(),
93 finished_lines: false,
94 emitted_indent: false,
95 escape_body_ranges: Vec::new(),
96 }
97 }
98
99 pub fn with_escape_ranges(source: &'a str, escape_body_ranges: Vec<(usize, usize)>) -> Self {
100 Self {
101 source,
102 bytes: source.as_bytes(),
103 indent_stack: vec![0],
104 pending_dedents: 0,
105 position: 0,
106 has_pending_content: false,
107 pending_content_start: 0,
108 pending_content_end: 0,
109 pending_content_text: String::new(),
110 finished_lines: false,
111 emitted_indent: false,
112 escape_body_ranges,
113 }
114 }
115
116 fn is_in_escape_body(&self, pos: usize) -> bool {
118 self.escape_body_ranges.iter().any(|(start, end)| pos >= *start && pos < *end)
119 }
120
121 fn measure_indent(&self, line_start: usize) -> (usize, usize) {
124 let mut indent = 0;
125 let mut pos = line_start;
126
127 while pos < self.bytes.len() {
128 match self.bytes[pos] {
129 b' ' => {
130 indent += 1;
131 pos += 1;
132 }
133 b'\t' => {
134 indent += 4; pos += 1;
136 }
137 _ => break,
138 }
139 }
140
141 (indent, pos)
142 }
143
144 fn read_line_content(&self, content_start: usize) -> (String, usize, usize, usize) {
147 let mut pos = content_start;
148
149 while pos < self.bytes.len() && self.bytes[pos] != b'\n' {
151 pos += 1;
152 }
153
154 let content_end = pos;
155 let text = self.source[content_start..content_end].trim_end().to_string();
156
157 let next_line_start = if pos < self.bytes.len() && self.bytes[pos] == b'\n' {
159 pos + 1
160 } else {
161 pos
162 };
163
164 (text, content_start, content_end, next_line_start)
165 }
166
167 fn is_blank_line(&self, line_start: usize) -> bool {
169 let mut pos = line_start;
170 while pos < self.bytes.len() {
171 match self.bytes[pos] {
172 b' ' | b'\t' => pos += 1,
173 b'\n' => return true,
174 _ => return false,
175 }
176 }
177 true }
179
180 fn process_next_line(&mut self) -> bool {
183 while self.position < self.bytes.len() && self.is_blank_line(self.position) {
185 while self.position < self.bytes.len() && self.bytes[self.position] != b'\n' {
187 self.position += 1;
188 }
189 if self.position < self.bytes.len() {
190 self.position += 1; }
192 }
193
194 if self.position >= self.bytes.len() {
196 self.finished_lines = true;
197 if self.indent_stack.len() > 1 {
199 self.pending_dedents = self.indent_stack.len() - 1;
200 self.indent_stack.truncate(1);
201 }
202 return self.pending_dedents > 0;
203 }
204
205 let (line_indent, content_start) = self.measure_indent(self.position);
207
208 let (text, start, end, next_pos) = self.read_line_content(content_start);
210
211 if text.is_empty() {
213 self.position = next_pos;
214 return self.process_next_line();
215 }
216
217 let current_indent = *self.indent_stack.last().unwrap();
218
219 if line_indent > current_indent {
221 self.indent_stack.push(line_indent);
223 self.emitted_indent = true;
224 self.has_pending_content = true;
226 self.pending_content_text = text;
227 self.pending_content_start = start;
228 self.pending_content_end = end;
229 self.position = next_pos;
230 return true;
232 } else if line_indent < current_indent {
233 while self.indent_stack.len() > 1 {
235 let top = *self.indent_stack.last().unwrap();
236 if line_indent < top {
237 self.indent_stack.pop();
238 self.pending_dedents += 1;
239 } else {
240 break;
241 }
242 }
243 self.has_pending_content = true;
245 self.pending_content_text = text;
246 self.pending_content_start = start;
247 self.pending_content_end = end;
248 self.position = next_pos;
249 return true;
250 } else {
251 self.has_pending_content = true;
253 self.pending_content_text = text;
254 self.pending_content_start = start;
255 self.pending_content_end = end;
256 self.position = next_pos;
257 return true;
258 }
259 }
260}
261
impl<'a> Iterator for LineLexer<'a> {
    type Item = LineToken;

    /// Emits tokens in priority order: owed Dedents first, then any buffered
    /// Content, then whatever structure the next source line produces.
    fn next(&mut self) -> Option<LineToken> {
        // Drain dedents queued by a previous call before anything else.
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        // Content buffered behind an Indent/Dedent comes next.
        if self.has_pending_content {
            self.has_pending_content = false;
            let text = std::mem::take(&mut self.pending_content_text);
            let start = self.pending_content_start;
            let end = self.pending_content_end;
            return Some(LineToken::Content { text, start, end });
        }

        if !self.finished_lines {
            // Stack depth before reading the line tells us whether it indented.
            let had_indent = self.indent_stack.len();
            if self.process_next_line() {
                if self.indent_stack.len() > had_indent {
                    // The line opened a new level: Indent precedes its Content.
                    return Some(LineToken::Indent);
                }
                if self.pending_dedents > 0 {
                    self.pending_dedents -= 1;
                    return Some(LineToken::Dedent);
                }
                if self.has_pending_content {
                    self.has_pending_content = false;
                    let text = std::mem::take(&mut self.pending_content_text);
                    let start = self.pending_content_start;
                    let end = self.pending_content_end;
                    return Some(LineToken::Content { text, start, end });
                }
            } else if self.pending_dedents > 0 {
                // End of input can still owe dedents for open levels.
                self.pending_dedents -= 1;
                return Some(LineToken::Dedent);
            }
        }

        // Final drain for dedents queued after the last line.
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        None
    }
}
322
/// Which mode the word-level lexer is currently classifying in.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LexerMode {
    /// Declarative mode (the default).
    #[default]
    Declarative,
    /// Imperative mode.
    Imperative,
}
333
/// Word-level tokenizer: splits the input into [`WordItem`]s up front, then
/// classifies them into [`Token`]s and threads in indentation structure.
pub struct Lexer<'a> {
    /// Pre-split words with their byte spans and trailing punctuation.
    words: Vec<WordItem>,
    /// Index of the next word to consume.
    pos: usize,
    lexicon: Lexicon,
    /// Interner used for all token lexemes.
    interner: &'a mut Interner,
    /// Total byte length of the input (used for the EOF token's span).
    input_len: usize,
    /// True while inside a `let` construct; cleared at sentence-ending '.'/'?'.
    in_let_context: bool,
    mode: LexerMode,
    /// Owned copy of the input, re-scanned for indentation analysis.
    source: String,
    /// Byte ranges of escape-block bodies; indentation events inside them are dropped.
    escape_body_ranges: Vec<(usize, usize)>,
}
346
/// One word produced by the splitting pass, plus bookkeeping for spans.
struct WordItem {
    /// The word text (empty when the item carries only punctuation).
    word: String,
    /// Punctuation character immediately following the word, if any.
    trailing_punct: Option<char>,
    /// Byte offset where the word starts in the input.
    start: usize,
    /// Byte offset just past the word.
    end: usize,
    /// Byte offset of `trailing_punct` in the input, when present.
    punct_pos: Option<usize>,
}
354
355impl<'a> Lexer<'a> {
356 pub fn new(input: &str, interner: &'a mut Interner) -> Self {
380 let escape_ranges = Self::find_escape_block_ranges(input);
381 let escape_body_ranges: Vec<(usize, usize)> = escape_ranges.iter()
382 .map(|(_, end, content_start, _)| (*content_start, *end))
383 .collect();
384 let words = Self::split_into_words(input, &escape_ranges);
385 let input_len = input.len();
386
387 Lexer {
388 words,
389 pos: 0,
390 lexicon: Lexicon::new(),
391 interner,
392 input_len,
393 in_let_context: false,
394 mode: LexerMode::Declarative,
395 source: input.to_string(),
396 escape_body_ranges,
397 }
398 }
399
    /// Scans for `escape to <target>:` headers and returns one entry per block
    /// found: `(body_byte_start, body_byte_end, content_start, raw_code)`,
    /// where `raw_code` is the body dedented to its first line's indentation.
    fn find_escape_block_ranges(source: &str) -> Vec<(usize, usize, usize, String)> {
        let mut ranges = Vec::new();
        let lines: Vec<&str> = source.split('\n').collect();
        // Byte offset of the start of each line (each line costs len + '\n').
        let mut line_starts: Vec<usize> = Vec::with_capacity(lines.len());
        let mut pos = 0;
        for line in &lines {
            line_starts.push(pos);
            pos += line.len() + 1;
        }

        let mut i = 0;
        while i < lines.len() {
            let trimmed = lines[i].trim();
            let lower = trimmed.to_lowercase();
            // Header forms: exactly "escape to rust:", a line ending with
            // " escape to rust:", or any "escape to <target>:" line.
            if lower == "escape to rust:" ||
                lower.ends_with(" escape to rust:") ||
                (lower.starts_with("escape to ") && lower.ends_with(':'))
            {
                let header_indent = Self::measure_indent_static(lines[i]);
                i += 1;

                // Skip blank lines between the header and the body.
                let mut body_start_line = i;
                while body_start_line < lines.len() && lines[body_start_line].trim().is_empty() {
                    body_start_line += 1;
                }

                if body_start_line >= lines.len() {
                    continue;
                }

                // A real body must be indented deeper than its header.
                let base_indent = Self::measure_indent_static(lines[body_start_line]);
                if base_indent <= header_indent {
                    continue;
                }

                let body_byte_start = line_starts[body_start_line];
                let mut body_end_line = body_start_line;
                let mut code_lines: Vec<String> = Vec::new();

                // Collect lines until indentation falls below the body's base.
                let mut j = body_start_line;
                while j < lines.len() {
                    let line = lines[j];
                    if line.trim().is_empty() {
                        // Blank lines inside the body are kept (normalized empty).
                        code_lines.push(String::new());
                        body_end_line = j;
                        j += 1;
                        continue;
                    }
                    let line_indent = Self::measure_indent_static(line);
                    if line_indent < base_indent {
                        break;
                    }
                    let stripped = Self::strip_indent(line, base_indent);
                    code_lines.push(stripped);
                    body_end_line = j;
                    j += 1;
                }

                // Drop trailing blank lines from the collected body.
                while code_lines.last().map_or(false, |l| l.is_empty()) {
                    code_lines.pop();
                }

                if !code_lines.is_empty() {
                    let body_byte_end = if body_end_line + 1 < lines.len() {
                        line_starts[body_end_line + 1]
                    } else {
                        source.len()
                    };
                    // Content starts after the first body line's indentation.
                    let content_start = body_byte_start + Self::leading_whitespace_bytes(lines[body_start_line]);
                    let raw_code = code_lines.join("\n");
                    ranges.push((body_byte_start, body_byte_end, content_start, raw_code));
                }

                i = j;
            } else {
                i += 1;
            }
        }

        ranges
    }
497
498 fn leading_whitespace_bytes(line: &str) -> usize {
500 let mut count = 0;
501 for c in line.chars() {
502 match c {
503 ' ' | '\t' => count += c.len_utf8(),
504 _ => break,
505 }
506 }
507 count
508 }
509
510 fn measure_indent_static(line: &str) -> usize {
512 let mut indent = 0;
513 for c in line.chars() {
514 match c {
515 ' ' => indent += 1,
516 '\t' => indent += 4,
517 _ => break,
518 }
519 }
520 indent
521 }
522
523 fn strip_indent(line: &str, count: usize) -> String {
525 let mut stripped = 0;
526 let mut byte_pos = 0;
527 for (i, c) in line.char_indices() {
528 if stripped >= count {
529 byte_pos = i;
530 break;
531 }
532 match c {
533 ' ' => { stripped += 1; byte_pos = i + 1; }
534 '\t' => { stripped += 4; byte_pos = i + 1; }
535 _ => { byte_pos = i; break; }
536 }
537 }
538 if stripped < count {
539 byte_pos = line.len();
540 }
541 line[byte_pos..].to_string()
542 }
543
544 fn split_into_words(input: &str, escape_ranges: &[(usize, usize, usize, String)]) -> Vec<WordItem> {
545 let mut items = Vec::new();
546 let mut current_word = String::new();
547 let mut word_start = 0;
548 let chars: Vec<char> = input.chars().collect();
549 let mut char_idx = 0;
550 let mut skip_count = 0;
551 let mut skip_to_byte: Option<usize> = None;
553
554 for (i, c) in input.char_indices() {
555 if skip_count > 0 {
556 skip_count -= 1;
557 char_idx += 1;
558 continue;
559 }
560 if let Some(end) = skip_to_byte {
562 if i < end {
563 char_idx += 1;
564 continue;
565 }
566 skip_to_byte = None;
567 word_start = i;
568 }
569 if let Some((_, end, content_start, raw_code)) = escape_ranges.iter().find(|(s, _, _, _)| i == *s) {
571 if !current_word.is_empty() {
573 items.push(WordItem {
574 word: std::mem::take(&mut current_word),
575 trailing_punct: None,
576 start: word_start,
577 end: i,
578 punct_pos: None,
579 });
580 }
581 items.push(WordItem {
584 word: format!("\x00ESC:{}", raw_code),
585 trailing_punct: None,
586 start: *content_start,
587 end: *end,
588 punct_pos: None,
589 });
590 skip_to_byte = Some(*end);
591 word_start = *end;
592 char_idx += 1;
593 continue;
594 }
595 let next_pos = i + c.len_utf8();
596 match c {
597 ' ' | '\t' | '\n' | '\r' => {
598 if !current_word.is_empty() {
599 items.push(WordItem {
600 word: std::mem::take(&mut current_word),
601 trailing_punct: None,
602 start: word_start,
603 end: i,
604 punct_pos: None,
605 });
606 }
607 word_start = next_pos;
608 }
609 '.' => {
610 let prev_is_digit = !current_word.is_empty()
612 && current_word.chars().last().map_or(false, |ch| ch.is_ascii_digit());
613 let next_is_digit = char_idx + 1 < chars.len()
614 && chars[char_idx + 1].is_ascii_digit();
615
616 if prev_is_digit && next_is_digit {
617 current_word.push(c);
619 } else {
620 if !current_word.is_empty() {
622 items.push(WordItem {
623 word: std::mem::take(&mut current_word),
624 trailing_punct: Some(c),
625 start: word_start,
626 end: i,
627 punct_pos: Some(i),
628 });
629 } else {
630 items.push(WordItem {
631 word: String::new(),
632 trailing_punct: Some(c),
633 start: i,
634 end: next_pos,
635 punct_pos: Some(i),
636 });
637 }
638 word_start = next_pos;
639 }
640 }
641 '#' => {
642 if char_idx + 1 < chars.len() && chars[char_idx + 1] == '#' {
644 if !current_word.is_empty() {
647 items.push(WordItem {
648 word: std::mem::take(&mut current_word),
649 trailing_punct: None,
650 start: word_start,
651 end: i,
652 punct_pos: None,
653 });
654 }
655 let header_start = i;
657 let mut j = char_idx + 2;
658 while j < chars.len() && (chars[j] == ' ' || chars[j] == '\t') {
659 j += 1;
660 }
661 let mut block_word = String::from("##");
663 while j < chars.len() && chars[j].is_alphabetic() {
664 block_word.push(chars[j]);
665 j += 1;
666 }
667 if block_word.len() > 2 {
668 items.push(WordItem {
669 word: block_word,
670 trailing_punct: None,
671 start: header_start,
672 end: header_start + (j - char_idx),
673 punct_pos: None,
674 });
675 }
676 skip_count = j - char_idx - 1;
677 word_start = header_start + (j - char_idx);
678 } else {
679 let mut look_ahead = char_idx + 1;
683 while look_ahead < chars.len() && chars[look_ahead] != '\n' {
684 skip_count += 1;
685 look_ahead += 1;
686 }
687 if !current_word.is_empty() {
688 items.push(WordItem {
689 word: std::mem::take(&mut current_word),
690 trailing_punct: None,
691 start: word_start,
692 end: i,
693 punct_pos: None,
694 });
695 }
696 word_start = look_ahead + 1; }
698 }
699 '"' => {
701 if !current_word.is_empty() {
703 items.push(WordItem {
704 word: std::mem::take(&mut current_word),
705 trailing_punct: None,
706 start: word_start,
707 end: i,
708 punct_pos: None,
709 });
710 }
711
712 if char_idx + 2 < chars.len() && chars[char_idx + 1] == '"' && chars[char_idx + 2] == '"' {
714 let string_start = i;
715 let mut j = char_idx + 3; if j < chars.len() && chars[j] == '\n' {
718 j += 1;
719 }
720 let mut raw_content = String::new();
721 while j < chars.len() {
723 if j + 2 < chars.len() && chars[j] == '"' && chars[j + 1] == '"' && chars[j + 2] == '"' {
724 break;
725 }
726 raw_content.push(chars[j]);
727 j += 1;
728 }
729 if raw_content.ends_with('\n') {
731 raw_content.pop();
732 }
733 let dedented = Self::dedent_triple_quote(&raw_content);
735 let end_pos = if j + 2 < chars.len() { j + 3 } else { chars.len() };
736 items.push(WordItem {
737 word: format!("\x00STR:{}", dedented),
738 trailing_punct: None,
739 start: string_start,
740 end: end_pos,
741 punct_pos: None,
742 });
743 if j + 2 < chars.len() {
745 skip_count = (j + 2) - char_idx;
746 } else {
747 skip_count = chars.len() - 1 - char_idx;
748 }
749 word_start = end_pos;
750 } else {
751 let string_start = i;
753 let mut j = char_idx + 1;
754 let mut string_content = String::new();
755 while j < chars.len() && chars[j] != '"' {
756 if chars[j] == '\\' && j + 1 < chars.len() {
757 j += 1;
759 if j < chars.len() {
760 string_content.push(chars[j]);
761 }
762 } else {
763 string_content.push(chars[j]);
764 }
765 j += 1;
766 }
767
768 items.push(WordItem {
771 word: format!("\x00STR:{}", string_content),
772 trailing_punct: None,
773 start: string_start,
774 end: if j < chars.len() { j + 1 } else { j },
775 punct_pos: None,
776 });
777
778 if j < chars.len() {
780 skip_count = j - char_idx;
781 } else {
782 skip_count = j - char_idx - 1;
783 }
784 word_start = if j < chars.len() { j + 1 } else { j };
785 }
786 }
787 '`' => {
789 if !current_word.is_empty() {
791 items.push(WordItem {
792 word: std::mem::take(&mut current_word),
793 trailing_punct: None,
794 start: word_start,
795 end: i,
796 punct_pos: None,
797 });
798 }
799
800 let char_start = i;
802 let mut j = char_idx + 1;
803 let mut char_content = String::new();
804
805 if j < chars.len() {
806 if chars[j] == '\\' && j + 1 < chars.len() {
807 j += 1;
809 let escaped_char = match chars[j] {
810 'n' => '\n',
811 't' => '\t',
812 'r' => '\r',
813 '\\' => '\\',
814 '`' => '`',
815 '0' => '\0',
816 c => c,
817 };
818 char_content.push(escaped_char);
819 j += 1;
820 } else if chars[j] != '`' {
821 char_content.push(chars[j]);
823 j += 1;
824 }
825 }
826
827 if j < chars.len() && chars[j] == '`' {
829 j += 1; }
831
832 items.push(WordItem {
834 word: format!("\x00CHAR:{}", char_content),
835 trailing_punct: None,
836 start: char_start,
837 end: if j <= chars.len() { char_start + (j - char_idx) } else { char_start + 1 },
838 punct_pos: None,
839 });
840
841 if j > char_idx + 1 {
842 skip_count = j - char_idx - 1;
843 }
844 word_start = char_start + (j - char_idx);
845 }
846 '-' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '>' => {
848 if !current_word.is_empty() {
850 items.push(WordItem {
851 word: std::mem::take(&mut current_word),
852 trailing_punct: None,
853 start: word_start,
854 end: i,
855 punct_pos: None,
856 });
857 }
858 items.push(WordItem {
860 word: "->".to_string(),
861 trailing_punct: None,
862 start: i,
863 end: i + 2,
864 punct_pos: None,
865 });
866 skip_count = 1; word_start = i + 2;
868 }
869 '<' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
871 if !current_word.is_empty() {
872 items.push(WordItem {
873 word: std::mem::take(&mut current_word),
874 trailing_punct: None,
875 start: word_start,
876 end: i,
877 punct_pos: None,
878 });
879 }
880 items.push(WordItem {
881 word: "<=".to_string(),
882 trailing_punct: None,
883 start: i,
884 end: i + 2,
885 punct_pos: None,
886 });
887 skip_count = 1;
888 word_start = i + 2;
889 }
890 '>' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
892 if !current_word.is_empty() {
893 items.push(WordItem {
894 word: std::mem::take(&mut current_word),
895 trailing_punct: None,
896 start: word_start,
897 end: i,
898 punct_pos: None,
899 });
900 }
901 items.push(WordItem {
902 word: ">=".to_string(),
903 trailing_punct: None,
904 start: i,
905 end: i + 2,
906 punct_pos: None,
907 });
908 skip_count = 1;
909 word_start = i + 2;
910 }
911 '=' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
913 if !current_word.is_empty() {
914 items.push(WordItem {
915 word: std::mem::take(&mut current_word),
916 trailing_punct: None,
917 start: word_start,
918 end: i,
919 punct_pos: None,
920 });
921 }
922 items.push(WordItem {
923 word: "==".to_string(),
924 trailing_punct: None,
925 start: i,
926 end: i + 2,
927 punct_pos: None,
928 });
929 skip_count = 1;
930 word_start = i + 2;
931 }
932 '!' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
934 if !current_word.is_empty() {
935 items.push(WordItem {
936 word: std::mem::take(&mut current_word),
937 trailing_punct: None,
938 start: word_start,
939 end: i,
940 punct_pos: None,
941 });
942 }
943 items.push(WordItem {
944 word: "!=".to_string(),
945 trailing_punct: None,
946 start: i,
947 end: i + 2,
948 punct_pos: None,
949 });
950 skip_count = 1;
951 word_start = i + 2;
952 }
953 '-' if Self::is_date_hyphen(¤t_word, &chars, char_idx) => {
955 current_word.push(c);
957 }
958 ':' if Self::is_time_colon(¤t_word, &chars, char_idx) => {
960 current_word.push(c);
962 }
963 '+' | '-' if Self::is_exponent_sign(¤t_word, &chars, char_idx) => {
965 current_word.push(c);
966 }
967 '(' | ')' | '[' | ']' | ',' | '?' | '!' | ':' | '+' | '-' | '*' | '/' | '%' | '<' | '>' | '=' => {
968 if !current_word.is_empty() {
969 items.push(WordItem {
970 word: std::mem::take(&mut current_word),
971 trailing_punct: Some(c),
972 start: word_start,
973 end: i,
974 punct_pos: Some(i),
975 });
976 } else {
977 items.push(WordItem {
978 word: String::new(),
979 trailing_punct: Some(c),
980 start: i,
981 end: next_pos,
982 punct_pos: Some(i),
983 });
984 }
985 word_start = next_pos;
986 }
987 '\'' => {
988 let remaining: String = chars[char_idx + 1..].iter().collect();
990 let remaining_lower = remaining.to_lowercase();
991
992 if remaining_lower.starts_with("t ") || remaining_lower.starts_with("t.") ||
993 remaining_lower.starts_with("t,") || remaining_lower == "t" ||
994 (char_idx + 1 < chars.len() && chars[char_idx + 1] == 't' &&
995 (char_idx + 2 >= chars.len() || !chars[char_idx + 2].is_alphabetic())) {
996 let word_lower = current_word.to_lowercase();
998 if word_lower == "don" || word_lower == "doesn" || word_lower == "didn" {
999 let base = if word_lower == "don" { "do" }
1001 else if word_lower == "doesn" { "does" }
1002 else { "did" };
1003 items.push(WordItem {
1004 word: base.to_string(),
1005 trailing_punct: None,
1006 start: word_start,
1007 end: i,
1008 punct_pos: None,
1009 });
1010 items.push(WordItem {
1011 word: "not".to_string(),
1012 trailing_punct: None,
1013 start: i,
1014 end: i + 2,
1015 punct_pos: None,
1016 });
1017 current_word.clear();
1018 word_start = next_pos + 1;
1019 skip_count = 1;
1020 } else if word_lower == "won" {
1021 items.push(WordItem {
1023 word: "will".to_string(),
1024 trailing_punct: None,
1025 start: word_start,
1026 end: i,
1027 punct_pos: None,
1028 });
1029 items.push(WordItem {
1030 word: "not".to_string(),
1031 trailing_punct: None,
1032 start: i,
1033 end: i + 2,
1034 punct_pos: None,
1035 });
1036 current_word.clear();
1037 word_start = next_pos + 1;
1038 skip_count = 1;
1039 } else if word_lower == "can" {
1040 items.push(WordItem {
1042 word: "cannot".to_string(),
1043 trailing_punct: None,
1044 start: word_start,
1045 end: i + 2,
1046 punct_pos: None,
1047 });
1048 current_word.clear();
1049 word_start = next_pos + 1;
1050 skip_count = 1;
1051 } else {
1052 if !current_word.is_empty() {
1054 items.push(WordItem {
1055 word: std::mem::take(&mut current_word),
1056 trailing_punct: Some('\''),
1057 start: word_start,
1058 end: i,
1059 punct_pos: Some(i),
1060 });
1061 }
1062 word_start = next_pos;
1063 }
1064 } else {
1065 if !current_word.is_empty() {
1067 items.push(WordItem {
1068 word: std::mem::take(&mut current_word),
1069 trailing_punct: Some('\''),
1070 start: word_start,
1071 end: i,
1072 punct_pos: Some(i),
1073 });
1074 }
1075 word_start = next_pos;
1076 }
1077 }
1078 c if c.is_alphabetic() || c.is_ascii_digit() || (c == '.' && !current_word.is_empty() && current_word.chars().all(|ch| ch.is_ascii_digit())) || c == '_' => {
1079 if current_word.is_empty() {
1080 word_start = i;
1081 }
1082 current_word.push(c);
1083 }
1084 _ => {
1085 word_start = next_pos;
1086 }
1087 }
1088 char_idx += 1;
1089 }
1090
1091 if !current_word.is_empty() {
1092 items.push(WordItem {
1093 word: current_word,
1094 trailing_punct: None,
1095 start: word_start,
1096 end: input.len(),
1097 punct_pos: None,
1098 });
1099 }
1100
1101 items
1102 }
1103
    /// Peeks at the word `offset` positions ahead of the cursor, if any.
    fn peek_word(&self, offset: usize) -> Option<&str> {
        self.words.get(self.pos + offset).map(|w| w.word.as_str())
    }
1107
1108 fn peek_sequence(&self, expected: &[&str]) -> bool {
1109 for (i, &exp) in expected.iter().enumerate() {
1110 match self.peek_word(i + 1) {
1111 Some(w) if w.to_lowercase() == exp => continue,
1112 _ => return false,
1113 }
1114 }
1115 true
1116 }
1117
    /// Advances the cursor past `count` words.
    fn consume_words(&mut self, count: usize) {
        self.pos += count;
    }
1121
    /// Converts the pre-split words into [`Token`]s, appends an EOF token, and
    /// then threads in Indent/Dedent structure via `insert_indentation_tokens`.
    pub fn tokenize(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();

        while self.pos < self.words.len() {
            let item = &self.words[self.pos];
            let word = item.word.clone();
            let trailing_punct = item.trailing_punct;
            let word_start = item.start;
            let word_end = item.end;
            let punct_pos = item.punct_pos;

            // Punctuation-only item (no word text).
            if word.is_empty() {
                if let Some(punct) = trailing_punct {
                    let kind = match punct {
                        '(' => TokenType::LParen,
                        ')' => TokenType::RParen,
                        '[' => TokenType::LBracket,
                        ']' => TokenType::RBracket,
                        ',' => TokenType::Comma,
                        ':' => TokenType::Colon,
                        '.' | '?' => {
                            // Sentence end leaves any `let` context.
                            self.in_let_context = false;
                            TokenType::Period
                        }
                        '!' => TokenType::Exclamation,
                        '+' => TokenType::Plus,
                        '-' => TokenType::Minus,
                        '*' => TokenType::Star,
                        '/' => TokenType::Slash,
                        '%' => TokenType::Percent,
                        '<' => TokenType::Lt,
                        '>' => TokenType::Gt,
                        '=' => TokenType::Assign,
                        // Unknown punctuation is silently skipped.
                        _ => {
                            self.pos += 1;
                            continue;
                        }
                    };
                    let lexeme = self.interner.intern(&punct.to_string());
                    let span = Span::new(word_start, word_end);
                    tokens.push(Token::new(kind, lexeme, span));
                }
                self.pos += 1;
                continue;
            }

            // String literal marker produced by `split_into_words`.
            if word.starts_with("\x00STR:") {
                let content = &word[5..];
                let span = Span::new(word_start, word_end);
                if Self::has_unescaped_brace(content) {
                    let sym = self.interner.intern(content);
                    tokens.push(Token::new(TokenType::InterpolatedString(sym), sym, span));
                } else {
                    // Plain string: collapse doubled braces to literals.
                    let normalized = content.replace("{{", "{").replace("}}", "}");
                    let sym = self.interner.intern(&normalized);
                    tokens.push(Token::new(TokenType::StringLiteral(sym), sym, span));
                }
                self.pos += 1;
                continue;
            }

            // Char literal marker.
            if word.starts_with("\x00CHAR:") {
                let content = &word[6..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::CharLiteral(sym), sym, span));
                self.pos += 1;
                continue;
            }

            // Escape-block body marker.
            if word.starts_with("\x00ESC:") {
                let content = &word[5..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::EscapeBlock(sym), sym, span));
                self.pos += 1;
                continue;
            }

            // Ordinary word: classify and emit.
            let kind = self.classify_with_lookahead(&word);
            let lexeme = self.interner.intern(&word);
            let span = Span::new(word_start, word_end);
            tokens.push(Token::new(kind, lexeme, span));

            if let Some(punct) = trailing_punct {
                // word + "'" + "s" becomes a Possessive token.
                if punct == '\'' {
                    if let Some(next_item) = self.words.get(self.pos + 1) {
                        if next_item.word.to_lowercase() == "s" {
                            let poss_lexeme = self.interner.intern("'s");
                            let poss_start = punct_pos.unwrap_or(word_end);
                            let poss_end = next_item.end;
                            tokens.push(Token::new(TokenType::Possessive, poss_lexeme, Span::new(poss_start, poss_end)));
                            self.pos += 1;
                            // Punctuation after the "s" still needs a token.
                            if let Some(s_punct) = next_item.trailing_punct {
                                let kind = match s_punct {
                                    '(' => TokenType::LParen,
                                    ')' => TokenType::RParen,
                                    '[' => TokenType::LBracket,
                                    ']' => TokenType::RBracket,
                                    ',' => TokenType::Comma,
                                    ':' => TokenType::Colon,
                                    '.' | '?' => TokenType::Period,
                                    '!' => TokenType::Exclamation,
                                    '+' => TokenType::Plus,
                                    '-' => TokenType::Minus,
                                    '*' => TokenType::Star,
                                    '/' => TokenType::Slash,
                                    '%' => TokenType::Percent,
                                    '<' => TokenType::Lt,
                                    '>' => TokenType::Gt,
                                    '=' => TokenType::Assign,
                                    _ => {
                                        self.pos += 1;
                                        continue;
                                    }
                                };
                                let s_punct_pos = next_item.punct_pos.unwrap_or(next_item.end);
                                let lexeme = self.interner.intern(&s_punct.to_string());
                                tokens.push(Token::new(kind, lexeme, Span::new(s_punct_pos, s_punct_pos + 1)));
                            }
                            self.pos += 1;
                            continue;
                        }
                    }
                    // Apostrophe without a following "s": drop it.
                    self.pos += 1;
                    continue;
                }

                let kind = match punct {
                    '(' => TokenType::LParen,
                    ')' => TokenType::RParen,
                    '[' => TokenType::LBracket,
                    ']' => TokenType::RBracket,
                    ',' => TokenType::Comma,
                    ':' => TokenType::Colon,
                    '.' | '?' => {
                        // Sentence end leaves any `let` context.
                        self.in_let_context = false;
                        TokenType::Period
                    }
                    '!' => TokenType::Exclamation,
                    '+' => TokenType::Plus,
                    '-' => TokenType::Minus,
                    '*' => TokenType::Star,
                    '/' => TokenType::Slash,
                    '%' => TokenType::Percent,
                    '<' => TokenType::Lt,
                    '>' => TokenType::Gt,
                    '=' => TokenType::Assign,
                    _ => {
                        self.pos += 1;
                        continue;
                    }
                };
                let p_start = punct_pos.unwrap_or(word_end);
                let lexeme = self.interner.intern(&punct.to_string());
                tokens.push(Token::new(kind, lexeme, Span::new(p_start, p_start + 1)));
            }

            self.pos += 1;
        }

        // Terminal EOF token spanning the end of input.
        let eof_lexeme = self.interner.intern("");
        let eof_span = Span::new(self.input_len, self.input_len);
        tokens.push(Token::new(TokenType::EOF, eof_lexeme, eof_span));

        self.insert_indentation_tokens(tokens)
    }
1303
    /// Merges Indent/Dedent tokens — derived from a fresh [`LineLexer`] pass
    /// over the source — into the word-token stream, ordered by byte position.
    fn insert_indentation_tokens(&mut self, tokens: Vec<Token>) -> Vec<Token> {
        let mut result = Vec::new();
        let empty_sym = self.interner.intern("");

        let line_lexer = LineLexer::new(&self.source);
        let line_tokens: Vec<LineToken> = line_lexer.collect();

        // (byte position, is_indent) events, anchored to the content line
        // that follows each run of Indent/Dedent line tokens.
        let mut structural_events: Vec<(usize, bool)> = Vec::new();
        let mut pending_indents = 0usize;
        let mut pending_dedents = 0usize;

        for line_token in &line_tokens {
            match line_token {
                LineToken::Indent => {
                    pending_indents += 1;
                }
                LineToken::Dedent => {
                    pending_dedents += 1;
                }
                LineToken::Content { start, .. } => {
                    // Dedents are pushed first so they precede indents that
                    // land at the same position.
                    for _ in 0..pending_dedents {
                        structural_events.push((*start, false));
                    }
                    pending_dedents = 0;

                    for _ in 0..pending_indents {
                        structural_events.push((*start, true));
                    }
                    pending_indents = 0;
                }
                LineToken::Newline => {}
            }
        }

        // Dedents still owed at EOF anchor to the end of input.
        for _ in 0..pending_dedents {
            structural_events.push((self.input_len, false));
        }

        if !self.escape_body_ranges.is_empty() {
            // Indentation inside escape-block bodies belongs to the embedded
            // language, not to us: drop those events.
            let mut filtered = Vec::new();
            for &(pos, is_indent) in &structural_events {
                let is_inside_escape_body = self.escape_body_ranges.iter().any(|(start, end)| {
                    pos > *start && pos < *end
                });
                if !is_inside_escape_body {
                    filtered.push((pos, is_indent));
                }
            }
            structural_events = filtered;
        }

        {
            // Multi-line string literals can contain indentation-looking lines
            // too; drop events falling strictly inside sizable string spans.
            let string_spans: Vec<(usize, usize)> = tokens.iter()
                .filter(|t| matches!(t.kind, TokenType::StringLiteral(_) | TokenType::InterpolatedString(_)))
                .filter(|t| t.span.end - t.span.start > 6)
                .map(|t| (t.span.start, t.span.end))
                .collect();
            if !string_spans.is_empty() {
                structural_events.retain(|&(pos, _)| {
                    !string_spans.iter().any(|(start, end)| pos > *start && pos < *end)
                });
            }
        }

        // Order: by position; at equal positions dedents (false) before
        // indents (true).
        structural_events.sort_by(|a, b| {
            if a.0 != b.0 {
                a.0.cmp(&b.0)
            } else {
                a.1.cmp(&b.1)
            }
        });

        let mut event_idx = 0;
        // Most recent end-of-line colon, used to anchor Indent spans.
        let mut last_colon_pos: Option<usize> = None;

        for token in tokens.iter() {
            let token_start = token.span.start;

            // Emit every structural event positioned at or before this token.
            while event_idx < structural_events.len() {
                let (event_pos, is_indent) = structural_events[event_idx];

                if event_pos <= token_start {
                    // Indent spans point at the colon that opened the block,
                    // when one was seen at end-of-line.
                    let span = if is_indent {
                        Span::new(last_colon_pos.unwrap_or(event_pos), last_colon_pos.unwrap_or(event_pos))
                    } else {
                        Span::new(event_pos, event_pos)
                    };
                    let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
                    result.push(Token::new(kind, empty_sym, span));
                    event_idx += 1;
                } else {
                    break;
                }
            }

            result.push(token.clone());

            if token.kind == TokenType::Colon && self.is_end_of_line(token.span.end) {
                last_colon_pos = Some(token.span.end);
            }
        }

        // Flush events positioned past the last token.
        while event_idx < structural_events.len() {
            let (event_pos, is_indent) = structural_events[event_idx];
            let span = Span::new(event_pos, event_pos);
            let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
            result.push(Token::new(kind, empty_sym, span));
            event_idx += 1;
        }

        // Keep EOF as the very last token.
        let eof_pos = result.iter().position(|t| t.kind == TokenType::EOF);
        if let Some(pos) = eof_pos {
            let eof = result.remove(pos);
            result.push(eof);
        }

        result
    }
1455
1456 fn is_end_of_line(&self, from_pos: usize) -> bool {
1458 let bytes = self.source.as_bytes();
1459 let mut pos = from_pos;
1460 while pos < bytes.len() {
1461 match bytes[pos] {
1462 b' ' | b'\t' => pos += 1,
1463 b'\n' => return true,
1464 _ => return false,
1465 }
1466 }
1467 true }
1469
1470 fn measure_next_line_indent(&self, from_pos: usize) -> Option<usize> {
1471 let bytes = self.source.as_bytes();
1472 let mut pos = from_pos;
1473
1474 while pos < bytes.len() && bytes[pos] != b'\n' {
1475 pos += 1;
1476 }
1477
1478 if pos >= bytes.len() {
1479 return None;
1480 }
1481
1482 pos += 1;
1483
1484 let mut indent = 0;
1485 while pos < bytes.len() {
1486 match bytes[pos] {
1487 b' ' => indent += 1,
1488 b'\t' => indent += 4,
1489 b'\n' => {
1490 indent = 0;
1491 }
1492 _ => break,
1493 }
1494 pos += 1;
1495 }
1496
1497 if pos >= bytes.len() {
1498 return None;
1499 }
1500
1501 Some(indent)
1502 }
1503
1504 fn word_to_number(word: &str) -> Option<u32> {
1505 lexicon::word_to_number(&word.to_lowercase())
1506 }
1507
1508 fn is_date_hyphen(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1514 let word_chars: Vec<char> = current_word.chars().collect();
1516
1517 if word_chars.len() == 4 && word_chars.iter().all(|c| c.is_ascii_digit()) {
1519 if char_idx + 5 < chars.len()
1521 && chars[char_idx + 1].is_ascii_digit()
1522 && chars[char_idx + 2].is_ascii_digit()
1523 && chars[char_idx + 3] == '-'
1524 && chars[char_idx + 4].is_ascii_digit()
1525 && chars[char_idx + 5].is_ascii_digit()
1526 {
1527 return true;
1528 }
1529 }
1530
1531 if word_chars.len() == 7
1533 && word_chars[0..4].iter().all(|c| c.is_ascii_digit())
1534 && word_chars[4] == '-'
1535 && word_chars[5..7].iter().all(|c| c.is_ascii_digit())
1536 {
1537 if char_idx + 2 < chars.len()
1539 && chars[char_idx + 1].is_ascii_digit()
1540 && chars[char_idx + 2].is_ascii_digit()
1541 {
1542 let next_not_digit = char_idx + 3 >= chars.len()
1544 || !chars[char_idx + 3].is_ascii_digit();
1545 if next_not_digit {
1546 return true;
1547 }
1548 }
1549 }
1550
1551 false
1552 }
1553
1554 fn is_time_colon(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1560 let word_chars: Vec<char> = current_word.chars().collect();
1562 if word_chars.is_empty() || word_chars.len() > 2 {
1563 return false;
1564 }
1565 if !word_chars.iter().all(|c| c.is_ascii_digit()) {
1566 return false;
1567 }
1568
1569 if char_idx + 4 < chars.len()
1571 && chars[char_idx + 1].is_ascii_digit()
1572 && chars[char_idx + 2].is_ascii_digit()
1573 {
1574 let next_two: String = chars[char_idx + 3..char_idx + 5].iter().collect();
1576 let lower = next_two.to_lowercase();
1577 if lower == "am" || lower == "pm" {
1578 let after_suffix = char_idx + 5 >= chars.len()
1580 || !chars[char_idx + 5].is_alphabetic();
1581 if after_suffix {
1582 return true;
1583 }
1584 }
1585 }
1586
1587 false
1588 }
1589
1590 fn has_unescaped_brace(content: &str) -> bool {
1593 let bytes = content.as_bytes();
1594 let mut i = 0;
1595 while i < bytes.len() {
1596 if bytes[i] == b'{' {
1597 if i + 1 < bytes.len() && bytes[i + 1] == b'{' {
1598 i += 2;
1599 } else {
1600 return true;
1601 }
1602 } else {
1603 i += 1;
1604 }
1605 }
1606 false
1607 }
1608
1609 fn is_exponent_sign(current_word: &str, chars: &[char], char_idx: usize) -> bool {
1615 if !current_word.ends_with('e') && !current_word.ends_with('E') {
1617 return false;
1618 }
1619 let before_e = ¤t_word[..current_word.len() - 1];
1621 if before_e.is_empty() || !before_e.chars().next().unwrap().is_ascii_digit() {
1622 return false;
1623 }
1624 char_idx + 1 < chars.len() && chars[char_idx + 1].is_ascii_digit()
1626 }
1627
1628 fn dedent_triple_quote(raw: &str) -> String {
1631 let lines: Vec<&str> = raw.lines().collect();
1632 if lines.is_empty() {
1633 return String::new();
1634 }
1635 let min_indent = lines.iter()
1637 .filter(|l| !l.trim().is_empty())
1638 .map(|l| l.len() - l.trim_start().len())
1639 .min()
1640 .unwrap_or(0);
1641 lines.iter()
1643 .map(|l| {
1644 if l.len() >= min_indent {
1645 &l[min_indent..]
1646 } else {
1647 l.trim()
1648 }
1649 })
1650 .collect::<Vec<_>>()
1651 .join("\n")
1652 }
1653
1654 fn is_numeric_literal(word: &str) -> bool {
1655 if word.is_empty() {
1656 return false;
1657 }
1658 let chars: Vec<char> = word.chars().collect();
1659 let first = chars[0];
1660 if first.is_ascii_digit() {
1661 return true;
1663 }
1664 if let Some(underscore_pos) = word.rfind('_') {
1667 let before_underscore = &word[..underscore_pos];
1668 let after_underscore = &word[underscore_pos + 1..];
1669 let is_math_symbol = matches!(
1671 before_underscore.to_lowercase().as_str(),
1672 "aleph" | "omega" | "beth"
1673 );
1674 if is_math_symbol
1675 && !after_underscore.is_empty()
1676 && after_underscore.chars().all(|c| c.is_ascii_digit())
1677 {
1678 return true;
1679 }
1680 }
1681 false
1682 }
1683
1684 fn parse_duration_literal(word: &str) -> Option<(i64, &str)> {
1697 if word.is_empty() || !word.chars().next()?.is_ascii_digit() {
1698 return None;
1699 }
1700
1701 const SUFFIXES: &[(&str, i64)] = &[
1703 ("ns", 1),
1704 ("μs", 1_000),
1705 ("us", 1_000),
1706 ("ms", 1_000_000),
1707 ("sec", 1_000_000_000),
1708 ("s", 1_000_000_000),
1709 ("min", 60_000_000_000),
1710 ("hr", 3_600_000_000_000),
1711 ("h", 3_600_000_000_000),
1712 ];
1713
1714 for (suffix, multiplier) in SUFFIXES {
1716 if word.ends_with(suffix) {
1717 let num_part = &word[..word.len() - suffix.len()];
1718 let cleaned: String = num_part.chars().filter(|c| *c != '_').collect();
1720 if let Ok(n) = cleaned.parse::<i64>() {
1721 return Some((n.saturating_mul(*multiplier), *suffix));
1722 }
1723 }
1724 }
1725
1726 None
1727 }
1728
1729 fn parse_date_literal(word: &str) -> Option<i32> {
1734 if word.len() != 10 {
1736 return None;
1737 }
1738
1739 let bytes = word.as_bytes();
1740
1741 if bytes[4] != b'-' || bytes[7] != b'-' {
1743 return None;
1744 }
1745
1746 let year: i32 = word[0..4].parse().ok()?;
1748 let month: u32 = word[5..7].parse().ok()?;
1749 let day: u32 = word[8..10].parse().ok()?;
1750
1751 if month < 1 || month > 12 || day < 1 || day > 31 {
1753 return None;
1754 }
1755
1756 let y = if month <= 2 { year - 1 } else { year };
1759 let era = if y >= 0 { y / 400 } else { (y - 399) / 400 };
1760 let yoe = (y - era * 400) as u32;
1761 let m = month;
1762 let doy = (153 * (if m > 2 { m - 3 } else { m + 9 }) + 2) / 5 + day - 1;
1763 let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
1764 let days = era * 146097 + doe as i32 - 719468;
1765
1766 Some(days)
1767 }
1768
1769 fn parse_time_literal(word: &str) -> Option<i64> {
1778 let lower = word.to_lowercase();
1779
1780 if lower == "noon" {
1782 return Some(12i64 * 3600 * 1_000_000_000);
1783 }
1784 if lower == "midnight" {
1785 return Some(0);
1786 }
1787
1788 let is_pm = lower.ends_with("pm");
1790 let is_am = lower.ends_with("am");
1791
1792 if !is_pm && !is_am {
1793 return None;
1794 }
1795
1796 let time_part = &lower[..lower.len() - 2];
1798
1799 let (hour, minute): (i64, i64) = if let Some(colon_idx) = time_part.find(':') {
1801 let hour_str = &time_part[..colon_idx];
1802 let min_str = &time_part[colon_idx + 1..];
1803 let h: i64 = hour_str.parse().ok()?;
1804 let m: i64 = min_str.parse().ok()?;
1805 (h, m)
1806 } else {
1807 let h: i64 = time_part.parse().ok()?;
1809 (h, 0)
1810 };
1811
1812 if hour < 1 || hour > 12 || minute < 0 || minute > 59 {
1814 return None;
1815 }
1816
1817 let hour_24 = if is_am {
1819 if hour == 12 { 0 } else { hour } } else {
1821 if hour == 12 { 12 } else { hour + 12 } };
1823
1824 let nanos = (hour_24 * 3600 + minute * 60) * 1_000_000_000;
1826 Some(nanos)
1827 }
1828
    /// Classifies `word` using multi-word lookahead.
    ///
    /// Handles everything that cannot be decided from the word alone: block
    /// headers ("##theorem"), fixed multi-word phrases ("each other",
    /// "if and only if", "at least N"), literal forms (number words,
    /// durations, dates, clock times, numerics), and the capitalized
    /// "A"/"An" article-vs-proper-name ambiguity. Falls through to
    /// `classify_word` for plain words. Check order matters throughout.
    fn classify_with_lookahead(&mut self, word: &str) -> TokenType {
        // "##name" starts a block and switches the lexer mode: Main/Function
        // bodies are imperative, everything else declarative.
        if word.starts_with("##") {
            let block_name = &word[2..];
            let block_type = match block_name.to_lowercase().as_str() {
                "theorem" => BlockType::Theorem,
                "main" => BlockType::Main,
                "definition" => BlockType::Definition,
                "proof" => BlockType::Proof,
                "example" => BlockType::Example,
                "logic" => BlockType::Logic,
                "note" => BlockType::Note,
                "to" => BlockType::Function,
                "a" | "an" => BlockType::TypeDef,
                "policy" => BlockType::Policy,
                "requires" => BlockType::Requires,
                "no" => BlockType::No,
                // Unknown headers degrade to a note rather than erroring.
                _ => BlockType::Note,
            };

            self.mode = match block_type {
                BlockType::Main | BlockType::Function => LexerMode::Imperative,
                _ => LexerMode::Declarative,
            };

            return TokenType::BlockHeader { block_type };
        }

        let lower = word.to_lowercase();

        // "each other" collapses into a single reciprocal pronoun token.
        if lower == "each" && self.peek_sequence(&["other"]) {
            self.consume_words(1);
            return TokenType::Reciprocal;
        }

        // "to" is the infinitive marker only when a verb follows; otherwise
        // it is an ordinary preposition.
        if lower == "to" {
            if let Some(next) = self.peek_word(1) {
                if self.is_verb_like(next) {
                    return TokenType::To;
                }
            }
            let sym = self.interner.intern("to");
            return TokenType::Preposition(sym);
        }

        // "at least N" / "at most N" become single quantifier tokens when N
        // is a recognized number word.
        if lower == "at" {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                if next_lower == "least" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtLeast(n);
                        }
                    }
                }
                if next_lower == "most" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtMost(n);
                        }
                    }
                }
            }
        }

        // Literal forms, tried in a fixed order: number words first, then
        // durations, dates, clock times, and finally raw numerics.
        if let Some(n) = Self::word_to_number(&lower) {
            return TokenType::Cardinal(n);
        }

        if let Some((nanos, unit)) = Self::parse_duration_literal(word) {
            let unit_sym = self.interner.intern(unit);
            return TokenType::DurationLiteral {
                nanos,
                original_unit: unit_sym,
            };
        }

        if let Some(days) = Self::parse_date_literal(word) {
            return TokenType::DateLiteral { days };
        }

        if let Some(nanos_from_midnight) = Self::parse_time_literal(word) {
            return TokenType::TimeLiteral { nanos_from_midnight };
        }

        if Self::is_numeric_literal(word) {
            let sym = self.interner.intern(word);
            return TokenType::Number(sym);
        }

        // "if and only if" collapses to a single biconditional token.
        if lower == "if" && self.peek_sequence(&["and", "only", "if"]) {
            self.consume_words(3);
            return TokenType::Iff;
        }

        // "is equal to" / "is identical to" express identity, not copula.
        if lower == "is" {
            if self.peek_sequence(&["equal", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
            if self.peek_sequence(&["identical", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
        }

        // Capitalized "A"/"An": indefinite article or proper name? Decided
        // from the following one or two words.
        if (lower == "a" || lower == "an") && word.chars().next().unwrap().is_uppercase() {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                let next_starts_lowercase = next.chars().next().map(|c| c.is_lowercase()).unwrap_or(false);

                // "A if …" / "A and …": a connective follows, so "A" must be
                // a proper name, not an article.
                if matches!(next_lower.as_str(), "if" | "and" | "or" | "implies" | "iff") {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                // "A runs": a pure finite verb follows (not a gerund and not
                // also a noun/adjective), so "A" is a subject name.
                let is_verb = self.lexicon.lookup_verb(&next_lower).is_some()
                    && !lexicon::is_disambiguation_not_verb(&next_lower);
                let is_gerund = next_lower.ends_with("ing");
                let is_also_noun_or_adj = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_verb && !is_gerund && !is_also_noun_or_adj {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                // "A man is …": a copula/"has" two words ahead favors the
                // article reading.
                if let Some(third) = self.peek_word(2) {
                    let third_lower = third.to_lowercase();
                    if third_lower == "is" || third_lower == "are" || third_lower == "has" {
                        return TokenType::Article(Definiteness::Indefinite);
                    }
                }

                // A content word (noun/adjective) or any lowercase word
                // after "A" reads as an indefinite article.
                let is_content_word = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_content_word || next_starts_lowercase {
                    return TokenType::Article(Definiteness::Indefinite);
                }
            }
            // No usable lookahead: default to the proper-name reading.
            let sym = self.interner.intern(word);
            return TokenType::ProperName(sym);
        }

        self.classify_word(word)
    }
1991
1992 fn is_noun_like(&self, word: &str) -> bool {
1993 if lexicon::is_noun_pattern(word) || lexicon::is_common_noun(word) {
1994 return true;
1995 }
1996 if word.ends_with("er") || word.ends_with("ian") || word.ends_with("ist") {
1997 return true;
1998 }
1999 false
2000 }
2001
2002 fn is_adjective_like(&self, word: &str) -> bool {
2003 lexicon::is_adjective(word) || lexicon::is_non_intersective(word)
2004 }
2005
    /// Single-word classification fallback for `classify_with_lookahead`.
    ///
    /// Checks run in a fixed priority order: demonstrative "that", symbolic
    /// operators, closed-class lexicon lookups (keyword/pronoun/article/
    /// auxiliary), mode-gated imperative keywords, prepositions, the main
    /// keyword table, adverbs, superlatives/comparatives, performatives,
    /// verbs, capitalized-word handling, verb/noun/adjective ambiguity, and
    /// last-resort noun heuristics. Anything left is an adjective.
    fn classify_word(&mut self, word: &str) -> TokenType {
        let lower = word.to_lowercase();
        let first_char = word.chars().next().unwrap();

        // "that man" reads as a distal article; bare "that" falls through
        // to the keyword lookup below.
        if lower == "that" {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                if self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower) {
                    return TokenType::Article(Definiteness::Distal);
                }
            }
        }

        // Symbolic operators (matched on the raw word, not `lower`).
        if word == "->" {
            return TokenType::Arrow;
        }

        if word == "<=" {
            return TokenType::LtEq;
        }
        if word == ">=" {
            return TokenType::GtEq;
        }
        if word == "==" {
            return TokenType::EqEq;
        }
        if word == "!=" {
            return TokenType::NotEq;
        }
        if word == "<" {
            return TokenType::Lt;
        }
        if word == ">" {
            return TokenType::Gt;
        }
        if word == "=" {
            return TokenType::Assign;
        }

        // Closed-class lexicon lookups.
        if let Some(kind) = lexicon::lookup_keyword(&lower) {
            return kind;
        }

        if let Some(kind) = lexicon::lookup_pronoun(&lower) {
            return kind;
        }

        if let Some(def) = lexicon::lookup_article(&lower) {
            return TokenType::Article(def);
        }

        if let Some(time) = lexicon::lookup_auxiliary(&lower) {
            return TokenType::Auxiliary(time);
        }

        // Words whose reading depends on the lexer mode; checked before the
        // general preposition table so the imperative reading wins there.
        match lower.as_str() {
            "call" => return TokenType::Call,
            "in" if self.mode == LexerMode::Imperative => return TokenType::In,
            "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
            "at" if self.mode == LexerMode::Imperative => return TokenType::At,
            "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
            "before" => return TokenType::Before,
            _ => {}
        }

        if lexicon::is_preposition(&lower) {
            let sym = self.interner.intern(&lower);
            return TokenType::Preposition(sym);
        }

        // Main keyword table. Many arms are gated on imperative mode so the
        // same words can remain content words in declarative prose.
        match lower.as_str() {
            "equals" => return TokenType::Equals,
            "item" => return TokenType::Item,
            "items" => return TokenType::Items,
            "mut" if self.mode == LexerMode::Imperative => return TokenType::Mut,
            "let" => {
                // Remember the context so a later "be" can close it.
                self.in_let_context = true;
                return TokenType::Let;
            }
            "set" => {
                if self.peek_word(1).map_or(false, |w| w.to_lowercase() == "of") {
                    // "set of …" is the mathematical noun; fall through to
                    // the later classification stages.
                } else if self.mode == LexerMode::Imperative {
                    return TokenType::Set;
                } else {
                    // Declarative "set X … to …" still reads as assignment
                    // when a "to" appears within the next few words.
                    for offset in 2..=5 {
                        if self.peek_word(offset).map_or(false, |w| w.to_lowercase() == "to") {
                            return TokenType::Set;
                        }
                    }
                }
            }
            "return" => return TokenType::Return,
            "break" => return TokenType::Break,
            "xor" => return TokenType::Xor,
            "shifted" => return TokenType::Shifted,
            "be" if self.in_let_context => {
                // Closes the pending "let … be …" binding.
                self.in_let_context = false;
                return TokenType::Be;
            }
            "while" => return TokenType::While,
            "assert" => return TokenType::Assert,
            "trust" => return TokenType::Trust,
            "check" => return TokenType::Check,
            "given" if self.mode == LexerMode::Declarative => return TokenType::Given,
            "prove" if self.mode == LexerMode::Declarative => return TokenType::Prove,
            "auto" if self.mode == LexerMode::Declarative => return TokenType::Auto,
            "listen" if self.mode == LexerMode::Imperative => return TokenType::Listen,
            "connect" if self.mode == LexerMode::Imperative => return TokenType::NetConnect,
            "sleep" if self.mode == LexerMode::Imperative => return TokenType::Sleep,
            "sync" if self.mode == LexerMode::Imperative => return TokenType::Sync,
            "mount" if self.mode == LexerMode::Imperative => return TokenType::Mount,
            "persistent" => return TokenType::Persistent,
            "combined" if self.mode == LexerMode::Imperative => return TokenType::Combined,
            "launch" if self.mode == LexerMode::Imperative => return TokenType::Launch,
            "task" if self.mode == LexerMode::Imperative => return TokenType::Task,
            "pipe" if self.mode == LexerMode::Imperative => return TokenType::Pipe,
            "receive" if self.mode == LexerMode::Imperative => return TokenType::Receive,
            "stop" if self.mode == LexerMode::Imperative => return TokenType::Stop,
            "try" if self.mode == LexerMode::Imperative => return TokenType::Try,
            "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
            "native" => return TokenType::Native,
            "escape" if self.mode == LexerMode::Imperative => return TokenType::Escape,
            "from" => return TokenType::From,
            "otherwise" => return TokenType::Otherwise,
            "else" => return TokenType::Else,
            "elif" => return TokenType::Elif,
            "either" if self.mode == LexerMode::Declarative => return TokenType::Either,
            "inspect" if self.mode == LexerMode::Imperative => return TokenType::Inspect,
            "new" if self.mode == LexerMode::Imperative => return TokenType::New,
            "give" if self.mode == LexerMode::Imperative => return TokenType::Give,
            "show" if self.mode == LexerMode::Imperative => return TokenType::Show,
            "push" if self.mode == LexerMode::Imperative => return TokenType::Push,
            "pop" if self.mode == LexerMode::Imperative => return TokenType::Pop,
            "copy" if self.mode == LexerMode::Imperative => return TokenType::Copy,
            "through" if self.mode == LexerMode::Imperative => return TokenType::Through,
            "length" if self.mode == LexerMode::Imperative => return TokenType::Length,
            "at" if self.mode == LexerMode::Imperative => return TokenType::At,
            "add" if self.mode == LexerMode::Imperative => return TokenType::Add,
            "remove" if self.mode == LexerMode::Imperative => return TokenType::Remove,
            "contains" if self.mode == LexerMode::Imperative => return TokenType::Contains,
            "union" if self.mode == LexerMode::Imperative => return TokenType::Union,
            "intersection" if self.mode == LexerMode::Imperative => return TokenType::Intersection,
            "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
            "zone" if self.mode == LexerMode::Imperative => return TokenType::Zone,
            "called" if self.mode == LexerMode::Imperative => return TokenType::Called,
            "size" if self.mode == LexerMode::Imperative => return TokenType::Size,
            "mapped" if self.mode == LexerMode::Imperative => return TokenType::Mapped,
            "attempt" if self.mode == LexerMode::Imperative => return TokenType::Attempt,
            "following" if self.mode == LexerMode::Imperative => return TokenType::Following,
            "simultaneously" if self.mode == LexerMode::Imperative => return TokenType::Simultaneously,
            "read" if self.mode == LexerMode::Imperative => return TokenType::Read,
            "write" if self.mode == LexerMode::Imperative => return TokenType::Write,
            "console" if self.mode == LexerMode::Imperative => return TokenType::Console,
            "file" if self.mode == LexerMode::Imperative => return TokenType::File,
            "spawn" if self.mode == LexerMode::Imperative => return TokenType::Spawn,
            "send" if self.mode == LexerMode::Imperative => return TokenType::Send,
            "await" if self.mode == LexerMode::Imperative => return TokenType::Await,
            "portable" => return TokenType::Portable,
            "manifest" if self.mode == LexerMode::Imperative => return TokenType::Manifest,
            "chunk" if self.mode == LexerMode::Imperative => return TokenType::Chunk,
            "shared" => return TokenType::Shared,
            "merge" if self.mode == LexerMode::Imperative => return TokenType::Merge,
            "increase" if self.mode == LexerMode::Imperative => return TokenType::Increase,
            "decrease" if self.mode == LexerMode::Imperative => return TokenType::Decrease,
            "append" if self.mode == LexerMode::Imperative => return TokenType::Append,
            "resolve" if self.mode == LexerMode::Imperative => return TokenType::Resolve,
            "values" if self.mode == LexerMode::Imperative => return TokenType::Values,
            "tally" => return TokenType::Tally,
            "sharedset" => return TokenType::SharedSet,
            "sharedsequence" => return TokenType::SharedSequence,
            "collaborativesequence" => return TokenType::CollaborativeSequence,
            "sharedmap" => return TokenType::SharedMap,
            "divergent" => return TokenType::Divergent,
            "removewins" => return TokenType::RemoveWins,
            "addwins" => return TokenType::AddWins,
            "yata" => return TokenType::YATA,
            "day" | "days" => return TokenType::CalendarUnit(CalendarUnit::Day),
            "week" | "weeks" => return TokenType::CalendarUnit(CalendarUnit::Week),
            "month" | "months" => return TokenType::CalendarUnit(CalendarUnit::Month),
            "year" | "years" => return TokenType::CalendarUnit(CalendarUnit::Year),
            "ago" => return TokenType::Ago,
            "hence" => return TokenType::Hence,
            "if" => return TokenType::If,
            "only" => return TokenType::Focus(FocusKind::Only),
            "even" => return TokenType::Focus(FocusKind::Even),
            // "just" is a focus particle only when the next word is not a
            // verb, or is capitalized (e.g. "just John").
            "just" if self.peek_word(1).map_or(false, |w| {
                !self.is_verb_like(w) || w.to_lowercase() == "john" || w.chars().next().map_or(false, |c| c.is_uppercase())
            }) => return TokenType::Focus(FocusKind::Just),
            "much" => return TokenType::Measure(MeasureKind::Much),
            "little" => return TokenType::Measure(MeasureKind::Little),
            _ => {}
        }

        // Adverb classes, from most specific to most general.
        if lexicon::is_scopal_adverb(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::ScopalAdverb(sym);
        }

        if lexicon::is_temporal_adverb(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::TemporalAdverb(sym);
        }

        if lexicon::is_non_intersective(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::NonIntersectiveAdjective(sym);
        }

        if lexicon::is_adverb(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::Adverb(sym);
        }
        // Productive "-ly" adverbs, excluding the lexicon's not-adverb list
        // and very short words.
        if lower.ends_with("ly") && !lexicon::is_not_adverb(&lower) && lower.len() > 4 {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::Adverb(sym);
        }

        if let Some(base) = self.try_parse_superlative(&lower) {
            let sym = self.interner.intern(&base);
            return TokenType::Superlative(sym);
        }

        // Irregular comparatives map to their capitalized base form.
        let irregular_comparative = match lower.as_str() {
            "less" => Some("Little"),
            "more" => Some("Much"),
            "better" => Some("Good"),
            "worse" => Some("Bad"),
            _ => None,
        };
        if let Some(base) = irregular_comparative {
            let sym = self.interner.intern(base);
            return TokenType::Comparative(sym);
        }

        if let Some(base) = self.try_parse_comparative(&lower) {
            let sym = self.interner.intern(&base);
            return TokenType::Comparative(sym);
        }

        if lexicon::is_performative(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            return TokenType::Performative(sym);
        }

        // Verbs the lexicon wants recognized before the heuristics below.
        if lexicon::is_base_verb_early(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            let class = lexicon::lookup_verb_class(&lower);
            return TokenType::Verb {
                lemma: sym,
                time: Time::Present,
                aspect: Aspect::Simple,
                class,
            };
        }

        // "-ing" forms that the verb lexicon knows.
        if lower.ends_with("ing") && lower.len() > 4 {
            if let Some(entry) = self.lexicon.lookup_verb(&lower) {
                let sym = self.interner.intern(&entry.lemma);
                return TokenType::Verb {
                    lemma: sym,
                    time: entry.time,
                    aspect: entry.aspect,
                    class: entry.class,
                };
            }
        }

        // Capitalized words default to proper names, except plural nouns in
        // subject position (e.g. "Lions are …" reads as a noun).
        if first_char.is_uppercase() {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                let is_followed_by_verb = self.lexicon.lookup_verb(&next_lower).is_some()
                    || matches!(next_lower.as_str(), "is" | "are" | "was" | "were" | "has" | "have" | "had");

                if is_followed_by_verb {
                    if let Some(analysis) = lexicon::analyze_word(&lower) {
                        match analysis {
                            lexicon::WordAnalysis::Noun(meta) if meta.number == lexicon::Number::Plural => {
                                let sym = self.interner.intern(&lower);
                                return TokenType::Noun(sym);
                            }
                            lexicon::WordAnalysis::DerivedNoun { number: lexicon::Number::Plural, .. } => {
                                let sym = self.interner.intern(&lower);
                                return TokenType::Noun(sym);
                            }
                            _ => {
                            }
                        }
                    }
                }
            }

            let sym = self.interner.intern(word);
            return TokenType::ProperName(sym);
        }

        // Words that are both a verb and a noun/adjective (and not pinned by
        // the disambiguation list) are emitted as Ambiguous for the parser.
        let verb_entry = self.lexicon.lookup_verb(&lower);
        let is_noun = lexicon::is_common_noun(&lower);
        let is_adj = self.is_adjective_like(&lower);
        let is_disambiguated = lexicon::is_disambiguation_not_verb(&lower);

        if verb_entry.is_some() && (is_noun || is_adj) && !is_disambiguated {
            let entry = verb_entry.unwrap();
            let verb_token = TokenType::Verb {
                lemma: self.interner.intern(&entry.lemma),
                time: entry.time,
                aspect: entry.aspect,
                class: entry.class,
            };

            let mut alternatives = Vec::new();
            if is_noun {
                alternatives.push(TokenType::Noun(self.interner.intern(word)));
            }
            if is_adj {
                alternatives.push(TokenType::Adjective(self.interner.intern(word)));
            }

            return TokenType::Ambiguous {
                primary: Box::new(verb_token),
                alternatives,
            };
        }

        // The disambiguation list pins verb-lookalikes to noun/adjective.
        if let Some(_) = &verb_entry {
            if is_disambiguated {
                let sym = self.interner.intern(word);
                if is_noun {
                    return TokenType::Noun(sym);
                }
                return TokenType::Adjective(sym);
            }
        }

        if let Some(entry) = verb_entry {
            let sym = self.interner.intern(&entry.lemma);
            return TokenType::Verb {
                lemma: sym,
                time: entry.time,
                aspect: entry.aspect,
                class: entry.class,
            };
        }

        if is_noun {
            let sym = self.interner.intern(word);
            return TokenType::Noun(sym);
        }

        if lexicon::is_base_verb(&lower) {
            let sym = self.interner.intern(&Self::capitalize(&lower));
            let class = lexicon::lookup_verb_class(&lower);
            return TokenType::Verb {
                lemma: sym,
                time: Time::Present,
                aspect: Aspect::Simple,
                class,
            };
        }

        // Last-resort noun heuristics: derivational suffixes plus a short
        // hard-coded list of common words.
        if lower.ends_with("ian")
            || lower.ends_with("er")
            || lower == "logic"
            || lower == "time"
            || lower == "men"
            || lower == "book"
            || lower == "house"
            || lower == "code"
            || lower == "user"
        {
            let sym = self.interner.intern(word);
            return TokenType::Noun(sym);
        }

        if lexicon::is_particle(&lower) {
            let sym = self.interner.intern(&lower);
            return TokenType::Particle(sym);
        }

        // Default: treat anything still unclassified as an adjective.
        let sym = self.interner.intern(word);
        TokenType::Adjective(sym)
    }
2449
2450 fn capitalize(s: &str) -> String {
2451 let mut chars = s.chars();
2452 match chars.next() {
2453 None => String::new(),
2454 Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
2455 }
2456 }
2457
2458 pub fn is_collective_verb(lemma: &str) -> bool {
2459 lexicon::is_collective_verb(&lemma.to_lowercase())
2460 }
2461
2462 pub fn is_mixed_verb(lemma: &str) -> bool {
2463 lexicon::is_mixed_verb(&lemma.to_lowercase())
2464 }
2465
2466 pub fn is_distributive_verb(lemma: &str) -> bool {
2467 lexicon::is_distributive_verb(&lemma.to_lowercase())
2468 }
2469
2470 pub fn is_intensional_predicate(lemma: &str) -> bool {
2471 lexicon::is_intensional_predicate(&lemma.to_lowercase())
2472 }
2473
2474 pub fn is_opaque_verb(lemma: &str) -> bool {
2475 lexicon::is_opaque_verb(&lemma.to_lowercase())
2476 }
2477
2478 pub fn is_ditransitive_verb(lemma: &str) -> bool {
2479 lexicon::is_ditransitive_verb(&lemma.to_lowercase())
2480 }
2481
2482 fn is_verb_like(&self, word: &str) -> bool {
2483 let lower = word.to_lowercase();
2484 if lexicon::is_infinitive_verb(&lower) {
2485 return true;
2486 }
2487 if let Some(entry) = self.lexicon.lookup_verb(&lower) {
2488 return entry.lemma.len() > 0;
2489 }
2490 false
2491 }
2492
2493 pub fn is_subject_control_verb(lemma: &str) -> bool {
2494 lexicon::is_subject_control_verb(&lemma.to_lowercase())
2495 }
2496
2497 pub fn is_raising_verb(lemma: &str) -> bool {
2498 lexicon::is_raising_verb(&lemma.to_lowercase())
2499 }
2500
2501 pub fn is_object_control_verb(lemma: &str) -> bool {
2502 lexicon::is_object_control_verb(&lemma.to_lowercase())
2503 }
2504
2505 pub fn is_weather_verb(lemma: &str) -> bool {
2506 matches!(
2507 lemma.to_lowercase().as_str(),
2508 "rain" | "snow" | "hail" | "thunder" | "pour"
2509 )
2510 }
2511
2512 fn try_parse_superlative(&self, word: &str) -> Option<String> {
2513 if !word.ends_with("est") || word.len() < 5 {
2514 return None;
2515 }
2516
2517 let base = &word[..word.len() - 3];
2518
2519 if base.len() >= 2 {
2520 let chars: Vec<char> = base.chars().collect();
2521 let last = chars[chars.len() - 1];
2522 let second_last = chars[chars.len() - 2];
2523 if last == second_last && !"aeiou".contains(last) {
2524 let stem = &base[..base.len() - 1];
2525 if lexicon::is_gradable_adjective(stem) {
2526 return Some(Self::capitalize(stem));
2527 }
2528 }
2529 }
2530
2531 if base.ends_with("i") {
2532 let stem = format!("{}y", &base[..base.len() - 1]);
2533 if lexicon::is_gradable_adjective(&stem) {
2534 return Some(Self::capitalize(&stem));
2535 }
2536 }
2537
2538 if lexicon::is_gradable_adjective(base) {
2539 return Some(Self::capitalize(base));
2540 }
2541
2542 None
2543 }
2544
2545 fn try_parse_comparative(&self, word: &str) -> Option<String> {
2546 if !word.ends_with("er") || word.len() < 4 {
2547 return None;
2548 }
2549
2550 let base = &word[..word.len() - 2];
2551
2552 if base.len() >= 2 {
2553 let chars: Vec<char> = base.chars().collect();
2554 let last = chars[chars.len() - 1];
2555 let second_last = chars[chars.len() - 2];
2556 if last == second_last && !"aeiou".contains(last) {
2557 let stem = &base[..base.len() - 1];
2558 if lexicon::is_gradable_adjective(stem) {
2559 return Some(Self::capitalize(stem));
2560 }
2561 }
2562 }
2563
2564 if base.ends_with("i") {
2565 let stem = format!("{}y", &base[..base.len() - 1]);
2566 if lexicon::is_gradable_adjective(&stem) {
2567 return Some(Self::capitalize(&stem));
2568 }
2569 }
2570
2571 if lexicon::is_gradable_adjective(base) {
2572 return Some(Self::capitalize(base));
2573 }
2574
2575 None
2576 }
2577}
2578
2579#[cfg(test)]
2580mod tests {
2581 use super::*;
2582
    // Smoke test: a contraction ("it's") must not break tokenization.
    #[test]
    fn lexer_handles_apostrophe() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("it's raining", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    // Smoke test: sentence-final punctuation must not break tokenization.
    #[test]
    fn lexer_handles_question_mark() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Is it raining?", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    // "ring" exists in the verb lexicon but must classify as a noun here.
    #[test]
    fn ring_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("ring", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    // Relative "that" (after a noun, before a verb) must stay
    // TokenType::That rather than become a distal article. The eprintln
    // output is intentional: it aids diagnosis when the assert fails.
    #[test]
    fn debug_that_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("The cat that runs", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        let that_token = tokens.iter().find(|t| interner.resolve(t.lexeme) == "that");
        if let Some(t) = that_token {
            let check = std::mem::discriminant(&t.kind) == std::mem::discriminant(&TokenType::That);
            eprintln!("Discriminant check for That: {}", check);
            assert!(matches!(t.kind, TokenType::That), "'that' should be TokenType::That, got {:?}", t.kind);
        } else {
            panic!("No 'that' token found");
        }
    }

    // "bus" must classify as a noun, not a verb form.
    #[test]
    fn bus_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("bus", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    // Lowercase "a" before a noun is the indefinite article, and the
    // following word classifies as a noun.
    #[test]
    fn lowercase_a_is_article() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("a car", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        assert_eq!(tokens[0].kind, TokenType::Article(Definiteness::Indefinite));
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)), "Expected Noun, got {:?}", tokens[1].kind);
    }

    // "open" is verb/adjective ambiguous: the lexer must emit an Ambiguous
    // token with the verb reading primary and an adjective alternative.
    #[test]
    fn open_is_ambiguous() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("open", &mut interner);
        let tokens = lexer.tokenize();

        if let TokenType::Ambiguous { primary, alternatives } = &tokens[0].kind {
            assert!(matches!(**primary, TokenType::Verb { .. }), "Primary should be Verb");
            assert!(alternatives.iter().any(|t| matches!(t, TokenType::Adjective(_))),
                "Should have Adjective alternative");
        } else {
            panic!("Expected Ambiguous token for 'open', got {:?}", tokens[0].kind);
        }
    }

    // End-to-end: quantifier, plural noun, and copula classify as expected.
    #[test]
    fn basic_tokenization() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("All men are mortal.", &mut interner);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].kind, TokenType::All);
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)));
        assert_eq!(tokens[2].kind, TokenType::Are);
    }
2672
2673 #[test]
2674 fn iff_tokenizes_as_single_token() {
2675 let mut interner = Interner::new();
2676 let mut lexer = Lexer::new("A if and only if B", &mut interner);
2677 let tokens = lexer.tokenize();
2678 assert!(
2679 tokens.iter().any(|t| t.kind == TokenType::Iff),
2680 "should contain Iff token: got {:?}",
2681 tokens
2682 );
2683 }
2684
2685 #[test]
2686 fn is_equal_to_tokenizes_as_identity() {
2687 let mut interner = Interner::new();
2688 let mut lexer = Lexer::new("Socrates is equal to Socrates", &mut interner);
2689 let tokens = lexer.tokenize();
2690 assert!(
2691 tokens.iter().any(|t| t.kind == TokenType::Identity),
2692 "should contain Identity token: got {:?}",
2693 tokens
2694 );
2695 }
2696
2697 #[test]
2698 fn is_identical_to_tokenizes_as_identity() {
2699 let mut interner = Interner::new();
2700 let mut lexer = Lexer::new("Clark is identical to Superman", &mut interner);
2701 let tokens = lexer.tokenize();
2702 assert!(
2703 tokens.iter().any(|t| t.kind == TokenType::Identity),
2704 "should contain Identity token: got {:?}",
2705 tokens
2706 );
2707 }
2708
2709 #[test]
2710 fn itself_tokenizes_as_reflexive() {
2711 let mut interner = Interner::new();
2712 let mut lexer = Lexer::new("John loves itself", &mut interner);
2713 let tokens = lexer.tokenize();
2714 assert!(
2715 tokens.iter().any(|t| t.kind == TokenType::Reflexive),
2716 "should contain Reflexive token: got {:?}",
2717 tokens
2718 );
2719 }
2720
2721 #[test]
2722 fn himself_tokenizes_as_reflexive() {
2723 let mut interner = Interner::new();
2724 let mut lexer = Lexer::new("John sees himself", &mut interner);
2725 let tokens = lexer.tokenize();
2726 assert!(
2727 tokens.iter().any(|t| t.kind == TokenType::Reflexive),
2728 "should contain Reflexive token: got {:?}",
2729 tokens
2730 );
2731 }
2732
2733 #[test]
2734 fn to_stay_tokenizes_correctly() {
2735 let mut interner = Interner::new();
2736 let mut lexer = Lexer::new("to stay", &mut interner);
2737 let tokens = lexer.tokenize();
2738 assert!(
2739 tokens.iter().any(|t| t.kind == TokenType::To),
2740 "should contain To token: got {:?}",
2741 tokens
2742 );
2743 assert!(
2744 tokens.iter().any(|t| matches!(t.kind, TokenType::Verb { .. })),
2745 "should contain Verb token for stay: got {:?}",
2746 tokens
2747 );
2748 }
2749
2750 #[test]
2751 fn possessive_apostrophe_s() {
2752 let mut interner = Interner::new();
2753 let mut lexer = Lexer::new("John's dog", &mut interner);
2754 let tokens = lexer.tokenize();
2755 assert!(
2756 tokens.iter().any(|t| t.kind == TokenType::Possessive),
2757 "should contain Possessive token: got {:?}",
2758 tokens
2759 );
2760 assert!(
2761 tokens.iter().any(|t| matches!(&t.kind, TokenType::ProperName(_))),
2762 "should have John as proper name: got {:?}",
2763 tokens
2764 );
2765 }
2766
2767 #[test]
2768 fn lexer_produces_valid_spans() {
2769 let input = "All men are mortal.";
2770 let mut interner = Interner::new();
2771 let mut lexer = Lexer::new(input, &mut interner);
2772 let tokens = lexer.tokenize();
2773
2774 assert_eq!(tokens[0].span.start, 0);
2776 assert_eq!(tokens[0].span.end, 3);
2777 assert_eq!(&input[tokens[0].span.start..tokens[0].span.end], "All");
2778
2779 assert_eq!(tokens[1].span.start, 4);
2781 assert_eq!(tokens[1].span.end, 7);
2782 assert_eq!(&input[tokens[1].span.start..tokens[1].span.end], "men");
2783
2784 assert_eq!(tokens[2].span.start, 8);
2786 assert_eq!(tokens[2].span.end, 11);
2787 assert_eq!(&input[tokens[2].span.start..tokens[2].span.end], "are");
2788
2789 assert_eq!(tokens[3].span.start, 12);
2791 assert_eq!(tokens[3].span.end, 18);
2792 assert_eq!(&input[tokens[3].span.start..tokens[3].span.end], "mortal");
2793
2794 assert_eq!(tokens[4].span.start, 18);
2796 assert_eq!(tokens[4].span.end, 19);
2797
2798 assert_eq!(tokens[5].span.start, input.len());
2800 assert_eq!(tokens[5].kind, TokenType::EOF);
2801 }
2802
2803 #[test]
2804 fn triple_quote_produces_string_token() {
2805 let mut interner = Interner::new();
2806 let source = "## Main\nLet msg be \"\"\"\n Hello\n World\n\"\"\".\nShow msg.";
2807 let mut lexer = Lexer::new(source, &mut interner);
2808 let tokens = lexer.tokenize();
2809 for (i, t) in tokens.iter().enumerate() {
2811 let lex = interner.resolve(t.lexeme);
2812 eprintln!("Token[{}]: {:?} lex={:?} span={}..{}", i, t.kind, lex, t.span.start, t.span.end);
2813 }
2814 let str_token = tokens.iter().find(|t| matches!(t.kind, TokenType::StringLiteral(_) | TokenType::InterpolatedString(_)));
2816 assert!(str_token.is_some(), "Should have a string token. Tokens: {:?}", tokens.iter().map(|t| format!("{:?}", t.kind)).collect::<Vec<_>>());
2817 if let Some(tok) = str_token {
2818 let content = interner.resolve(tok.lexeme);
2819 eprintln!("Triple-quote content: {:?}", content);
2820 assert!(content.contains("Hello"), "Should contain Hello, got: {:?}", content);
2821 }
2822 }
2823}