use logicaffeine_base::Interner;

use crate::lexicon::{self, Aspect, Definiteness, Lexicon, Time};
use crate::token::{BlockType, CalendarUnit, FocusKind, MeasureKind, Span, Token, TokenType};
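
/// A structural token produced by the line-level pass: indentation changes,
/// line breaks, and the raw content of each non-blank line.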
#[derive(Debug, Clone, PartialEq)]
pub enum LineToken {
    /// The start of an indented block.
    Indent,
    /// The end of an indented block.
    Dedent,
    /// A line break between statements.
    Newline,
    /// A non-blank line of text with its byte offsets into the source.
    Content { text: String, start: usize, end: usize },
}
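
/// Scans the source line by line, tracking an indentation stack and emitting
/// `Indent`/`Dedent`/`Content` tokens, Python-style. Byte ranges listed in
/// `escape_body_ranges` belong to embedded escape blocks and keep their own
/// indentation.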
pub struct LineLexer<'a> {
    source: &'a str,
    bytes: &'a [u8],
    /// Stack of enclosing indentation widths; the bottom entry is always 0.
    indent_stack: Vec<usize>,
    /// Dedents queued for emission before the next content token.
    pending_dedents: usize,
    position: usize,
    has_pending_content: bool,
    pending_content_start: usize,
    pending_content_end: usize,
    pending_content_text: String,
    finished_lines: bool,
    emitted_indent: bool,
    /// Byte ranges of escape-block bodies, which are opaque to indentation tracking.
    escape_body_ranges: Vec<(usize, usize)>,
}

impl<'a> LineLexer<'a> {
    pub fn new(source: &'a str) -> Self {
        Self::with_escape_ranges(source, Vec::new())
    }

    pub fn with_escape_ranges(source: &'a str, escape_body_ranges: Vec<(usize, usize)>) -> Self {
        Self {
            source,
            bytes: source.as_bytes(),
            indent_stack: vec![0],
            pending_dedents: 0,
            position: 0,
            has_pending_content: false,
            pending_content_start: 0,
            pending_content_end: 0,
            pending_content_text: String::new(),
            finished_lines: false,
            emitted_indent: false,
            escape_body_ranges,
        }
    }

    fn is_in_escape_body(&self, pos: usize) -> bool {
        self.escape_body_ranges.iter().any(|(start, end)| pos >= *start && pos < *end)
    }
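
    /// Measures the indentation of the line starting at `line_start`, counting
    /// a space as 1 and a tab as 4. Returns the indentation width and the byte
    /// offset of the first non-indent character.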
    fn measure_indent(&self, line_start: usize) -> (usize, usize) {
        let mut indent = 0;
        let mut pos = line_start;

        while pos < self.bytes.len() {
            match self.bytes[pos] {
                b' ' => {
                    indent += 1;
                    pos += 1;
                }
                b'\t' => {
                    indent += 4;
                    pos += 1;
                }
                _ => break,
            }
        }

        (indent, pos)
    }
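
    /// Reads one line starting at `content_start`. Returns the trimmed text,
    /// its byte range, and the offset where the next line begins.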
    fn read_line_content(&self, content_start: usize) -> (String, usize, usize, usize) {
        let mut pos = content_start;

        while pos < self.bytes.len() && self.bytes[pos] != b'\n' {
            pos += 1;
        }

        let content_end = pos;
        let text = self.source[content_start..content_end].trim_end().to_string();

        let next_line_start = if pos < self.bytes.len() && self.bytes[pos] == b'\n' {
            pos + 1
        } else {
            pos
        };

        (text, content_start, content_end, next_line_start)
    }

    fn is_blank_line(&self, line_start: usize) -> bool {
        let mut pos = line_start;
        while pos < self.bytes.len() {
            match self.bytes[pos] {
                b' ' | b'\t' => pos += 1,
                b'\n' => return true,
                _ => return false,
            }
        }
        true
    }
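
    /// Advances past blank lines, reads the next content line, and updates the
    /// indentation stack, queueing dedents as needed. Returns `true` if a
    /// token (content, indent, or dedent) became available.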
    fn process_next_line(&mut self) -> bool {
        while self.position < self.bytes.len() && self.is_blank_line(self.position) {
            while self.position < self.bytes.len() && self.bytes[self.position] != b'\n' {
                self.position += 1;
            }
            if self.position < self.bytes.len() {
                self.position += 1;
            }
        }

        if self.position >= self.bytes.len() {
            self.finished_lines = true;
            if self.indent_stack.len() > 1 {
                self.pending_dedents = self.indent_stack.len() - 1;
                self.indent_stack.truncate(1);
            }
            return self.pending_dedents > 0;
        }

        let (line_indent, content_start) = self.measure_indent(self.position);

        let (text, start, end, next_pos) = self.read_line_content(content_start);

        if text.is_empty() {
            self.position = next_pos;
            return self.process_next_line();
        }

        let current_indent = *self.indent_stack.last().unwrap();

        if line_indent > current_indent {
            self.indent_stack.push(line_indent);
            self.emitted_indent = true;
            self.has_pending_content = true;
            self.pending_content_text = text;
            self.pending_content_start = start;
            self.pending_content_end = end;
            self.position = next_pos;
            return true;
        } else if line_indent < current_indent {
            while self.indent_stack.len() > 1 {
                let top = *self.indent_stack.last().unwrap();
                if line_indent < top {
                    self.indent_stack.pop();
                    self.pending_dedents += 1;
                } else {
                    break;
                }
            }
            self.has_pending_content = true;
            self.pending_content_text = text;
            self.pending_content_start = start;
            self.pending_content_end = end;
            self.position = next_pos;
            return true;
        } else {
            self.has_pending_content = true;
            self.pending_content_text = text;
            self.pending_content_start = start;
            self.pending_content_end = end;
            self.position = next_pos;
            return true;
        }
    }
}

impl<'a> Iterator for LineLexer<'a> {
    type Item = LineToken;

    fn next(&mut self) -> Option<LineToken> {
        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        if self.has_pending_content {
            self.has_pending_content = false;
            let text = std::mem::take(&mut self.pending_content_text);
            let start = self.pending_content_start;
            let end = self.pending_content_end;
            return Some(LineToken::Content { text, start, end });
        }

        if !self.finished_lines {
            let had_indent = self.indent_stack.len();
            if self.process_next_line() {
                if self.indent_stack.len() > had_indent {
                    return Some(LineToken::Indent);
                }
                if self.pending_dedents > 0 {
                    self.pending_dedents -= 1;
                    return Some(LineToken::Dedent);
                }
                if self.has_pending_content {
                    self.has_pending_content = false;
                    let text = std::mem::take(&mut self.pending_content_text);
                    let start = self.pending_content_start;
                    let end = self.pending_content_end;
                    return Some(LineToken::Content { text, start, end });
                }
            } else if self.pending_dedents > 0 {
                self.pending_dedents -= 1;
                return Some(LineToken::Dedent);
            }
        }

        if self.pending_dedents > 0 {
            self.pending_dedents -= 1;
            return Some(LineToken::Dedent);
        }

        None
    }
}
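
/// Coarse lexing mode: declarative prose blocks versus imperative code blocks.
/// Some words are keywords only in imperative mode.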
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LexerMode {
    #[default]
    Declarative,
    Imperative,
}

pub struct Lexer<'a> {
    words: Vec<WordItem>,
    pos: usize,
    lexicon: Lexicon,
    interner: &'a mut Interner,
    input_len: usize,
    in_let_context: bool,
    mode: LexerMode,
    source: String,
    escape_body_ranges: Vec<(usize, usize)>,
}
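
/// A whitespace-delimited word plus any single trailing punctuation mark,
/// with byte offsets back into the source.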
struct WordItem {
    word: String,
    trailing_punct: Option<char>,
    start: usize,
    end: usize,
    punct_pos: Option<usize>,
}

impl<'a> Lexer<'a> {
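    /// Builds a lexer over `input`: locates escape blocks first so their
    /// bodies pass through verbatim, then splits the remaining text into words.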
    pub fn new(input: &str, interner: &'a mut Interner) -> Self {
        let escape_ranges = Self::find_escape_block_ranges(input);
        let escape_body_ranges: Vec<(usize, usize)> = escape_ranges.iter()
            .map(|(_, end, content_start, _)| (*content_start, *end))
            .collect();
        let words = Self::split_into_words(input, &escape_ranges);
        let input_len = input.len();

        Lexer {
            words,
            pos: 0,
            lexicon: Lexicon::new(),
            interner,
            input_len,
            in_let_context: false,
            mode: LexerMode::Declarative,
            source: input.to_string(),
            escape_body_ranges,
        }
    }
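
    /// Finds `escape to rust:`-style blocks. Returns, for each block, the byte
    /// range of its body, the offset where the code itself starts, and the
    /// dedented code text.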
    fn find_escape_block_ranges(source: &str) -> Vec<(usize, usize, usize, String)> {
        let mut ranges = Vec::new();
        let lines: Vec<&str> = source.split('\n').collect();
        let mut line_starts: Vec<usize> = Vec::with_capacity(lines.len());
        let mut pos = 0;
        for line in &lines {
            line_starts.push(pos);
            pos += line.len() + 1;
        }

        let mut i = 0;
        while i < lines.len() {
            let trimmed = lines[i].trim();
            let lower = trimmed.to_lowercase();
            if lower == "escape to rust:"
                || lower.ends_with(" escape to rust:")
                || (lower.starts_with("escape to ") && lower.ends_with(':'))
            {
                let header_indent = Self::measure_indent_static(lines[i]);
                i += 1;

                // Skip blank lines between the header and the body.
                let mut body_start_line = i;
                while body_start_line < lines.len() && lines[body_start_line].trim().is_empty() {
                    body_start_line += 1;
                }

                if body_start_line >= lines.len() {
                    continue;
                }

                // The body must be indented past the header.
                let base_indent = Self::measure_indent_static(lines[body_start_line]);
                if base_indent <= header_indent {
                    continue;
                }

                let body_byte_start = line_starts[body_start_line];
                let mut body_end_line = body_start_line;
                let mut code_lines: Vec<String> = Vec::new();

                let mut j = body_start_line;
                while j < lines.len() {
                    let line = lines[j];
                    if line.trim().is_empty() {
                        code_lines.push(String::new());
                        body_end_line = j;
                        j += 1;
                        continue;
                    }
                    let line_indent = Self::measure_indent_static(line);
                    if line_indent < base_indent {
                        break;
                    }
                    let stripped = Self::strip_indent(line, base_indent);
                    code_lines.push(stripped);
                    body_end_line = j;
                    j += 1;
                }

                // Drop trailing blank lines from the captured code.
                while code_lines.last().map_or(false, |l| l.is_empty()) {
                    code_lines.pop();
                }

                if !code_lines.is_empty() {
                    let body_byte_end = if body_end_line + 1 < lines.len() {
                        line_starts[body_end_line + 1]
                    } else {
                        source.len()
                    };
                    let content_start = body_byte_start + Self::leading_whitespace_bytes(lines[body_start_line]);
                    let raw_code = code_lines.join("\n");
                    ranges.push((body_byte_start, body_byte_end, content_start, raw_code));
                }

                i = j;
            } else {
                i += 1;
            }
        }

        ranges
    }

    fn leading_whitespace_bytes(line: &str) -> usize {
        let mut count = 0;
        for c in line.chars() {
            match c {
                ' ' | '\t' => count += c.len_utf8(),
                _ => break,
            }
        }
        count
    }

    fn measure_indent_static(line: &str) -> usize {
        let mut indent = 0;
        for c in line.chars() {
            match c {
                ' ' => indent += 1,
                '\t' => indent += 4,
                _ => break,
            }
        }
        indent
    }

    fn strip_indent(line: &str, count: usize) -> String {
        let mut stripped = 0;
        let mut byte_pos = 0;
        for (i, c) in line.char_indices() {
            if stripped >= count {
                byte_pos = i;
                break;
            }
            match c {
                ' ' => { stripped += 1; byte_pos = i + 1; }
                '\t' => { stripped += 4; byte_pos = i + 1; }
                _ => { byte_pos = i; break; }
            }
        }
        if stripped < count {
            byte_pos = line.len();
        }
        line[byte_pos..].to_string()
    }
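
    /// Splits `input` into `WordItem`s, recognizing string/char literals,
    /// comments, block headers, escape blocks, multi-character operators,
    /// contractions, and date/time tokens along the way.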
    fn split_into_words(input: &str, escape_ranges: &[(usize, usize, usize, String)]) -> Vec<WordItem> {
        let mut items = Vec::new();
        let mut current_word = String::new();
        let mut word_start = 0;
        let chars: Vec<char> = input.chars().collect();
        let mut char_idx = 0;
        let mut skip_count = 0;
        let mut skip_to_byte: Option<usize> = None;

        for (i, c) in input.char_indices() {
            if skip_count > 0 {
                skip_count -= 1;
                char_idx += 1;
                continue;
            }
            if let Some(end) = skip_to_byte {
                if i < end {
                    char_idx += 1;
                    continue;
                }
                skip_to_byte = None;
                word_start = i;
            }
            // An escape block starts here: emit its body as a single sentinel word.
            if let Some((_, end, content_start, raw_code)) = escape_ranges.iter().find(|(s, _, _, _)| i == *s) {
                if !current_word.is_empty() {
                    items.push(WordItem {
                        word: std::mem::take(&mut current_word),
                        trailing_punct: None,
                        start: word_start,
                        end: i,
                        punct_pos: None,
                    });
                }
                items.push(WordItem {
                    word: format!("\x00ESC:{}", raw_code),
                    trailing_punct: None,
                    start: *content_start,
                    end: *end,
                    punct_pos: None,
                });
                skip_to_byte = Some(*end);
                word_start = *end;
                char_idx += 1;
                continue;
            }
            let next_pos = i + c.len_utf8();
            match c {
                ' ' | '\t' | '\n' | '\r' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    word_start = next_pos;
                }
                '.' => {
                    // Keep the dot inside decimal numbers like "3.14".
                    let prev_is_digit = !current_word.is_empty()
                        && current_word.chars().last().map_or(false, |ch| ch.is_ascii_digit());
                    let next_is_digit = char_idx + 1 < chars.len()
                        && chars[char_idx + 1].is_ascii_digit();

                    if prev_is_digit && next_is_digit {
                        current_word.push(c);
                    } else {
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: Some(c),
                                start: word_start,
                                end: i,
                                punct_pos: Some(i),
                            });
                        } else {
                            items.push(WordItem {
                                word: String::new(),
                                trailing_punct: Some(c),
                                start: i,
                                end: next_pos,
                                punct_pos: Some(i),
                            });
                        }
                        word_start = next_pos;
                    }
                }
                '#' => {
                    if char_idx + 1 < chars.len() && chars[char_idx + 1] == '#' {
                        // "##" introduces a block header such as "## Theorem".
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                        }
                        let header_start = i;
                        let mut j = char_idx + 2;
                        while j < chars.len() && (chars[j] == ' ' || chars[j] == '\t') {
                            j += 1;
                        }
                        let mut block_word = String::from("##");
                        while j < chars.len() && chars[j].is_alphabetic() {
                            block_word.push(chars[j]);
                            j += 1;
                        }
                        if block_word.len() > 2 {
                            items.push(WordItem {
                                word: block_word,
                                trailing_punct: None,
                                start: header_start,
                                end: header_start + (j - char_idx),
                                punct_pos: None,
                            });
                        }
                        skip_count = j - char_idx - 1;
                        word_start = header_start + (j - char_idx);
                    } else {
                        // A single "#" starts a line comment; skip to end of line.
                        let mut look_ahead = char_idx + 1;
                        while look_ahead < chars.len() && chars[look_ahead] != '\n' {
                            skip_count += 1;
                            look_ahead += 1;
                        }
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                        }
                        word_start = look_ahead + 1;
                    }
                }
                '"' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }

                    let string_start = i;
                    let mut j = char_idx + 1;
                    let mut string_content = String::new();
                    while j < chars.len() && chars[j] != '"' {
                        if chars[j] == '\\' && j + 1 < chars.len() {
                            j += 1;
                            if j < chars.len() {
                                string_content.push(chars[j]);
                            }
                        } else {
                            string_content.push(chars[j]);
                        }
                        j += 1;
                    }

                    // Emit the literal with a sentinel prefix; tokenize() unpacks it.
                    items.push(WordItem {
                        word: format!("\x00STR:{}", string_content),
                        trailing_punct: None,
                        start: string_start,
                        end: if j < chars.len() { j + 1 } else { j },
                        punct_pos: None,
                    });

                    if j < chars.len() {
                        skip_count = j - char_idx;
                    } else {
                        skip_count = j - char_idx - 1;
                    }
                    word_start = if j < chars.len() { j + 1 } else { j };
                }
                '`' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }

                    let char_start = i;
                    let mut j = char_idx + 1;
                    let mut char_content = String::new();

                    if j < chars.len() {
                        if chars[j] == '\\' && j + 1 < chars.len() {
                            j += 1;
                            let escaped_char = match chars[j] {
                                'n' => '\n',
                                't' => '\t',
                                'r' => '\r',
                                '\\' => '\\',
                                '`' => '`',
                                '0' => '\0',
                                c => c,
                            };
                            char_content.push(escaped_char);
                            j += 1;
                        } else if chars[j] != '`' {
                            char_content.push(chars[j]);
                            j += 1;
                        }
                    }

                    if j < chars.len() && chars[j] == '`' {
                        j += 1;
                    }

                    items.push(WordItem {
                        word: format!("\x00CHAR:{}", char_content),
                        trailing_punct: None,
                        start: char_start,
                        end: if j <= chars.len() { char_start + (j - char_idx) } else { char_start + 1 },
                        punct_pos: None,
                    });

                    if j > char_idx + 1 {
                        skip_count = j - char_idx - 1;
                    }
                    word_start = char_start + (j - char_idx);
                }
                '-' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '>' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "->".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '<' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "<=".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '>' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: ">=".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '=' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "==".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '!' if char_idx + 1 < chars.len() && chars[char_idx + 1] == '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: None,
                            start: word_start,
                            end: i,
                            punct_pos: None,
                        });
                    }
                    items.push(WordItem {
                        word: "!=".to_string(),
                        trailing_punct: None,
                        start: i,
                        end: i + 2,
                        punct_pos: None,
                    });
                    skip_count = 1;
                    word_start = i + 2;
                }
                '-' if Self::is_date_hyphen(&current_word, &chars, char_idx) => {
                    current_word.push(c);
                }
                ':' if Self::is_time_colon(&current_word, &chars, char_idx) => {
                    current_word.push(c);
                }
                '(' | ')' | '[' | ']' | ',' | '?' | '!' | ':' | '+' | '-' | '*' | '/' | '%' | '<' | '>' | '=' => {
                    if !current_word.is_empty() {
                        items.push(WordItem {
                            word: std::mem::take(&mut current_word),
                            trailing_punct: Some(c),
                            start: word_start,
                            end: i,
                            punct_pos: Some(i),
                        });
                    } else {
                        items.push(WordItem {
                            word: String::new(),
                            trailing_punct: Some(c),
                            start: i,
                            end: next_pos,
                            punct_pos: Some(i),
                        });
                    }
                    word_start = next_pos;
                }
                '\'' => {
                    let remaining: String = chars[char_idx + 1..].iter().collect();
                    let remaining_lower = remaining.to_lowercase();

                    // Expand negative contractions ("don't", "won't", "can't");
                    // any other apostrophe is trailing punctuation, and
                    // possessive "'s" is reassembled in tokenize().
                    if remaining_lower.starts_with("t ") || remaining_lower.starts_with("t.") ||
                        remaining_lower.starts_with("t,") || remaining_lower == "t" ||
                        (char_idx + 1 < chars.len() && chars[char_idx + 1] == 't' &&
                        (char_idx + 2 >= chars.len() || !chars[char_idx + 2].is_alphabetic())) {
                        let word_lower = current_word.to_lowercase();
                        if word_lower == "don" || word_lower == "doesn" || word_lower == "didn" {
                            let base = if word_lower == "don" { "do" }
                                else if word_lower == "doesn" { "does" }
                                else { "did" };
                            items.push(WordItem {
                                word: base.to_string(),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                            items.push(WordItem {
                                word: "not".to_string(),
                                trailing_punct: None,
                                start: i,
                                end: i + 2,
                                punct_pos: None,
                            });
                            current_word.clear();
                            word_start = next_pos + 1;
                            skip_count = 1;
                        } else if word_lower == "won" {
                            items.push(WordItem {
                                word: "will".to_string(),
                                trailing_punct: None,
                                start: word_start,
                                end: i,
                                punct_pos: None,
                            });
                            items.push(WordItem {
                                word: "not".to_string(),
                                trailing_punct: None,
                                start: i,
                                end: i + 2,
                                punct_pos: None,
                            });
                            current_word.clear();
                            word_start = next_pos + 1;
                            skip_count = 1;
                        } else if word_lower == "can" {
                            items.push(WordItem {
                                word: "cannot".to_string(),
                                trailing_punct: None,
                                start: word_start,
                                end: i + 2,
                                punct_pos: None,
                            });
                            current_word.clear();
                            word_start = next_pos + 1;
                            skip_count = 1;
                        } else {
                            if !current_word.is_empty() {
                                items.push(WordItem {
                                    word: std::mem::take(&mut current_word),
                                    trailing_punct: Some('\''),
                                    start: word_start,
                                    end: i,
                                    punct_pos: Some(i),
                                });
                            }
                            word_start = next_pos;
                        }
                    } else {
                        if !current_word.is_empty() {
                            items.push(WordItem {
                                word: std::mem::take(&mut current_word),
                                trailing_punct: Some('\''),
                                start: word_start,
                                end: i,
                                punct_pos: Some(i),
                            });
                        }
                        word_start = next_pos;
                    }
                }
                c if c.is_alphabetic() || c.is_ascii_digit() || (c == '.' && !current_word.is_empty() && current_word.chars().all(|ch| ch.is_ascii_digit())) || c == '_' => {
                    if current_word.is_empty() {
                        word_start = i;
                    }
                    current_word.push(c);
                }
                _ => {
                    word_start = next_pos;
                }
            }
            char_idx += 1;
        }

        if !current_word.is_empty() {
            items.push(WordItem {
                word: current_word,
                trailing_punct: None,
                start: word_start,
                end: input.len(),
                punct_pos: None,
            });
        }

        items
    }

    fn peek_word(&self, offset: usize) -> Option<&str> {
        self.words.get(self.pos + offset).map(|w| w.word.as_str())
    }

    fn peek_sequence(&self, expected: &[&str]) -> bool {
        for (i, &exp) in expected.iter().enumerate() {
            match self.peek_word(i + 1) {
                Some(w) if w.to_lowercase() == exp => continue,
                _ => return false,
            }
        }
        true
    }

    fn consume_words(&mut self, count: usize) {
        self.pos += count;
    }
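
    /// Runs the full pipeline: classifies each word into a `Token`, expands
    /// trailing punctuation and possessives, appends an EOF token, and finally
    /// splices in Indent/Dedent tokens from the line-level pass.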
    pub fn tokenize(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();

        while self.pos < self.words.len() {
            let item = &self.words[self.pos];
            let word = item.word.clone();
            let trailing_punct = item.trailing_punct;
            let word_start = item.start;
            let word_end = item.end;
            let punct_pos = item.punct_pos;

            if word.is_empty() {
                if let Some(punct) = trailing_punct {
                    let kind = match punct {
                        '(' => TokenType::LParen,
                        ')' => TokenType::RParen,
                        '[' => TokenType::LBracket,
                        ']' => TokenType::RBracket,
                        ',' => TokenType::Comma,
                        ':' => TokenType::Colon,
                        '.' | '?' => {
                            self.in_let_context = false;
                            TokenType::Period
                        }
                        '!' => TokenType::Exclamation,
                        '+' => TokenType::Plus,
                        '-' => TokenType::Minus,
                        '*' => TokenType::Star,
                        '/' => TokenType::Slash,
                        '%' => TokenType::Percent,
                        '<' => TokenType::Lt,
                        '>' => TokenType::Gt,
                        '=' => TokenType::Assign,
                        _ => {
                            self.pos += 1;
                            continue;
                        }
                    };
                    let lexeme = self.interner.intern(&punct.to_string());
                    let span = Span::new(word_start, word_end);
                    tokens.push(Token::new(kind, lexeme, span));
                }
                self.pos += 1;
                continue;
            }

            // Unpack the sentinel words produced by split_into_words.
            if word.starts_with("\x00STR:") {
                let content = &word[5..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::StringLiteral(sym), sym, span));
                self.pos += 1;
                continue;
            }

            if word.starts_with("\x00CHAR:") {
                let content = &word[6..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::CharLiteral(sym), sym, span));
                self.pos += 1;
                continue;
            }

            if word.starts_with("\x00ESC:") {
                let content = &word[5..];
                let sym = self.interner.intern(content);
                let span = Span::new(word_start, word_end);
                tokens.push(Token::new(TokenType::EscapeBlock(sym), sym, span));
                self.pos += 1;
                continue;
            }

            let kind = self.classify_with_lookahead(&word);
            let lexeme = self.interner.intern(&word);
            let span = Span::new(word_start, word_end);
            tokens.push(Token::new(kind, lexeme, span));

            if let Some(punct) = trailing_punct {
                if punct == '\'' {
                    // An apostrophe followed by "s" becomes a possessive token.
                    if let Some(next_item) = self.words.get(self.pos + 1) {
                        if next_item.word.to_lowercase() == "s" {
                            let poss_lexeme = self.interner.intern("'s");
                            let poss_start = punct_pos.unwrap_or(word_end);
                            let poss_end = next_item.end;
                            tokens.push(Token::new(TokenType::Possessive, poss_lexeme, Span::new(poss_start, poss_end)));
                            self.pos += 1;
                            if let Some(s_punct) = next_item.trailing_punct {
                                let kind = match s_punct {
                                    '(' => TokenType::LParen,
                                    ')' => TokenType::RParen,
                                    '[' => TokenType::LBracket,
                                    ']' => TokenType::RBracket,
                                    ',' => TokenType::Comma,
                                    ':' => TokenType::Colon,
                                    '.' | '?' => TokenType::Period,
                                    '!' => TokenType::Exclamation,
                                    '+' => TokenType::Plus,
                                    '-' => TokenType::Minus,
                                    '*' => TokenType::Star,
                                    '/' => TokenType::Slash,
                                    '%' => TokenType::Percent,
                                    '<' => TokenType::Lt,
                                    '>' => TokenType::Gt,
                                    '=' => TokenType::Assign,
                                    _ => {
                                        self.pos += 1;
                                        continue;
                                    }
                                };
                                let s_punct_pos = next_item.punct_pos.unwrap_or(next_item.end);
                                let lexeme = self.interner.intern(&s_punct.to_string());
                                tokens.push(Token::new(kind, lexeme, Span::new(s_punct_pos, s_punct_pos + 1)));
                            }
                            self.pos += 1;
                            continue;
                        }
                    }
                    self.pos += 1;
                    continue;
                }

                let kind = match punct {
                    '(' => TokenType::LParen,
                    ')' => TokenType::RParen,
                    '[' => TokenType::LBracket,
                    ']' => TokenType::RBracket,
                    ',' => TokenType::Comma,
                    ':' => TokenType::Colon,
                    '.' | '?' => {
                        self.in_let_context = false;
                        TokenType::Period
                    }
                    '!' => TokenType::Exclamation,
                    '+' => TokenType::Plus,
                    '-' => TokenType::Minus,
                    '*' => TokenType::Star,
                    '/' => TokenType::Slash,
                    '%' => TokenType::Percent,
                    '<' => TokenType::Lt,
                    '>' => TokenType::Gt,
                    '=' => TokenType::Assign,
                    _ => {
                        self.pos += 1;
                        continue;
                    }
                };
                let p_start = punct_pos.unwrap_or(word_end);
                let lexeme = self.interner.intern(&punct.to_string());
                tokens.push(Token::new(kind, lexeme, Span::new(p_start, p_start + 1)));
            }

            self.pos += 1;
        }

        let eof_lexeme = self.interner.intern("");
        let eof_span = Span::new(self.input_len, self.input_len);
        tokens.push(Token::new(TokenType::EOF, eof_lexeme, eof_span));

        self.insert_indentation_tokens(tokens)
    }
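
    /// Merges Indent/Dedent events from `LineLexer` into the flat token
    /// stream, skipping events that fall inside escape-block bodies and
    /// keeping EOF as the final token.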
    fn insert_indentation_tokens(&mut self, tokens: Vec<Token>) -> Vec<Token> {
        let mut result = Vec::new();
        let empty_sym = self.interner.intern("");

        let line_lexer = LineLexer::new(&self.source);
        let line_tokens: Vec<LineToken> = line_lexer.collect();

        // (byte position, is_indent) pairs, in source order.
        let mut structural_events: Vec<(usize, bool)> = Vec::new();
        let mut pending_indents = 0usize;
        let mut pending_dedents = 0usize;

        for line_token in &line_tokens {
            match line_token {
                LineToken::Indent => {
                    pending_indents += 1;
                }
                LineToken::Dedent => {
                    pending_dedents += 1;
                }
                LineToken::Content { start, .. } => {
                    // Dedents attach before indents at the same position.
                    for _ in 0..pending_dedents {
                        structural_events.push((*start, false));
                    }
                    pending_dedents = 0;

                    for _ in 0..pending_indents {
                        structural_events.push((*start, true));
                    }
                    pending_indents = 0;
                }
                LineToken::Newline => {}
            }
        }

        for _ in 0..pending_dedents {
            structural_events.push((self.input_len, false));
        }

        if !self.escape_body_ranges.is_empty() {
            // Indentation inside escape-block bodies is not structural.
            let mut filtered = Vec::new();
            for &(pos, is_indent) in &structural_events {
                let is_inside_escape_body = self.escape_body_ranges.iter().any(|(start, end)| {
                    pos > *start && pos < *end
                });
                if !is_inside_escape_body {
                    filtered.push((pos, is_indent));
                }
            }
            structural_events = filtered;
        }

        structural_events.sort_by(|a, b| {
            if a.0 != b.0 {
                a.0.cmp(&b.0)
            } else {
                a.1.cmp(&b.1)
            }
        });

        let mut event_idx = 0;
        let mut last_colon_pos: Option<usize> = None;

        for token in tokens.iter() {
            let token_start = token.span.start;

            while event_idx < structural_events.len() {
                let (event_pos, is_indent) = structural_events[event_idx];

                if event_pos <= token_start {
                    // Anchor Indent spans at the colon that opened the block, if any.
                    let span = if is_indent {
                        Span::new(last_colon_pos.unwrap_or(event_pos), last_colon_pos.unwrap_or(event_pos))
                    } else {
                        Span::new(event_pos, event_pos)
                    };
                    let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
                    result.push(Token::new(kind, empty_sym, span));
                    event_idx += 1;
                } else {
                    break;
                }
            }

            result.push(token.clone());

            if token.kind == TokenType::Colon && self.is_end_of_line(token.span.end) {
                last_colon_pos = Some(token.span.end);
            }
        }

        while event_idx < structural_events.len() {
            let (event_pos, is_indent) = structural_events[event_idx];
            let span = Span::new(event_pos, event_pos);
            let kind = if is_indent { TokenType::Indent } else { TokenType::Dedent };
            result.push(Token::new(kind, empty_sym, span));
            event_idx += 1;
        }

        // Keep EOF as the very last token.
        let eof_pos = result.iter().position(|t| t.kind == TokenType::EOF);
        if let Some(pos) = eof_pos {
            let eof = result.remove(pos);
            result.push(eof);
        }

        result
    }

    fn is_end_of_line(&self, from_pos: usize) -> bool {
        let bytes = self.source.as_bytes();
        let mut pos = from_pos;
        while pos < bytes.len() {
            match bytes[pos] {
                b' ' | b'\t' => pos += 1,
                b'\n' => return true,
                _ => return false,
            }
        }
        true
    }

    fn measure_next_line_indent(&self, from_pos: usize) -> Option<usize> {
        let bytes = self.source.as_bytes();
        let mut pos = from_pos;

        while pos < bytes.len() && bytes[pos] != b'\n' {
            pos += 1;
        }

        if pos >= bytes.len() {
            return None;
        }

        pos += 1;

        let mut indent = 0;
        while pos < bytes.len() {
            match bytes[pos] {
                b' ' => indent += 1,
                b'\t' => indent += 4,
                b'\n' => {
                    indent = 0;
                }
                _ => break,
            }
            pos += 1;
        }

        if pos >= bytes.len() {
            return None;
        }

        Some(indent)
    }

    fn word_to_number(word: &str) -> Option<u32> {
        lexicon::word_to_number(&word.to_lowercase())
    }
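
    /// Returns true when the hyphen at `char_idx` continues an ISO date
    /// (`YYYY-MM` or `YYYY-MM-DD`) being accumulated in `current_word`.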
    fn is_date_hyphen(current_word: &str, chars: &[char], char_idx: usize) -> bool {
        let word_chars: Vec<char> = current_word.chars().collect();

        // "YYYY" so far, hyphen, then "MM-DD" ahead.
        if word_chars.len() == 4 && word_chars.iter().all(|c| c.is_ascii_digit()) {
            if char_idx + 5 < chars.len()
                && chars[char_idx + 1].is_ascii_digit()
                && chars[char_idx + 2].is_ascii_digit()
                && chars[char_idx + 3] == '-'
                && chars[char_idx + 4].is_ascii_digit()
                && chars[char_idx + 5].is_ascii_digit()
            {
                return true;
            }
        }

        // "YYYY-MM" so far, hyphen, then "DD" ahead.
        if word_chars.len() == 7
            && word_chars[0..4].iter().all(|c| c.is_ascii_digit())
            && word_chars[4] == '-'
            && word_chars[5..7].iter().all(|c| c.is_ascii_digit())
        {
            if char_idx + 2 < chars.len()
                && chars[char_idx + 1].is_ascii_digit()
                && chars[char_idx + 2].is_ascii_digit()
            {
                let next_not_digit = char_idx + 3 >= chars.len()
                    || !chars[char_idx + 3].is_ascii_digit();
                if next_not_digit {
                    return true;
                }
            }
        }

        false
    }
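
    /// Returns true when the colon at `char_idx` continues a clock time like
    /// "3:30pm": one or two digits so far, two digits plus am/pm ahead.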
    fn is_time_colon(current_word: &str, chars: &[char], char_idx: usize) -> bool {
        let word_chars: Vec<char> = current_word.chars().collect();
        if word_chars.is_empty() || word_chars.len() > 2 {
            return false;
        }
        if !word_chars.iter().all(|c| c.is_ascii_digit()) {
            return false;
        }

        if char_idx + 4 < chars.len()
            && chars[char_idx + 1].is_ascii_digit()
            && chars[char_idx + 2].is_ascii_digit()
        {
            let next_two: String = chars[char_idx + 3..char_idx + 5].iter().collect();
            let lower = next_two.to_lowercase();
            if lower == "am" || lower == "pm" {
                let after_suffix = char_idx + 5 >= chars.len()
                    || !chars[char_idx + 5].is_alphabetic();
                if after_suffix {
                    return true;
                }
            }
        }

        false
    }

    fn is_numeric_literal(word: &str) -> bool {
        if word.is_empty() {
            return false;
        }
        let chars: Vec<char> = word.chars().collect();
        let first = chars[0];
        if first.is_ascii_digit() {
            return true;
        }
        // Indexed math symbols like "aleph_0" also count as numeric literals.
        if let Some(underscore_pos) = word.rfind('_') {
            let before_underscore = &word[..underscore_pos];
            let after_underscore = &word[underscore_pos + 1..];
            let is_math_symbol = matches!(
                before_underscore.to_lowercase().as_str(),
                "aleph" | "omega" | "beth"
            );
            if is_math_symbol
                && !after_underscore.is_empty()
                && after_underscore.chars().all(|c| c.is_ascii_digit())
            {
                return true;
            }
        }
        false
    }
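
    /// Parses a duration literal like "250ms" or "2h" into nanoseconds,
    /// returning the value and the suffix that matched. Overlapping suffixes
    /// are ordered longest first ("sec" before "s", "hr" before "h").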
    fn parse_duration_literal(word: &str) -> Option<(i64, &str)> {
        if word.is_empty() || !word.chars().next()?.is_ascii_digit() {
            return None;
        }

        const SUFFIXES: &[(&str, i64)] = &[
            ("ns", 1),
            ("μs", 1_000),
            ("us", 1_000),
            ("ms", 1_000_000),
            ("sec", 1_000_000_000),
            ("s", 1_000_000_000),
            ("min", 60_000_000_000),
            ("hr", 3_600_000_000_000),
            ("h", 3_600_000_000_000),
        ];

        for (suffix, multiplier) in SUFFIXES {
            if word.ends_with(suffix) {
                let num_part = &word[..word.len() - suffix.len()];
                let cleaned: String = num_part.chars().filter(|c| *c != '_').collect();
                if let Ok(n) = cleaned.parse::<i64>() {
                    return Some((n.saturating_mul(*multiplier), *suffix));
                }
            }
        }

        None
    }
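
    /// Parses an ISO "YYYY-MM-DD" literal into days since the Unix epoch
    /// using the standard days-from-civil calendar arithmetic.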
    fn parse_date_literal(word: &str) -> Option<i32> {
        if word.len() != 10 {
            return None;
        }

        let bytes = word.as_bytes();

        if bytes[4] != b'-' || bytes[7] != b'-' {
            return None;
        }

        let year: i32 = word[0..4].parse().ok()?;
        let month: u32 = word[5..7].parse().ok()?;
        let day: u32 = word[8..10].parse().ok()?;

        if month < 1 || month > 12 || day < 1 || day > 31 {
            return None;
        }

        // days_from_civil: count days relative to 1970-01-01, treating March
        // as the first month of the year so leap days fall at year end.
        let y = if month <= 2 { year - 1 } else { year };
        let era = if y >= 0 { y / 400 } else { (y - 399) / 400 };
        let yoe = (y - era * 400) as u32;
        let m = month;
        let doy = (153 * (if m > 2 { m - 3 } else { m + 9 }) + 2) / 5 + day - 1;
        let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
        let days = era * 146097 + doe as i32 - 719468;

        Some(days)
    }
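
    /// Parses a 12-hour clock literal ("3pm", "11:45am", "noon", "midnight")
    /// into nanoseconds from midnight.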
    fn parse_time_literal(word: &str) -> Option<i64> {
        let lower = word.to_lowercase();

        if lower == "noon" {
            return Some(12i64 * 3600 * 1_000_000_000);
        }
        if lower == "midnight" {
            return Some(0);
        }

        let is_pm = lower.ends_with("pm");
        let is_am = lower.ends_with("am");

        if !is_pm && !is_am {
            return None;
        }

        let time_part = &lower[..lower.len() - 2];

        let (hour, minute): (i64, i64) = if let Some(colon_idx) = time_part.find(':') {
            let hour_str = &time_part[..colon_idx];
            let min_str = &time_part[colon_idx + 1..];
            let h: i64 = hour_str.parse().ok()?;
            let m: i64 = min_str.parse().ok()?;
            (h, m)
        } else {
            let h: i64 = time_part.parse().ok()?;
            (h, 0)
        };

        if hour < 1 || hour > 12 || minute < 0 || minute > 59 {
            return None;
        }

        // 12am is midnight; 12pm is noon.
        let hour_24 = if is_am {
            if hour == 12 { 0 } else { hour }
        } else {
            if hour == 12 { 12 } else { hour + 12 }
        };

        let nanos = (hour_24 * 3600 + minute * 60) * 1_000_000_000;
        Some(nanos)
    }
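
    /// Classifies a word using lookahead over the following words: block
    /// headers, multi-word forms ("each other", "at least two", "if and only
    /// if"), literals, and the article-versus-proper-name ambiguity for
    /// capitalized "A"/"An".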
    fn classify_with_lookahead(&mut self, word: &str) -> TokenType {
        if word.starts_with("##") {
            let block_name = &word[2..];
            let block_type = match block_name.to_lowercase().as_str() {
                "theorem" => BlockType::Theorem,
                "main" => BlockType::Main,
                "definition" => BlockType::Definition,
                "proof" => BlockType::Proof,
                "example" => BlockType::Example,
                "logic" => BlockType::Logic,
                "note" => BlockType::Note,
                "to" => BlockType::Function,
                "a" | "an" => BlockType::TypeDef,
                "policy" => BlockType::Policy,
                "requires" => BlockType::Requires,
                _ => BlockType::Note,
            };

            // Function and Main bodies are imperative; everything else is prose.
            self.mode = match block_type {
                BlockType::Main | BlockType::Function => LexerMode::Imperative,
                _ => LexerMode::Declarative,
            };

            return TokenType::BlockHeader { block_type };
        }

        let lower = word.to_lowercase();

        if lower == "each" && self.peek_sequence(&["other"]) {
            self.consume_words(1);
            return TokenType::Reciprocal;
        }

        if lower == "to" {
            if let Some(next) = self.peek_word(1) {
                if self.is_verb_like(next) {
                    return TokenType::To;
                }
            }
            let sym = self.interner.intern("to");
            return TokenType::Preposition(sym);
        }

        if lower == "at" {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                if next_lower == "least" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtLeast(n);
                        }
                    }
                }
                if next_lower == "most" {
                    if let Some(num_word) = self.peek_word(2) {
                        if let Some(n) = Self::word_to_number(num_word) {
                            self.consume_words(2);
                            return TokenType::AtMost(n);
                        }
                    }
                }
            }
        }

        if let Some(n) = Self::word_to_number(&lower) {
            return TokenType::Cardinal(n);
        }

        if let Some((nanos, unit)) = Self::parse_duration_literal(word) {
            let unit_sym = self.interner.intern(unit);
            return TokenType::DurationLiteral {
                nanos,
                original_unit: unit_sym,
            };
        }

        if let Some(days) = Self::parse_date_literal(word) {
            return TokenType::DateLiteral { days };
        }

        if let Some(nanos_from_midnight) = Self::parse_time_literal(word) {
            return TokenType::TimeLiteral { nanos_from_midnight };
        }

        if Self::is_numeric_literal(word) {
            let sym = self.interner.intern(word);
            return TokenType::Number(sym);
        }

        if lower == "if" && self.peek_sequence(&["and", "only", "if"]) {
            self.consume_words(3);
            return TokenType::Iff;
        }

        if lower == "is" {
            if self.peek_sequence(&["equal", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
            if self.peek_sequence(&["identical", "to"]) {
                self.consume_words(2);
                return TokenType::Identity;
            }
        }

        // Capitalized "A"/"An": article or the proper name "A"? Decide from context.
        if (lower == "a" || lower == "an") && word.chars().next().unwrap().is_uppercase() {
            if let Some(next) = self.peek_word(1) {
                let next_lower = next.to_lowercase();
                let next_starts_lowercase = next.chars().next().map(|c| c.is_lowercase()).unwrap_or(false);

                if matches!(next_lower.as_str(), "if" | "and" | "or" | "implies" | "iff") {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                let is_verb = self.lexicon.lookup_verb(&next_lower).is_some()
                    && !lexicon::is_disambiguation_not_verb(&next_lower);
                let is_gerund = next_lower.ends_with("ing");
                let is_also_noun_or_adj = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_verb && !is_gerund && !is_also_noun_or_adj {
                    let sym = self.interner.intern(word);
                    return TokenType::ProperName(sym);
                }

                if let Some(third) = self.peek_word(2) {
                    let third_lower = third.to_lowercase();
                    if third_lower == "is" || third_lower == "are" || third_lower == "has" {
                        return TokenType::Article(Definiteness::Indefinite);
                    }
                }

                let is_content_word = self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower);
                if is_content_word || next_starts_lowercase {
                    return TokenType::Article(Definiteness::Indefinite);
                }
            }
            let sym = self.interner.intern(word);
            return TokenType::ProperName(sym);
        }

        self.classify_word(word)
    }

    fn is_noun_like(&self, word: &str) -> bool {
        if lexicon::is_noun_pattern(word) || lexicon::is_common_noun(word) {
            return true;
        }
        if word.ends_with("er") || word.ends_with("ian") || word.ends_with("ist") {
            return true;
        }
        false
    }

    fn is_adjective_like(&self, word: &str) -> bool {
        lexicon::is_adjective(word) || lexicon::is_non_intersective(word)
    }
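
    /// Single-word classification, tried in precedence order: operators,
    /// keywords, pronouns, articles, auxiliaries, mode-dependent imperative
    /// keywords, adverbs, comparatives and superlatives, verbs, nouns, and
    /// finally a fallback to adjective.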
1874 fn classify_word(&mut self, word: &str) -> TokenType {
1875 let lower = word.to_lowercase();
1876 let first_char = word.chars().next().unwrap();
1877
1878 if lower == "that" {
1881 if let Some(next) = self.peek_word(1) {
1882 let next_lower = next.to_lowercase();
1883 if self.is_noun_like(&next_lower) || self.is_adjective_like(&next_lower) {
1884 return TokenType::Article(Definiteness::Distal);
1885 }
1886 }
1887 }
1888
1889 if word == "->" {
1891 return TokenType::Arrow;
1892 }
1893
1894 if word == "<=" {
1896 return TokenType::LtEq;
1897 }
1898 if word == ">=" {
1899 return TokenType::GtEq;
1900 }
1901 if word == "==" {
1902 return TokenType::EqEq;
1903 }
1904 if word == "!=" {
1905 return TokenType::NotEq;
1906 }
1907 if word == "<" {
1908 return TokenType::Lt;
1909 }
1910 if word == ">" {
1911 return TokenType::Gt;
1912 }
1913 if word == "=" {
1915 return TokenType::Assign;
1916 }
1917
1918 if let Some(kind) = lexicon::lookup_keyword(&lower) {
1919 return kind;
1920 }
1921
1922 if let Some(kind) = lexicon::lookup_pronoun(&lower) {
1923 return kind;
1924 }
1925
1926 if let Some(def) = lexicon::lookup_article(&lower) {
1927 return TokenType::Article(def);
1928 }
1929
1930 if let Some(time) = lexicon::lookup_auxiliary(&lower) {
1931 return TokenType::Auxiliary(time);
1932 }
1933
1934 match lower.as_str() {
1936 "call" => return TokenType::Call,
1937 "in" if self.mode == LexerMode::Imperative => return TokenType::In,
1938 "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
1940 "at" if self.mode == LexerMode::Imperative => return TokenType::At,
1942 "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
1944 "before" => return TokenType::Before,
1946 _ => {}
1947 }
1948
1949 if lexicon::is_preposition(&lower) {
1950 let sym = self.interner.intern(&lower);
1951 return TokenType::Preposition(sym);
1952 }
1953
1954 match lower.as_str() {
1955 "equals" => return TokenType::Equals,
1956 "item" => return TokenType::Item,
1957 "items" => return TokenType::Items,
1958 "mut" if self.mode == LexerMode::Imperative => return TokenType::Mut,
1960 "let" => {
1961 self.in_let_context = true;
1962 return TokenType::Let;
1963 }
1964 "set" => {
1965 if self.peek_word(1).map_or(false, |w| w.to_lowercase() == "of") {
1968 } else if self.mode == LexerMode::Imperative {
1970 return TokenType::Set;
1972 } else {
1973 for offset in 2..=5 {
1976 if self.peek_word(offset).map_or(false, |w| w.to_lowercase() == "to") {
1977 return TokenType::Set;
1978 }
1979 }
1980 }
1981 }
1982 "return" => return TokenType::Return,
1983 "be" if self.in_let_context => {
1984 self.in_let_context = false;
1985 return TokenType::Be;
1986 }
1987 "while" => return TokenType::While,
1988 "assert" => return TokenType::Assert,
1989 "trust" => return TokenType::Trust,
1990 "check" => return TokenType::Check,
1991 "given" if self.mode == LexerMode::Declarative => return TokenType::Given,
1993 "prove" if self.mode == LexerMode::Declarative => return TokenType::Prove,
1994 "auto" if self.mode == LexerMode::Declarative => return TokenType::Auto,
1995 "listen" if self.mode == LexerMode::Imperative => return TokenType::Listen,
1997 "connect" if self.mode == LexerMode::Imperative => return TokenType::NetConnect,
1998 "sleep" if self.mode == LexerMode::Imperative => return TokenType::Sleep,
1999 "sync" if self.mode == LexerMode::Imperative => return TokenType::Sync,
2001 "mount" if self.mode == LexerMode::Imperative => return TokenType::Mount,
2003 "persistent" => return TokenType::Persistent, "combined" if self.mode == LexerMode::Imperative => return TokenType::Combined,
2005 "launch" if self.mode == LexerMode::Imperative => return TokenType::Launch,
2009 "task" if self.mode == LexerMode::Imperative => return TokenType::Task,
2010 "pipe" if self.mode == LexerMode::Imperative => return TokenType::Pipe,
2011 "receive" if self.mode == LexerMode::Imperative => return TokenType::Receive,
2012 "stop" if self.mode == LexerMode::Imperative => return TokenType::Stop,
2013 "try" if self.mode == LexerMode::Imperative => return TokenType::Try,
2014 "into" if self.mode == LexerMode::Imperative => return TokenType::Into,
2015 "native" => return TokenType::Native,
2016 "escape" if self.mode == LexerMode::Imperative => return TokenType::Escape,
2017 "from" => return TokenType::From,
2018 "otherwise" => return TokenType::Otherwise,
2019 "else" => return TokenType::Else,
2021 "elif" => return TokenType::Elif,
2022 "either" if self.mode == LexerMode::Declarative => return TokenType::Either,
2024 "inspect" if self.mode == LexerMode::Imperative => return TokenType::Inspect,
2026 "new" if self.mode == LexerMode::Imperative => return TokenType::New,
2028 "give" if self.mode == LexerMode::Imperative => return TokenType::Give,
2031 "show" if self.mode == LexerMode::Imperative => return TokenType::Show,
2032 "push" if self.mode == LexerMode::Imperative => return TokenType::Push,
2034 "pop" if self.mode == LexerMode::Imperative => return TokenType::Pop,
2035 "copy" if self.mode == LexerMode::Imperative => return TokenType::Copy,
2036 "through" if self.mode == LexerMode::Imperative => return TokenType::Through,
2037 "length" if self.mode == LexerMode::Imperative => return TokenType::Length,
2038 "at" if self.mode == LexerMode::Imperative => return TokenType::At,
2039 "add" if self.mode == LexerMode::Imperative => return TokenType::Add,
2041 "remove" if self.mode == LexerMode::Imperative => return TokenType::Remove,
2042 "contains" if self.mode == LexerMode::Imperative => return TokenType::Contains,
2043 "union" if self.mode == LexerMode::Imperative => return TokenType::Union,
2044 "intersection" if self.mode == LexerMode::Imperative => return TokenType::Intersection,
2045 "inside" if self.mode == LexerMode::Imperative => return TokenType::Inside,
2047 "zone" if self.mode == LexerMode::Imperative => return TokenType::Zone,
2048 "called" if self.mode == LexerMode::Imperative => return TokenType::Called,
2049 "size" if self.mode == LexerMode::Imperative => return TokenType::Size,
2050 "mapped" if self.mode == LexerMode::Imperative => return TokenType::Mapped,
2051 "attempt" if self.mode == LexerMode::Imperative => return TokenType::Attempt,
2053 "following" if self.mode == LexerMode::Imperative => return TokenType::Following,
2054 "simultaneously" if self.mode == LexerMode::Imperative => return TokenType::Simultaneously,
2055 "read" if self.mode == LexerMode::Imperative => return TokenType::Read,
2057 "write" if self.mode == LexerMode::Imperative => return TokenType::Write,
2058 "console" if self.mode == LexerMode::Imperative => return TokenType::Console,
2059 "file" if self.mode == LexerMode::Imperative => return TokenType::File,
2060 "spawn" if self.mode == LexerMode::Imperative => return TokenType::Spawn,
2062 "send" if self.mode == LexerMode::Imperative => return TokenType::Send,
2063 "await" if self.mode == LexerMode::Imperative => return TokenType::Await,
2064 "portable" => return TokenType::Portable,
2066 "manifest" if self.mode == LexerMode::Imperative => return TokenType::Manifest,
2068 "chunk" if self.mode == LexerMode::Imperative => return TokenType::Chunk,
2069 "shared" => return TokenType::Shared, "merge" if self.mode == LexerMode::Imperative => return TokenType::Merge,
2072 "increase" if self.mode == LexerMode::Imperative => return TokenType::Increase,
2073 "decrease" if self.mode == LexerMode::Imperative => return TokenType::Decrease,
2075 "append" if self.mode == LexerMode::Imperative => return TokenType::Append,
2076 "resolve" if self.mode == LexerMode::Imperative => return TokenType::Resolve,
2077 "values" if self.mode == LexerMode::Imperative => return TokenType::Values,
2078 "tally" => return TokenType::Tally,
2080 "sharedset" => return TokenType::SharedSet,
2081 "sharedsequence" => return TokenType::SharedSequence,
2082 "collaborativesequence" => return TokenType::CollaborativeSequence,
2083 "sharedmap" => return TokenType::SharedMap,
2084 "divergent" => return TokenType::Divergent,
2085 "removewins" => return TokenType::RemoveWins,
2086 "addwins" => return TokenType::AddWins,
2087 "yata" => return TokenType::YATA,
2088 "day" | "days" => return TokenType::CalendarUnit(CalendarUnit::Day),
2090 "week" | "weeks" => return TokenType::CalendarUnit(CalendarUnit::Week),
2091 "month" | "months" => return TokenType::CalendarUnit(CalendarUnit::Month),
2092 "year" | "years" => return TokenType::CalendarUnit(CalendarUnit::Year),
2093 "ago" => return TokenType::Ago,
2095 "hence" => return TokenType::Hence,
2096 "if" => return TokenType::If,
2097 "only" => return TokenType::Focus(FocusKind::Only),
2098 "even" => return TokenType::Focus(FocusKind::Even),
2099 "just" if self.peek_word(1).map_or(false, |w| {
2100 !self.is_verb_like(w) || w.to_lowercase() == "john" || w.chars().next().map_or(false, |c| c.is_uppercase())
2101 }) => return TokenType::Focus(FocusKind::Just),
2102 "much" => return TokenType::Measure(MeasureKind::Much),
2103 "little" => return TokenType::Measure(MeasureKind::Little),
2104 _ => {}
2105 }
2106
2107 if lexicon::is_scopal_adverb(&lower) {
2108 let sym = self.interner.intern(&Self::capitalize(&lower));
2109 return TokenType::ScopalAdverb(sym);
2110 }
2111
2112 if lexicon::is_temporal_adverb(&lower) {
2113 let sym = self.interner.intern(&Self::capitalize(&lower));
2114 return TokenType::TemporalAdverb(sym);
2115 }
2116
2117 if lexicon::is_non_intersective(&lower) {
2118 let sym = self.interner.intern(&Self::capitalize(&lower));
2119 return TokenType::NonIntersectiveAdjective(sym);
2120 }
2121
2122 if lexicon::is_adverb(&lower) {
2123 let sym = self.interner.intern(&Self::capitalize(&lower));
2124 return TokenType::Adverb(sym);
2125 }
2126 if lower.ends_with("ly") && !lexicon::is_not_adverb(&lower) && lower.len() > 4 {
2127 let sym = self.interner.intern(&Self::capitalize(&lower));
2128 return TokenType::Adverb(sym);
2129 }
2130
2131 if let Some(base) = self.try_parse_superlative(&lower) {
2132 let sym = self.interner.intern(&base);
2133 return TokenType::Superlative(sym);
2134 }
2135
2136 let irregular_comparative = match lower.as_str() {
2138 "less" => Some("Little"),
2139 "more" => Some("Much"),
2140 "better" => Some("Good"),
2141 "worse" => Some("Bad"),
2142 _ => None,
2143 };
2144 if let Some(base) = irregular_comparative {
2145 let sym = self.interner.intern(base);
2146 return TokenType::Comparative(sym);
2147 }
2148
2149 if let Some(base) = self.try_parse_comparative(&lower) {
2150 let sym = self.interner.intern(&base);
2151 return TokenType::Comparative(sym);
2152 }
2153
2154 if lexicon::is_performative(&lower) {
2155 let sym = self.interner.intern(&Self::capitalize(&lower));
2156 return TokenType::Performative(sym);
2157 }
2158
2159 if lexicon::is_base_verb_early(&lower) {
2160 let sym = self.interner.intern(&Self::capitalize(&lower));
2161 let class = lexicon::lookup_verb_class(&lower);
2162 return TokenType::Verb {
2163 lemma: sym,
2164 time: Time::Present,
2165 aspect: Aspect::Simple,
2166 class,
2167 };
2168 }
2169
2170 if lower.ends_with("ing") && lower.len() > 4 {
2173 if let Some(entry) = self.lexicon.lookup_verb(&lower) {
2174 let sym = self.interner.intern(&entry.lemma);
2175 return TokenType::Verb {
2176 lemma: sym,
2177 time: entry.time,
2178 aspect: entry.aspect,
2179 class: entry.class,
2180 };
2181 }
2182 }
2183
2184 if first_char.is_uppercase() {
2185 if let Some(next) = self.peek_word(1) {
2192 let next_lower = next.to_lowercase();
2193 let is_followed_by_verb = self.lexicon.lookup_verb(&next_lower).is_some()
2195 || matches!(next_lower.as_str(), "is" | "are" | "was" | "were" | "has" | "have" | "had");
2196
2197 if is_followed_by_verb {
2198 if let Some(analysis) = lexicon::analyze_word(&lower) {
2200 match analysis {
2201 lexicon::WordAnalysis::Noun(meta) if meta.number == lexicon::Number::Plural => {
2202 let sym = self.interner.intern(&lower);
2204 return TokenType::Noun(sym);
2205 }
2206 lexicon::WordAnalysis::DerivedNoun { number: lexicon::Number::Plural, .. } => {
2207 let sym = self.interner.intern(&lower);
2209 return TokenType::Noun(sym);
2210 }
2211 _ => {
2212 }
2215 }
2216 }
2217 }
2218 }
2219
2220 let sym = self.interner.intern(word);
2221 return TokenType::ProperName(sym);
2222 }
2223
2224 let verb_entry = self.lexicon.lookup_verb(&lower);
2225 let is_noun = lexicon::is_common_noun(&lower);
2226 let is_adj = self.is_adjective_like(&lower);
2227 let is_disambiguated = lexicon::is_disambiguation_not_verb(&lower);
2228
2229 if verb_entry.is_some() && (is_noun || is_adj) && !is_disambiguated {
2231 let entry = verb_entry.unwrap();
2232 let verb_token = TokenType::Verb {
2233 lemma: self.interner.intern(&entry.lemma),
2234 time: entry.time,
2235 aspect: entry.aspect,
2236 class: entry.class,
2237 };
2238
2239 let mut alternatives = Vec::new();
2240 if is_noun {
2241 alternatives.push(TokenType::Noun(self.interner.intern(word)));
2242 }
2243 if is_adj {
2244 alternatives.push(TokenType::Adjective(self.interner.intern(word)));
2245 }
2246
2247 return TokenType::Ambiguous {
2248 primary: Box::new(verb_token),
2249 alternatives,
2250 };
2251 }
2252
2253 if let Some(_) = &verb_entry {
2255 if is_disambiguated {
2256 let sym = self.interner.intern(word);
2257 if is_noun {
2258 return TokenType::Noun(sym);
2259 }
2260 return TokenType::Adjective(sym);
2261 }
2262 }
2263
2264 if let Some(entry) = verb_entry {
2266 let sym = self.interner.intern(&entry.lemma);
2267 return TokenType::Verb {
2268 lemma: sym,
2269 time: entry.time,
2270 aspect: entry.aspect,
2271 class: entry.class,
2272 };
2273 }
2274
2275 if is_noun {
2277 let sym = self.interner.intern(word);
2278 return TokenType::Noun(sym);
2279 }
2280
2281 if lexicon::is_base_verb(&lower) {
2282 let sym = self.interner.intern(&Self::capitalize(&lower));
2283 let class = lexicon::lookup_verb_class(&lower);
2284 return TokenType::Verb {
2285 lemma: sym,
2286 time: Time::Present,
2287 aspect: Aspect::Simple,
2288 class,
2289 };
2290 }
2291
2292 if lower.ends_with("ian")
2293 || lower.ends_with("er")
2294 || lower == "logic"
2295 || lower == "time"
2296 || lower == "men"
2297 || lower == "book"
2298 || lower == "house"
2299 || lower == "code"
2300 || lower == "user"
2301 {
2302 let sym = self.interner.intern(word);
2303 return TokenType::Noun(sym);
2304 }
2305
2306 if lexicon::is_particle(&lower) {
2307 let sym = self.interner.intern(&lower);
2308 return TokenType::Particle(sym);
2309 }
2310
2311 let sym = self.interner.intern(word);
2312 TokenType::Adjective(sym)
2313 }
2314
2315 fn capitalize(s: &str) -> String {
2316 let mut chars = s.chars();
2317 match chars.next() {
2318 None => String::new(),
2319 Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
2320 }
2321 }
2322
2323 pub fn is_collective_verb(lemma: &str) -> bool {
2324 lexicon::is_collective_verb(&lemma.to_lowercase())
2325 }
2326
2327 pub fn is_mixed_verb(lemma: &str) -> bool {
2328 lexicon::is_mixed_verb(&lemma.to_lowercase())
2329 }
2330
2331 pub fn is_distributive_verb(lemma: &str) -> bool {
2332 lexicon::is_distributive_verb(&lemma.to_lowercase())
2333 }
2334
2335 pub fn is_intensional_predicate(lemma: &str) -> bool {
2336 lexicon::is_intensional_predicate(&lemma.to_lowercase())
2337 }
2338
2339 pub fn is_opaque_verb(lemma: &str) -> bool {
2340 lexicon::is_opaque_verb(&lemma.to_lowercase())
2341 }
2342
2343 pub fn is_ditransitive_verb(lemma: &str) -> bool {
2344 lexicon::is_ditransitive_verb(&lemma.to_lowercase())
2345 }
2346
    fn is_verb_like(&self, word: &str) -> bool {
        let lower = word.to_lowercase();
        if lexicon::is_infinitive_verb(&lower) {
            return true;
        }
        if let Some(entry) = self.lexicon.lookup_verb(&lower) {
            return !entry.lemma.is_empty();
        }
        false
    }

    pub fn is_subject_control_verb(lemma: &str) -> bool {
        lexicon::is_subject_control_verb(&lemma.to_lowercase())
    }

    pub fn is_raising_verb(lemma: &str) -> bool {
        lexicon::is_raising_verb(&lemma.to_lowercase())
    }

    pub fn is_object_control_verb(lemma: &str) -> bool {
        lexicon::is_object_control_verb(&lemma.to_lowercase())
    }

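    // Unlike the helpers above, weather verbs are a hardcoded list rather
    // than a lexicon lookup.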
    pub fn is_weather_verb(lemma: &str) -> bool {
        matches!(
            lemma.to_lowercase().as_str(),
            "rain" | "snow" | "hail" | "thunder" | "pour"
        )
    }

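    /// Strips an "-est" suffix and tries three un-stemming rules in order:
    /// doubled final consonant ("biggest" -> "big"), final "i" back to "y"
    /// ("happiest" -> "happy"), and the bare base ("tallest" -> "tall").
    /// Returns the capitalized base form when it is a gradable adjective.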
    fn try_parse_superlative(&self, word: &str) -> Option<String> {
        if !word.ends_with("est") || word.len() < 5 {
            return None;
        }

        let base = &word[..word.len() - 3];

        if base.len() >= 2 {
            let chars: Vec<char> = base.chars().collect();
            let last = chars[chars.len() - 1];
            let second_last = chars[chars.len() - 2];
            if last == second_last && !"aeiou".contains(last) {
                let stem = &base[..base.len() - 1];
                if lexicon::is_gradable_adjective(stem) {
                    return Some(Self::capitalize(stem));
                }
            }
        }

        if base.ends_with('i') {
            let stem = format!("{}y", &base[..base.len() - 1]);
            if lexicon::is_gradable_adjective(&stem) {
                return Some(Self::capitalize(&stem));
            }
        }

        if lexicon::is_gradable_adjective(base) {
            return Some(Self::capitalize(base));
        }

        None
    }

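    /// Comparative counterpart of `try_parse_superlative`: strips "-er" and
    /// applies the same three un-stemming rules.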
    fn try_parse_comparative(&self, word: &str) -> Option<String> {
        if !word.ends_with("er") || word.len() < 4 {
            return None;
        }

        let base = &word[..word.len() - 2];

        if base.len() >= 2 {
            let chars: Vec<char> = base.chars().collect();
            let last = chars[chars.len() - 1];
            let second_last = chars[chars.len() - 2];
            if last == second_last && !"aeiou".contains(last) {
                let stem = &base[..base.len() - 1];
                if lexicon::is_gradable_adjective(stem) {
                    return Some(Self::capitalize(stem));
                }
            }
        }

        if base.ends_with('i') {
            let stem = format!("{}y", &base[..base.len() - 1]);
            if lexicon::is_gradable_adjective(&stem) {
                return Some(Self::capitalize(&stem));
            }
        }

        if lexicon::is_gradable_adjective(base) {
            return Some(Self::capitalize(base));
        }

        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lexer_handles_apostrophe() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("it's raining", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    #[test]
    fn lexer_handles_question_mark() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Is it raining?", &mut interner);
        let tokens = lexer.tokenize();
        assert!(!tokens.is_empty());
    }

    #[test]
    fn ring_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("ring", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    #[test]
    fn debug_that_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("The cat that runs", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        let that_token = tokens.iter().find(|t| interner.resolve(t.lexeme) == "that");
        if let Some(t) = that_token {
            let check = std::mem::discriminant(&t.kind) == std::mem::discriminant(&TokenType::That);
            eprintln!("Discriminant check for That: {}", check);
            assert!(matches!(t.kind, TokenType::That), "'that' should be TokenType::That, got {:?}", t.kind);
        } else {
            panic!("No 'that' token found");
        }
    }

    #[test]
    fn bus_is_not_verb() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("bus", &mut interner);
        let tokens = lexer.tokenize();
        assert!(matches!(tokens[0].kind, TokenType::Noun(_)));
    }

    #[test]
    fn lowercase_a_is_article() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("a car", &mut interner);
        let tokens = lexer.tokenize();
        for (i, t) in tokens.iter().enumerate() {
            let lex = interner.resolve(t.lexeme);
            eprintln!("Token[{}]: {:?} -> {:?}", i, lex, t.kind);
        }
        assert_eq!(tokens[0].kind, TokenType::Article(Definiteness::Indefinite));
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)), "Expected Noun, got {:?}", tokens[1].kind);
    }

    #[test]
    fn open_is_ambiguous() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("open", &mut interner);
        let tokens = lexer.tokenize();

        if let TokenType::Ambiguous { primary, alternatives } = &tokens[0].kind {
            assert!(matches!(**primary, TokenType::Verb { .. }), "Primary should be Verb");
            assert!(alternatives.iter().any(|t| matches!(t, TokenType::Adjective(_))),
                "Should have Adjective alternative");
        } else {
            panic!("Expected Ambiguous token for 'open', got {:?}", tokens[0].kind);
        }
    }

    #[test]
    fn basic_tokenization() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("All men are mortal.", &mut interner);
        let tokens = lexer.tokenize();
        assert_eq!(tokens[0].kind, TokenType::All);
        assert!(matches!(tokens[1].kind, TokenType::Noun(_)));
        assert_eq!(tokens[2].kind, TokenType::Are);
    }

    #[test]
    fn iff_tokenizes_as_single_token() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("A if and only if B", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Iff),
            "should contain Iff token: got {:?}",
            tokens
        );
    }

    #[test]
    fn is_equal_to_tokenizes_as_identity() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Socrates is equal to Socrates", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Identity),
            "should contain Identity token: got {:?}",
            tokens
        );
    }

    #[test]
    fn is_identical_to_tokenizes_as_identity() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("Clark is identical to Superman", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Identity),
            "should contain Identity token: got {:?}",
            tokens
        );
    }

    #[test]
    fn itself_tokenizes_as_reflexive() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John loves itself", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Reflexive),
            "should contain Reflexive token: got {:?}",
            tokens
        );
    }

    #[test]
    fn himself_tokenizes_as_reflexive() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John sees himself", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Reflexive),
            "should contain Reflexive token: got {:?}",
            tokens
        );
    }

    #[test]
    fn to_stay_tokenizes_correctly() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("to stay", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::To),
            "should contain To token: got {:?}",
            tokens
        );
        assert!(
            tokens.iter().any(|t| matches!(t.kind, TokenType::Verb { .. })),
            "should contain Verb token for stay: got {:?}",
            tokens
        );
    }

    #[test]
    fn possessive_apostrophe_s() {
        let mut interner = Interner::new();
        let mut lexer = Lexer::new("John's dog", &mut interner);
        let tokens = lexer.tokenize();
        assert!(
            tokens.iter().any(|t| t.kind == TokenType::Possessive),
            "should contain Possessive token: got {:?}",
            tokens
        );
        assert!(
            tokens.iter().any(|t| matches!(&t.kind, TokenType::ProperName(_))),
            "should have John as proper name: got {:?}",
            tokens
        );
    }

    #[test]
    fn lexer_produces_valid_spans() {
        let input = "All men are mortal.";
        let mut interner = Interner::new();
        let mut lexer = Lexer::new(input, &mut interner);
        let tokens = lexer.tokenize();

        assert_eq!(tokens[0].span.start, 0);
        assert_eq!(tokens[0].span.end, 3);
        assert_eq!(&input[tokens[0].span.start..tokens[0].span.end], "All");

        assert_eq!(tokens[1].span.start, 4);
        assert_eq!(tokens[1].span.end, 7);
        assert_eq!(&input[tokens[1].span.start..tokens[1].span.end], "men");

        assert_eq!(tokens[2].span.start, 8);
        assert_eq!(tokens[2].span.end, 11);
        assert_eq!(&input[tokens[2].span.start..tokens[2].span.end], "are");

        assert_eq!(tokens[3].span.start, 12);
        assert_eq!(tokens[3].span.end, 18);
        assert_eq!(&input[tokens[3].span.start..tokens[3].span.end], "mortal");

        assert_eq!(tokens[4].span.start, 18);
        assert_eq!(tokens[4].span.end, 19);

        assert_eq!(tokens[5].span.start, input.len());
        assert_eq!(tokens[5].kind, TokenType::EOF);
    }
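
    // Hedged sketches exercising the private stemming helpers directly. They
    // assume the lexicon registers "big" and "happy" as gradable adjectives
    // and that `Lexer::new` accepts empty input; adjust the inputs if the
    // word lists differ.
    #[test]
    fn capitalize_uppercases_first_char() {
        assert_eq!(Lexer::capitalize("mortal"), "Mortal");
        assert_eq!(Lexer::capitalize(""), "");
    }

    #[test]
    fn superlative_stemming_restores_base_form() {
        let mut interner = Interner::new();
        let lexer = Lexer::new("", &mut interner);
        // "biggest" -> doubled-consonant rule -> "big"
        assert_eq!(lexer.try_parse_superlative("biggest"), Some("Big".to_string()));
        // "happiest" -> i-back-to-y rule -> "happy"
        assert_eq!(lexer.try_parse_superlative("happiest"), Some("Happy".to_string()));
    }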
}