Skip to main content

oxihuman_core/
lexer_token_stream.rs

1// Copyright (C) 2026 COOLJAPAN OU (Team KitaSan)
2// SPDX-License-Identifier: Apache-2.0
3#![allow(dead_code)]
4
5//! Lexer token stream abstraction.
6//!
7//! Provides a position-tracking, peekable stream over a sequence of lexer tokens
8//! with support for mark/restore checkpointing, lookahead, and batch consumption.
9
/// A basic token produced by a lexer.
///
/// Carries the token's category, its raw source text, and the 1-based
/// line/column position where it starts.
#[derive(Debug, Clone, PartialEq)]
pub struct LexToken {
    /// Category of the token (word, number, punctuation, ...).
    pub kind: LexTokenKind,
    /// Raw text of the token as it appeared in the input.
    pub text: String,
    /// 1-based line number where the token starts.
    pub line: usize,
    /// 1-based column where the token starts.
    pub col: usize,
}
18
19impl LexToken {
20    pub fn new(kind: LexTokenKind, text: &str, line: usize, col: usize) -> Self {
21        Self {
22            kind,
23            text: text.to_string(),
24            line,
25            col,
26        }
27    }
28
29    pub fn is_eof(&self) -> bool {
30        self.kind == LexTokenKind::Eof
31    }
32}
33
/// Categories of lexer tokens.
#[derive(Debug, Clone, PartialEq)]
pub enum LexTokenKind {
    /// A run of alphanumeric characters or underscores.
    Word,
    /// A run consisting entirely of ASCII digits.
    Number,
    /// Any other non-whitespace run.
    Punctuation,
    /// A whitespace run (not emitted by `lex_string`, which discards spacing).
    Whitespace,
    /// A line break (not emitted by `lex_string`).
    Newline,
    /// End-of-input sentinel; appended once at the end of every stream.
    Eof,
    /// Escape hatch for lexer extensions, tagged with a free-form label.
    Custom(String),
}
45
/// A position-tracking stream over a `Vec<LexToken>`.
#[derive(Debug, Clone)]
pub struct LexerStream {
    /// Underlying token buffer, in lexing order.
    tokens: Vec<LexToken>,
    /// Index of the next token to be consumed.
    pos: usize,
    /// Optional checkpoint set by `set_mark` and restored by `restore_mark`.
    mark: Option<usize>,
}
53
54impl LexerStream {
55    pub fn new(tokens: Vec<LexToken>) -> Self {
56        Self {
57            tokens,
58            pos: 0,
59            mark: None,
60        }
61    }
62
63    pub fn is_empty(&self) -> bool {
64        self.pos >= self.tokens.len()
65    }
66
67    pub fn remaining(&self) -> usize {
68        self.tokens.len().saturating_sub(self.pos)
69    }
70
71    pub fn current_pos(&self) -> usize {
72        self.pos
73    }
74
75    pub fn peek(&self) -> Option<&LexToken> {
76        self.tokens.get(self.pos)
77    }
78
79    pub fn peek_nth(&self, n: usize) -> Option<&LexToken> {
80        self.tokens.get(self.pos + n)
81    }
82
83    pub fn next_token(&mut self) -> Option<&LexToken> {
84        let t = self.tokens.get(self.pos);
85        if t.is_some() {
86            self.pos += 1;
87        }
88        t
89    }
90
91    pub fn skip(&mut self, n: usize) {
92        self.pos = (self.pos + n).min(self.tokens.len());
93    }
94
95    pub fn set_mark(&mut self) {
96        self.mark = Some(self.pos);
97    }
98
99    pub fn restore_mark(&mut self) {
100        if let Some(m) = self.mark {
101            self.pos = m;
102        }
103    }
104
105    pub fn clear_mark(&mut self) {
106        self.mark = None;
107    }
108
109    /// Consume tokens while the predicate holds.
110    pub fn consume_while(&mut self, pred: impl Fn(&LexToken) -> bool) -> Vec<&LexToken> {
111        let mut result = Vec::new();
112        while let Some(t) = self.tokens.get(self.pos) {
113            if pred(t) {
114                result.push(t);
115                self.pos += 1;
116            } else {
117                break;
118            }
119        }
120        result
121    }
122
123    pub fn total(&self) -> usize {
124        self.tokens.len()
125    }
126
127    /// Return all remaining tokens as a slice without consuming.
128    pub fn peek_rest(&self) -> &[LexToken] {
129        &self.tokens[self.pos..]
130    }
131}
132
133/// Build a `LexerStream` from a string by naive whitespace-based tokenization.
134pub fn lex_string(text: &str) -> LexerStream {
135    let mut tokens: Vec<LexToken> = Vec::new();
136    let mut line = 1usize;
137    let mut col = 1usize;
138
139    for word in text.split_whitespace() {
140        let kind = if word.chars().all(|c| c.is_ascii_digit()) {
141            LexTokenKind::Number
142        } else if word.chars().all(|c| c.is_alphanumeric() || c == '_') {
143            LexTokenKind::Word
144        } else {
145            LexTokenKind::Punctuation
146        };
147        tokens.push(LexToken::new(kind, word, line, col));
148        col += word.len() + 1;
149        if word.contains('\n') {
150            line += 1;
151            col = 1;
152        }
153    }
154    tokens.push(LexToken::new(LexTokenKind::Eof, "", line, col));
155    LexerStream::new(tokens)
156}
157
158/// Count tokens of a specific kind.
159pub fn count_tokens_of_kind(stream: &LexerStream, kind: &LexTokenKind) -> usize {
160    stream.tokens.iter().filter(|t| &t.kind == kind).count()
161}
162
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lex_produces_eof() {
        let stream = lex_string("hello world");
        // The lexer always appends an EOF sentinel as the final token.
        assert!(matches!(stream.tokens.last(), Some(t) if t.is_eof()));
    }

    #[test]
    fn test_lex_word_count() {
        let stream = lex_string("one two three");
        let words = count_tokens_of_kind(&stream, &LexTokenKind::Word);
        assert_eq!(words, 3);
    }

    #[test]
    fn test_peek_does_not_advance() {
        let mut stream = lex_string("a b");
        let peeked = stream.peek().expect("should succeed").text.clone();
        let _ = stream.peek();
        // After two peeks, next_token must still yield the same token.
        let consumed = stream.next_token().expect("should succeed").text.clone();
        assert_eq!(peeked, consumed);
    }

    #[test]
    fn test_skip_advances() {
        let mut stream = lex_string("a b c");
        stream.skip(1);
        assert_eq!(stream.current_pos(), 1);
    }

    #[test]
    fn test_mark_restore() {
        let mut stream = lex_string("a b c");
        stream.next_token();
        stream.set_mark();
        stream.next_token();
        stream.restore_mark();
        assert_eq!(stream.current_pos(), 1);
    }

    #[test]
    fn test_remaining_decreases() {
        let mut stream = lex_string("a b");
        let before = stream.remaining();
        stream.next_token();
        assert!(stream.remaining() < before);
    }

    #[test]
    fn test_peek_nth() {
        let stream = lex_string("x y z");
        let second = stream.peek_nth(1).map(|t| t.text.as_str());
        assert_eq!(second, Some("y"));
    }

    #[test]
    fn test_consume_while() {
        let mut stream = lex_string("1 2 3 word");
        let numbers = stream.consume_while(|t| matches!(t.kind, LexTokenKind::Number));
        assert_eq!(numbers.len(), 3);
    }

    #[test]
    fn test_is_empty_after_all_consumed() {
        let mut stream = lex_string("a");
        // Drain the single word token plus the trailing EOF token.
        while stream.next_token().is_some() {}
        assert!(stream.is_empty());
    }
}
235}