1mod token;
3mod cursor;
4mod unicode;
5
6pub use token::Token;
7pub use cursor::Cursor;
8
/// A lexer that walks a borrowed source string and yields tokens on demand.
///
/// Tokens are pulled one at a time through [`Lexer::next_token`].
pub struct Lexer<'a> {
    /// The complete input text being tokenized.
    source: &'a str,
    /// Byte offset into `source` of the next unread character. It is only ever
    /// moved forward by whole characters, so it stays on a UTF-8 boundary.
    pos: usize,
}
13
14impl<'a> Lexer<'a> {
15 pub fn new(source: &'a str) -> Self {
16 Self { source, pos: 0 }
17 }
18
19 fn peek(&self) -> Option<char> {
20 self.source[self.pos..].chars().next()
21 }
22
23 fn peek_nth(&self, n: usize) -> Option<char> {
24 self.source[self.pos..].chars().nth(n)
25 }
26
27 fn advance(&mut self) -> Option<char> {
28 let ch = self.source[self.pos..].chars().next()?;
29 self.pos += ch.len_utf8();
30 Some(ch)
31 }
32
33 fn rest(&self) -> &str {
34 &self.source[self.pos..]
35 }
36
37 fn skip_whitespace_and_comments(&mut self) {
38 loop {
39 while matches!(self.peek(), Some(' ') | Some('\t') | Some('\n') | Some('\r') | Some('\x0C')) {
40 self.advance();
41 }
42 if self.peek() == Some('/') && self.peek_nth(1) == Some('/') {
43 while self.peek().map_or(false, |c| c != '\n') {
44 self.advance();
45 }
46 continue;
47 }
48 if self.peek() == Some('#') {
50 while self.peek().map_or(false, |c| c != '\n') {
51 self.advance();
52 }
53 continue;
54 }
55 break;
56 }
57 }
58
59 fn lex_string(&mut self) -> Token {
60 self.advance(); let mut s = std::string::String::new();
62 loop {
63 match self.advance() {
64 None => break,
65 Some('"') => break,
66 Some('\\') => match self.advance() {
67 Some('n') => s.push('\n'),
68 Some('t') => s.push('\t'),
69 Some('r') => s.push('\r'),
70 Some('"') => s.push('"'),
71 Some('\\') => s.push('\\'),
72 Some('0') => s.push('\0'),
73 Some(c) => { s.push('\\'); s.push(c); }
74 None => break,
75 },
76 Some(c) => s.push(c),
77 }
78 }
79 Token::String(s)
80 }
81
82 fn lex_number(&mut self) -> Token {
83 let start = self.pos;
84 let mut has_dot = false;
85 loop {
86 match self.peek() {
87 Some('0'..='9') => { self.advance(); }
88 Some('.') if !has_dot && matches!(self.peek_nth(1), Some('0'..='9')) => {
89 has_dot = true;
90 self.advance();
91 }
92 _ => break,
93 }
94 }
95 Token::Number(self.source[start..self.pos].to_string())
96 }
97
98 fn lex_word(&mut self) -> Token {
99 let start = self.pos;
100 while let Some(c) = self.peek() {
104 if c.is_alphanumeric() || c == '_' || is_unicode_combining(c) {
105 self.advance();
106 } else {
107 break;
108 }
109 }
110 let word = &self.source[start..self.pos];
111 Self::classify_word(word)
112 }
113
114 fn classify_word(word: &str) -> Token {
118 match word {
119 "bind" | "令" | "灵符" => Token::Bind,
121 "do" | "执" => Token::Do,
122 "fn" | "函" => Token::Fn,
123 "mod" | "核" => Token::Mod,
124 "type" | "符" => Token::Type,
125 "use" | "载" | "引" | "사용" | "使う" | "ใช้" | "นำเข้า" | "importar" | "nutzen" | "utiliser" => Token::Use,
127 "if" | "若" | "如" => Token::If,
128 "else" | "否则" | "否" => Token::Else,
129 "while" | "循" | "当" => Token::While,
130 "for" | "历" => Token::For,
131 "in" | "于" => Token::In,
132 "match" | "配" => Token::Match,
133 "return" | "归" => Token::Return,
134 "own" | "拥有" | "独" => Token::Own,
135 "lend" | "借" => Token::Lend,
136 "share" | "共享" | "共" => Token::Share,
137 "move" | "移动" | "移" => Token::Move,
138 "copy" | "复制" | "复" => Token::Copy,
139 "async" | "异步" | "异" => Token::Async,
140 "wait" | "等待" | "待" => Token::Wait,
141 "as" | "为" => Token::As,
142 "where" | "条件" => Token::Where,
143 "post" | "发布" | "出" => Token::Post,
144 "give" | "给" | "予" => Token::Give,
145 "fit" | "适合" => Token::Fit,
146 "form" | "形式" | "形" => Token::Form,
147 "choose" | "选择" | "选" => Token::Choose,
148 "can" | "能" => Token::Can,
149 "change" | "改变" | "变" => Token::Change,
150 "stop" | "停止" | "止" => Token::Stop,
151 "again" | "继续" => Token::Again,
152 "try" | "尝试" | "试" => Token::Try,
153 "sure" | "确定" | "确" => Token::Sure,
154 "maybe" | "可能" | "或" => Token::Maybe,
155 "pure" | "纯" => Token::Pure,
156 "spawn" | "生成" | "启" => Token::Spawn,
157 "ok" | "好" | "可" => Token::Ok,
158 "bad" | "坏" | "误" => Token::Bad,
159 "none" | "无" => Token::None,
160 "束縛" | "バ" => Token::Bind,
162 "実行" | "執" => Token::Do,
163 "関数" | "関" => Token::Fn,
164 "モジュール" | "模" => Token::Mod,
165 "もし" => Token::If,
166 "他" => Token::Else,
167 "間" | "一方" => Token::While,
168 "繰" | "ために" => Token::For,
169 "の中" => Token::In,
170 "一致" => Token::Match,
171 "戻る" | "帰る" => Token::Return,
172 "試す" => Token::Try,
173 "待つ" => Token::Wait,
174 "非同期" => Token::Async,
175 "起動" => Token::Spawn,
176 "止まれ" | "停め" => Token::Stop,
177 "継続" => Token::Again,
178 "true" | "真" => Token::Bool(true),
180 "false" | "假" | "偽" => Token::Bool(false),
181 "바인드" | "묶" => Token::Bind,
183 "실행" => Token::Do,
184 "함수" => Token::Fn,
185 "모듈" => Token::Mod,
186 "만약" | "조건" => Token::If,
187 "아니면" => Token::Else,
188 "동안" | "반복" => Token::While,
189 "위해" => Token::For,
190 "안에" => Token::In,
191 "매치" => Token::Match,
192 "반환" | "귀환" => Token::Return,
193 "시도" => Token::Try,
194 "기다려" => Token::Wait,
195 "비동기" => Token::Async,
196 "생성" => Token::Spawn,
197 "멈춤" => Token::Stop,
198 "계속" => Token::Again,
199 "참" => Token::Bool(true),
200 "거짓" => Token::Bool(false),
201 "связать" => Token::Bind, "сделать" => Token::Do, "если" => Token::If,
203 "иначе" => Token::Else, "пока" => Token::While, "для" => Token::For,
204 "вернуть" => Token::Return,
205 "ผูก" => Token::Bind, "ทำ" => Token::Do,
207 "ฟังก์ชัน" => Token::Fn, "โมดูล" => Token::Mod,
208 "ถ้า" => Token::If, "มิฉะนั้น" => Token::Else,
209 "ขณะที่" => Token::While, "สำหรับ" => Token::For,
210 "ใน" => Token::In, "จับคู่" => Token::Match,
211 "คืน" => Token::Return, "รอ" => Token::Wait,
212 "ไม่พร้อมกัน" => Token::Async,
213 "จริง" => Token::Bool(true), "เท็จ" => Token::Bool(false),
214 "बाँधो" => Token::Bind, "करो" => Token::Do,
216 "अगर" => Token::If, "नहींतो" => Token::Else,
217 "जबकि" => Token::While, "केलिए" => Token::For,
218 "वापस" => Token::Return,
219 "सत्य" => Token::Bool(true), "असत्य" => Token::Bool(false),
220 "ربط" => Token::Bind, "افعل" => Token::Do,
222 "إذا" => Token::If, "وإلا" => Token::Else,
223 "بينما" => Token::While, "لأجل" => Token::For,
224 "في" => Token::In, "أعد" => Token::Return,
225 "صحيح" => Token::Bool(true), "خطأ" => Token::Bool(false),
226 "enlazar" => Token::Bind, "hacer" => Token::Do,
228 "si" => Token::If, "sino" => Token::Else,
229 "mientras" => Token::While, "para" => Token::For,
230 "retornar" => Token::Return,
231 "verdadero" => Token::Bool(true), "falso" => Token::Bool(false),
232 "lier" => Token::Bind, "faire" => Token::Do,
234 "func" => Token::Fn, "module" => Token::Mod,
235 "sinon" => Token::Else, "tantque" => Token::While,
236 "retourner" => Token::Return,
237 "vrai" => Token::Bool(true), "faux" => Token::Bool(false),
238 "binden" => Token::Bind, "machen" => Token::Do,
240 "wenn" => Token::If, "sonst" => Token::Else,
241 "solange" => Token::While, "für" => Token::For,
242 "zurück" => Token::Return,
243 "wahr" => Token::Bool(true), "falsch" => Token::Bool(false),
244 "ligar" => Token::Bind, "fazer" => Token::Do,
246 "se" => Token::If, "senão" => Token::Else,
247 "enquanto" => Token::While,
248 "verdadeiro" => Token::Bool(true),
249 other => Token::Ident(other.to_string()),
251 }
252 }
253
254 pub fn next_token(&mut self) -> Option<Token> {
255 self.skip_whitespace_and_comments();
256 let ch = self.peek()?;
257
258 if ch == '"' { return Some(self.lex_string()); }
260
261 if ch.is_ascii_digit() { return Some(self.lex_number()); }
263
264 if ch.is_alphabetic() || ch == '_' || is_unicode_combining(ch) { return Some(self.lex_word()); }
266
267 let rest = self.rest();
269
270 if rest.starts_with("..") {
272 self.advance(); self.advance();
273 return Some(Token::DotDot);
274 }
275 if rest.starts_with("::") {
277 self.advance(); self.advance();
278 return Some(Token::ColonColon);
279 }
280 if rest.starts_with("==") {
282 self.advance(); self.advance();
283 return Some(Token::EqEq);
284 }
285 if rest.starts_with("!=") {
287 self.advance(); self.advance();
288 return Some(Token::Ne);
289 }
290 if rest.starts_with("<=") {
292 self.advance(); self.advance();
293 return Some(Token::Le);
294 }
295 if rest.starts_with(">=") {
297 self.advance(); self.advance();
298 return Some(Token::Ge);
299 }
300 if rest.starts_with("->") {
302 self.advance(); self.advance();
303 return Some(Token::Arrow);
304 }
305 if rest.starts_with("=>") {
307 self.advance(); self.advance();
308 return Some(Token::FatArrow);
309 }
310 if rest.starts_with("&&") {
312 self.advance(); self.advance();
313 return Some(Token::And);
314 }
315 if rest.starts_with("||") {
317 self.advance(); self.advance();
318 return Some(Token::Or);
319 }
320
321 self.advance();
323 Some(match ch {
324 '=' => Token::Eq,
325 '<' => Token::Lt,
326 '>' => Token::Gt,
327 '!' => Token::Not,
328 '+' => Token::Plus,
329 '-' => Token::Minus,
330 '*' => Token::Star,
331 '/' => Token::Slash,
332 '%' => Token::Percent,
333 '.' => Token::Dot,
334 '&' => Token::Ampersand,
335 '(' => Token::LParen,
336 ')' => Token::RParen,
337 '{' => Token::LBrace,
338 '}' => Token::RBrace,
339 '[' => Token::LBracket,
340 ']' => Token::RBracket,
341 ',' => Token::Comma,
342 ':' => Token::Colon,
343 ';' => Token::Semicolon,
344 '|' => Token::Or, c => Token::Error(c.to_string()),
346 })
347 }
348}
349
/// Reports whether `c` lies in one of a fixed set of code-point ranges that the
/// lexer treats as combining marks (and therefore as valid word characters).
///
/// The table is a hard-coded list of inclusive `(first, last)` ranges; a
/// single code point is written as a range whose two ends are equal.
fn is_unicode_combining(c: char) -> bool {
    const COMBINING_RANGES: &[(u32, u32)] = &[
        (0x0300, 0x036F),
        (0x0483, 0x0489),
        (0x0591, 0x05C7),
        (0x0610, 0x061A),
        (0x064B, 0x065F),
        (0x0670, 0x0670),
        (0x06D6, 0x06DC),
        (0x0730, 0x074A),
        (0x0816, 0x082D),
        (0x0900, 0x0903),
        (0x093A, 0x094F),
        (0x0951, 0x0957),
        (0x0962, 0x0963),
        (0x0981, 0x0983),
        (0x09BC, 0x09BC),
        (0x09BE, 0x09C4),
        (0x09C7, 0x09C8),
        (0x09CB, 0x09CD),
        (0x0A01, 0x0A03),
        (0x0A3C, 0x0A3C),
        (0x0A3E, 0x0A42),
        (0x0B01, 0x0B03),
        (0x0B3C, 0x0B4D),
        (0x0C00, 0x0C03),
        (0x0C3E, 0x0C56),
        (0x0D00, 0x0D03),
        (0x0D3B, 0x0D4D),
        (0x0E31, 0x0E31),
        (0x0E34, 0x0E3A),
        (0x0E47, 0x0E4E),
        (0x0EB1, 0x0EB1),
        (0x0EB4, 0x0EBC),
        (0x0EC8, 0x0ECD),
        (0x3099, 0x309A),
        (0xFE20, 0xFE2F),
    ];

    let code_point = u32::from(c);
    COMBINING_RANGES
        .iter()
        .any(|&(first, last)| first <= code_point && code_point <= last)
}