1mod token;
3mod cursor;
4mod unicode;
5
6pub use token::Token;
7pub use cursor::Cursor;
8
/// Tokenizer state: a borrowed source string plus a cursor into it.
///
/// Tokens are pulled one at a time with `next_token`.
pub struct Lexer<'a> {
    /// The complete input text being tokenized.
    source: &'a str,
    /// Byte offset into `source` of the next unread character; it only ever
    /// moves forward by whole UTF-8 characters.
    pos: usize,
}
13
14impl<'a> Lexer<'a> {
15 pub fn new(source: &'a str) -> Self {
16 Self { source, pos: 0 }
17 }
18
19 fn peek(&self) -> Option<char> {
20 self.source[self.pos..].chars().next()
21 }
22
23 fn peek_nth(&self, n: usize) -> Option<char> {
24 self.source[self.pos..].chars().nth(n)
25 }
26
27 fn advance(&mut self) -> Option<char> {
28 let ch = self.source[self.pos..].chars().next()?;
29 self.pos += ch.len_utf8();
30 Some(ch)
31 }
32
33 fn rest(&self) -> &str {
34 &self.source[self.pos..]
35 }
36
37 fn skip_whitespace_and_comments(&mut self) {
38 loop {
39 while matches!(self.peek(), Some(' ') | Some('\t') | Some('\n') | Some('\r') | Some('\x0C')) {
40 self.advance();
41 }
42 if self.peek() == Some('/') && self.peek_nth(1) == Some('/') {
43 while self.peek().map_or(false, |c| c != '\n') {
44 self.advance();
45 }
46 continue;
47 }
48 if self.peek() == Some('#') {
50 while self.peek().map_or(false, |c| c != '\n') {
51 self.advance();
52 }
53 continue;
54 }
55 break;
56 }
57 }
58
59 fn lex_string(&mut self) -> Token {
60 self.advance(); let mut s = std::string::String::new();
62 loop {
63 match self.advance() {
64 None => break,
65 Some('"') => break,
66 Some('\\') => match self.advance() {
67 Some('n') => s.push('\n'),
68 Some('t') => s.push('\t'),
69 Some('r') => s.push('\r'),
70 Some('"') => s.push('"'),
71 Some('\\') => s.push('\\'),
72 Some('0') => s.push('\0'),
73 Some(c) => { s.push('\\'); s.push(c); }
74 None => break,
75 },
76 Some(c) => s.push(c),
77 }
78 }
79 Token::String(s)
80 }
81
82 fn lex_number(&mut self) -> Token {
83 let start = self.pos;
84 let mut has_dot = false;
85 loop {
86 match self.peek() {
87 Some('0'..='9') => { self.advance(); }
88 Some('.') if !has_dot && matches!(self.peek_nth(1), Some('0'..='9')) => {
89 has_dot = true;
90 self.advance();
91 }
92 _ => break,
93 }
94 }
95 Token::Number(self.source[start..self.pos].to_string())
96 }
97
98 fn lex_word(&mut self) -> Token {
99 let start = self.pos;
100 while let Some(c) = self.peek() {
104 if c.is_alphanumeric() || c == '_' || is_unicode_combining(c) {
105 self.advance();
106 } else {
107 break;
108 }
109 }
110 let word = &self.source[start..self.pos];
111 Self::classify_word(word)
112 }
113
114 fn classify_word(word: &str) -> Token {
118 match word {
119 "bind" | "令" | "灵符" => Token::Bind,
121 "do" | "执" => Token::Do,
122 "fn" | "函" => Token::Fn,
123 "mod" | "核" => Token::Mod,
124 "type" | "符" => Token::Type,
125 "use" | "载" | "引" | "사용" | "使う" | "ใช้" | "นำเข้า" | "importar" | "nutzen" | "utiliser" => Token::Use,
127 "if" | "若" | "如" => Token::If,
128 "else" | "否则" | "否" => Token::Else,
129 "while" | "循" | "当" => Token::While,
130 "for" | "历" => Token::For,
131 "in" | "于" => Token::In,
132 "match" | "配" => Token::Match,
133 "return" | "归" => Token::Return,
134 "own" | "拥有" | "独" => Token::Own,
135 "lend" | "借" => Token::Lend,
136 "share" | "共享" | "共" => Token::Share,
137 "move" | "移动" | "移" => Token::Move,
138 "copy" | "复制" | "复" => Token::Copy,
139 "async" | "异步" | "异" => Token::Async,
140 "wait" | "等待" | "待" => Token::Wait,
141 "as" | "为" => Token::As,
142 "where" | "条件" => Token::Where,
143 "post" | "发布" | "出" => Token::Post,
144 "give" | "给" | "予" => Token::Give,
145 "fit" | "适合" => Token::Fit,
146 "form" | "形式" | "形" | "構造" | "구조" | "โครงสร้าง" => Token::Form,
148 "choose" | "选择" | "选" | "選択" | "선택" | "เลือกแบบ" => Token::Choose,
150 "can" | "能" => Token::Can,
151 "change" | "改变" | "变" => Token::Change,
152 "stop" | "停止" | "止" => Token::Stop,
153 "again" | "继续" => Token::Again,
154 "try" | "尝试" | "试" => Token::Try,
155 "sure" | "确定" | "确" => Token::Sure,
156 "maybe" | "可能" | "或" => Token::Maybe,
157 "pure" | "纯" => Token::Pure,
158 "spawn" | "生成" | "启" => Token::Spawn,
159 "ok" | "好" | "可" => Token::Ok,
160 "bad" | "坏" | "误" => Token::Bad,
161 "none" | "无" => Token::None,
162 "束縛" | "バ" => Token::Bind,
164 "実行" | "執" => Token::Do,
165 "関数" | "関" => Token::Fn,
166 "モジュール" | "模" => Token::Mod,
167 "もし" => Token::If,
168 "他" => Token::Else,
169 "間" | "一方" => Token::While,
170 "繰" | "ために" => Token::For,
171 "の中" => Token::In,
172 "一致" => Token::Match,
173 "戻る" | "帰る" => Token::Return,
174 "試す" => Token::Try,
175 "待つ" => Token::Wait,
176 "非同期" => Token::Async,
177 "起動" => Token::Spawn,
178 "止まれ" | "停め" => Token::Stop,
179 "継続" => Token::Again,
180 "true" | "真" => Token::Bool(true),
182 "false" | "假" | "偽" => Token::Bool(false),
183 "바인드" | "묶" => Token::Bind,
185 "실행" => Token::Do,
186 "함수" => Token::Fn,
187 "모듈" => Token::Mod,
188 "만약" | "조건" => Token::If,
189 "아니면" => Token::Else,
190 "동안" | "반복" => Token::While,
191 "위해" => Token::For,
192 "안에" => Token::In,
193 "매치" => Token::Match,
194 "반환" | "귀환" => Token::Return,
195 "시도" => Token::Try,
196 "기다려" => Token::Wait,
197 "비동기" => Token::Async,
198 "생성" => Token::Spawn,
199 "멈춤" => Token::Stop,
200 "계속" => Token::Again,
201 "참" => Token::Bool(true),
202 "거짓" => Token::Bool(false),
203 "связать" => Token::Bind, "сделать" => Token::Do, "если" => Token::If,
205 "иначе" => Token::Else, "пока" => Token::While, "для" => Token::For,
206 "вернуть" => Token::Return,
207 "ผูก" => Token::Bind, "ทำ" => Token::Do,
209 "ฟังก์ชัน" => Token::Fn, "โมดูล" => Token::Mod,
210 "ถ้า" => Token::If, "มิฉะนั้น" => Token::Else,
211 "ขณะที่" => Token::While, "สำหรับ" => Token::For,
212 "ใน" => Token::In, "จับคู่" => Token::Match,
213 "คืน" => Token::Return, "รอ" => Token::Wait,
214 "ไม่พร้อมกัน" => Token::Async,
215 "จริง" => Token::Bool(true), "เท็จ" => Token::Bool(false),
216 "बाँधो" => Token::Bind, "करो" => Token::Do,
218 "अगर" => Token::If, "नहींतो" => Token::Else,
219 "जबकि" => Token::While, "केलिए" => Token::For,
220 "वापस" => Token::Return,
221 "सत्य" => Token::Bool(true), "असत्य" => Token::Bool(false),
222 "ربط" => Token::Bind, "افعل" => Token::Do,
224 "إذا" => Token::If, "وإلا" => Token::Else,
225 "بينما" => Token::While, "لأجل" => Token::For,
226 "في" => Token::In, "أعد" => Token::Return,
227 "صحيح" => Token::Bool(true), "خطأ" => Token::Bool(false),
228 "enlazar" => Token::Bind, "hacer" => Token::Do,
230 "si" => Token::If, "sino" => Token::Else,
231 "mientras" => Token::While, "para" => Token::For,
232 "retornar" => Token::Return,
233 "verdadero" => Token::Bool(true), "falso" => Token::Bool(false),
234 "lier" => Token::Bind, "faire" => Token::Do,
236 "func" => Token::Fn, "module" => Token::Mod,
237 "sinon" => Token::Else, "tantque" => Token::While,
238 "retourner" => Token::Return,
239 "vrai" => Token::Bool(true), "faux" => Token::Bool(false),
240 "binden" => Token::Bind, "machen" => Token::Do,
242 "wenn" => Token::If, "sonst" => Token::Else,
243 "solange" => Token::While, "für" => Token::For,
244 "zurück" => Token::Return,
245 "wahr" => Token::Bool(true), "falsch" => Token::Bool(false),
246 "ligar" => Token::Bind, "fazer" => Token::Do,
248 "se" => Token::If, "senão" => Token::Else,
249 "enquanto" => Token::While,
250 "verdadeiro" => Token::Bool(true),
251 other => Token::Ident(other.to_string()),
253 }
254 }
255
256 pub fn next_token(&mut self) -> Option<Token> {
257 self.skip_whitespace_and_comments();
258 let ch = self.peek()?;
259
260 if ch == '"' { return Some(self.lex_string()); }
262
263 if ch.is_ascii_digit() { return Some(self.lex_number()); }
265
266 if ch.is_alphabetic() || ch == '_' || is_unicode_combining(ch) { return Some(self.lex_word()); }
268
269 let rest = self.rest();
271
272 if rest.starts_with("..") {
274 self.advance(); self.advance();
275 return Some(Token::DotDot);
276 }
277 if rest.starts_with("::") {
279 self.advance(); self.advance();
280 return Some(Token::ColonColon);
281 }
282 if rest.starts_with("==") {
284 self.advance(); self.advance();
285 return Some(Token::EqEq);
286 }
287 if rest.starts_with("!=") {
289 self.advance(); self.advance();
290 return Some(Token::Ne);
291 }
292 if rest.starts_with("<=") {
294 self.advance(); self.advance();
295 return Some(Token::Le);
296 }
297 if rest.starts_with(">=") {
299 self.advance(); self.advance();
300 return Some(Token::Ge);
301 }
302 if rest.starts_with("->") {
304 self.advance(); self.advance();
305 return Some(Token::Arrow);
306 }
307 if rest.starts_with("=>") {
309 self.advance(); self.advance();
310 return Some(Token::FatArrow);
311 }
312 if rest.starts_with("&&") {
314 self.advance(); self.advance();
315 return Some(Token::And);
316 }
317 if rest.starts_with("||") {
319 self.advance(); self.advance();
320 return Some(Token::Or);
321 }
322
323 self.advance();
325 Some(match ch {
326 '=' => Token::Eq,
327 '<' => Token::Lt,
328 '>' => Token::Gt,
329 '!' => Token::Not,
330 '+' => Token::Plus,
331 '-' => Token::Minus,
332 '*' => Token::Star,
333 '/' => Token::Slash,
334 '%' => Token::Percent,
335 '.' => Token::Dot,
336 '&' => Token::Ampersand,
337 '(' => Token::LParen,
338 ')' => Token::RParen,
339 '{' => Token::LBrace,
340 '}' => Token::RBrace,
341 '[' => Token::LBracket,
342 ']' => Token::RBracket,
343 ',' => Token::Comma,
344 ':' => Token::Colon,
345 ';' => Token::Semicolon,
346 '|' => Token::Or, c => Token::Error(c.to_string()),
348 })
349 }
350}
351
/// Returns `true` if `c` lies in one of the combining-mark ranges that the
/// lexer accepts inside words (accents, vowel signs, tone marks and similar).
///
/// This is a hand-maintained table covering selected scripts, not a full
/// implementation of the Unicode `Mark` categories.
fn is_unicode_combining(c: char) -> bool {
    matches!(
        u32::from(c),
        // Combining Diacritical Marks
        0x0300..=0x036F
        // Cyrillic
        | 0x0483..=0x0489
        // Hebrew points and accents. The block's punctuation characters
        // U+05BE MAQAF, U+05C0 PASEQ, U+05C3 SOF PASUQ and U+05C6 NUN HAFUKHA
        // are deliberately excluded: they are not combining marks and must
        // not be absorbed into identifiers.
        | 0x0591..=0x05BD
        | 0x05BF
        | 0x05C1..=0x05C2
        | 0x05C4..=0x05C5
        | 0x05C7
        // Arabic
        | 0x0610..=0x061A
        | 0x064B..=0x065F
        | 0x0670
        | 0x06D6..=0x06DC
        // Syriac
        | 0x0730..=0x074A
        // Samaritan
        | 0x0816..=0x082D
        // Devanagari
        | 0x0900..=0x0903
        | 0x093A..=0x094F
        | 0x0951..=0x0957
        | 0x0962..=0x0963
        // Bengali
        | 0x0981..=0x0983
        | 0x09BC
        | 0x09BE..=0x09C4
        | 0x09C7..=0x09C8
        | 0x09CB..=0x09CD
        // Gurmukhi
        | 0x0A01..=0x0A03
        | 0x0A3C
        | 0x0A3E..=0x0A42
        // Oriya
        | 0x0B01..=0x0B03
        | 0x0B3C..=0x0B4D
        // Telugu
        | 0x0C00..=0x0C03
        | 0x0C3E..=0x0C56
        // Malayalam
        | 0x0D00..=0x0D03
        | 0x0D3B..=0x0D4D
        // Thai
        | 0x0E31
        | 0x0E34..=0x0E3A
        | 0x0E47..=0x0E4E
        // Lao
        | 0x0EB1
        | 0x0EB4..=0x0EBC
        | 0x0EC8..=0x0ECD
        // Kana voiced / semi-voiced sound marks
        | 0x3099..=0x309A
        // Combining Half Marks
        | 0xFE20..=0xFE2F
    )
}