Skip to main content

parser/
lib.rs

1use std::fmt::Debug;
2
3use anyhow::{Result, anyhow};
4use dynamic::{ConstIntOp, Dynamic, Type};
5use smol_str::SmolStr;
6
7mod expr;
8pub use expr::{BinaryOp, Expr, ExprKind, UnaryOp};
9
10mod pattern;
11pub use pattern::{Pattern, PatternKind};
12
13mod stmt;
14pub use stmt::{Stmt, StmtKind};
15
/// A pair of offsets marking where a region of the parsed input starts and ends.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct Span {
    /// Offset at which the region begins.
    pub start: usize,
    /// Offset at which the region ends.
    pub end: usize,
}

impl Span {
    /// Builds a span from explicit `start` and `end` offsets.
    pub const fn new(start: usize, end: usize) -> Self {
        Span { start, end }
    }

    /// Builds a zero-width span whose `start` and `end` are both `pos`.
    pub const fn empty(pos: usize) -> Self {
        Self::new(pos, pos)
    }

    /// Returns the smallest span covering both `self` and `other`:
    /// the lower of the two starts and the higher of the two ends.
    pub fn merge(self, other: Self) -> Self {
        let start = if other.start < self.start { other.start } else { self.start };
        let end = if other.end > self.end { other.end } else { self.end };
        Self::new(start, end)
    }
}
35
/// Parser state: a byte buffer plus a cursor into it.
#[derive(Debug)]
pub struct Parser {
    pos: usize,   // current parse position (index into `buf`)
    buf: Vec<u8>, // the input bytes to be parsed
    spans: Vec<usize>, // recorded offsets; `error_stmt` slices from the last one (or 0) up to `pos`
}
42
// Bytes that terminate an identifier: `ident` and `keyword` treat any byte NOT in this list
// as part of an identifier.
const NOT_IDENT: &[u8] = &[b' ', b'\t', b'\n', b'\r', b'/', b'*', b'+', b'-', b'=', b'(', b')', b'{', b'}', b'[', b']', b';', b':', b',', b'.', b'<', b'>', b'!', b'#', b'$', b'%', b'^', b'&', b'|', b'\\', b'"', b'\''];
// Bytes skipped by `whitespace`.
const WHITE_SPACE: &[u8] = &[b' ', b'\t', b'\n', b'\r'];
// Built-in type names and the `Type` each maps to; scanned in this order by `get_type`
// and (filtered) by `numeric_suffix`.
const TYPES: &[(&str, Type)] = &[
    ("bool", Type::Bool),
    ("string", Type::Str),
    ("i8", Type::I8),
    ("i16", Type::I16),
    ("i32", Type::I32),
    ("i64", Type::I64),
    ("u8", Type::U8),
    ("u16", Type::U16),
    ("u32", Type::U32),
    ("u64", Type::U64),
    ("f16", Type::F16),
    ("f32", Type::F32),
    ("f64", Type::F64),
];
// Reserved words: `ident` rejects a name that is exactly one of these.
const KEYWORDS: &[&str] = &["true", "false", "null", "let", "if", "else", "for", "in", "while", "pub", "fn", "struct", "impl", "const", "static", "continue", "return", "break"];
61
/// Parses a delimited list of items into a collection.
///
/// Arguments:
/// - `$self`: the parser; must expose `whitespace()`, `get()` and a `pos` field.
/// - `$start`: the initial collection value (anything with `push`).
/// - `$end`: the byte that closes the list; it is consumed.
/// - `$sep`: the byte separating items; it is consumed when present.
/// - `$item_expr`: expression evaluated once per item; its value is pushed.
///
/// The expansion uses `?`, so it must appear in a function returning a compatible `Result`.
/// Note that the separator is optional: when an item is followed by neither `$sep`
/// nor `$end`, the loop simply tries to parse another item.
#[macro_export]
macro_rules! parse_list {
    ($self: ident, $start: expr, $end: expr, $sep: expr, $item_expr: expr) => {{
        let mut items = $start;
        loop {
            $self.whitespace()?;
            // Closing delimiter: consume it and finish.
            if $self.get()? == $end {
                $self.pos += 1;
                break;
            }
            let item = $item_expr;
            items.push(item);
            $self.whitespace()?;
            // Separator after an item is consumed if present, but not required.
            if $self.get()? == $sep {
                $self.pos += 1;
            }
        }
        items
    }};
}
82
/// Evaluates `$method` as a speculative parse: on `Err`, `$self.pos` is restored to the
/// value it had before the attempt and the error is returned; on `Ok`, the value is
/// returned and the position is left where the parse ended.
#[macro_export]
macro_rules! try_parse {
    ($self: ident, $method: expr) => {{
        let save_pos = $self.pos; // remember the current pos so a failed attempt can rewind
        match $method {
            Ok(expr) => Ok(expr),
            Err(e) => {
                $self.pos = save_pos;
                Err(e)
            }
        }
    }};
}
96
/// Errors produced by the low-level scanning methods of `Parser`.
#[derive(Debug, thiserror::Error)]
pub enum ParserErr {
    /// A specific character was expected (first field) but another was found (second field).
    #[error("期望字符 {0} 实际字符 {1}")]
    ExpectChar(char, char),
    /// `collect` matched no characters at the current position.
    #[error("未发现期望字符")]
    NoCharCollect,
    /// The given literal string was expected at the current position.
    #[error("期望字符串 {0}")]
    ExpectedString(SmolStr),
    /// The end of the input was reached.
    #[error("输入结束")]
    EndofInput,
    /// A `/* ... */` comment was opened but never closed.
    #[error("未关闭的注释")]
    UncloseComment,
    /// A raw string prefix was not followed by an opening quote.
    #[error("非法的原始字符串")]
    IllegalRawString,
    /// A string literal was opened but never closed.
    #[error("未关闭字符串")]
    UnclosedString,
    /// The input at the current position is not a string literal.
    #[error("非字符串")]
    NotString,
    /// The input at the current position is not a number.
    #[error("非数字")]
    NotNumber,
}
118
119impl Parser {
120    pub fn new(buf: Vec<u8>) -> Self {
121        Self { pos: 0, buf, spans: Vec::new() }
122    }
123
124    pub fn is_eof(&self) -> bool {
125        self.pos >= self.buf.len()
126    }
127
128    pub fn get(&self) -> Result<u8> {
129        //查看当前字符
130        self.buf.get(self.pos).cloned().ok_or(ParserErr::EndofInput.into())
131    }
132
133    pub fn take(&mut self, ch: u8) -> Result<()> {
134        //如果当前字符为 ch 消费该字符 返回 Ok(())
135        if self.buf.get(self.pos).map(|b| *b == ch).unwrap_or(false) {
136            self.pos += 1;
137            Ok(())
138        } else {
139            Err(ParserErr::ExpectChar(ch as char, self.buf.get(self.pos as usize).cloned().unwrap_or(0) as char).into())
140        }
141    }
142
143    pub fn until(&mut self, ch: u8) -> Result<()> {
144        //消费直到指定字符 ch 忽略空白和注释
145        self.whitespace()?;
146        self.take(ch)
147    }
148
149    pub fn ahead(&self) -> Result<u8> {
150        //朝前看
151        self.buf.get(self.pos + 1).cloned().ok_or(ParserErr::EndofInput.into())
152    }
153
154    pub fn get_str(&self, start: usize, stop: usize) -> SmolStr {
155        SmolStr::from(String::from_utf8_lossy(&self.buf[start..stop]))
156    }
157
158    pub fn error_stmt(&self) -> SmolStr {
159        SmolStr::from(String::from_utf8_lossy(&self.buf[self.spans.last().cloned().unwrap_or(0)..self.pos]))
160    }
161
    /// Returns the parser's current offset into the input buffer.
    pub fn current_pos(&self) -> usize {
        self.pos
    }
165
166    pub fn span_from(&self, start: usize) -> Span {
167        Span::new(start, self.pos)
168    }
169
170    pub fn collect<F: Fn(u8) -> bool>(&mut self, f: F) -> Result<(usize, usize)> {
171        let start = self.pos;
172        while self.pos < self.buf.len() && f(self.buf[self.pos]) {
173            self.pos += 1;
174        }
175        if self.pos > start { Ok((start, self.pos)) } else { Err(ParserErr::NoCharCollect.into()) }
176    }
177
178    pub fn just(&mut self, pattern: &str) -> Result<()> {
179        if self.buf.len() - self.pos >= pattern.len() && self.buf[self.pos..self.pos + pattern.len()].eq(pattern.as_bytes()) {
180            self.pos += pattern.len();
181            Ok(())
182        } else {
183            Err(ParserErr::ExpectedString(SmolStr::new(pattern)).into())
184        }
185    }
186
187    pub fn keyword(&mut self, pattern: &str) -> Result<()> {
188        self.just(pattern)?;
189        if self.pos < self.buf.len() && !NOT_IDENT.contains(&self.buf[self.pos]) {
190            self.pos -= pattern.len();
191            return Err(ParserErr::ExpectedString(SmolStr::new(pattern)).into());
192        }
193        Ok(())
194    }
195
196    pub fn get_type(&mut self) -> Result<Type> {
197        self.whitespace()?;
198        if self.get()? == b'[' {
199            self.pos += 1;
200            let ty = self.get_type()?;
201            self.until(b';')?;
202            self.whitespace()?;
203            let len = self.get_type_param()?;
204            self.until(b']')?;
205            if let Type::ConstInt(number) = len {
206                let number = u32::try_from(number).map_err(|_| anyhow!("数组长度超出 u32 范围"))?;
207                Ok(Type::Array(std::rc::Rc::new(ty), number))
208            } else {
209                Ok(Type::ArrayParam(std::rc::Rc::new(ty), std::rc::Rc::new(len)))
210            }
211        } else {
212            for ty in TYPES {
213                if self.just(ty.0).is_ok() {
214                    return Ok(ty.1.clone());
215                }
216            }
217            let name = self.ident()?;
218            if self.take(b'<').is_ok() {
219                let params = crate::parse_list!(self, Vec::new(), b'>', b',', self.get_type_param()?);
220                Ok(Type::Ident { name, params })
221            } else {
222                Ok(Type::Ident { name, params: Vec::new() })
223            }
224        }
225    }
226
    /// Parses a type parameter. This is the entry point of the constant-expression
    /// grammar and simply delegates to its lowest-precedence level (`+` / `-`).
    pub fn get_type_param(&mut self) -> Result<Type> {
        self.const_type_param_add()
    }
230
231    fn const_type_param_add(&mut self) -> Result<Type> {
232        let mut left = self.const_type_param_mul()?;
233        loop {
234            self.whitespace()?;
235            let op = if self.take(b'+').is_ok() {
236                Some(ConstIntOp::Add)
237            } else if self.take(b'-').is_ok() {
238                Some(ConstIntOp::Sub)
239            } else {
240                None
241            };
242            let Some(op) = op else { break };
243            let right = self.const_type_param_mul()?;
244            left = Self::fold_const_type_binary(op, left, right)?;
245        }
246        Ok(left)
247    }
248
249    fn const_type_param_mul(&mut self) -> Result<Type> {
250        let mut left = self.const_type_param_primary()?;
251        loop {
252            self.whitespace()?;
253            let op = if self.take(b'*').is_ok() {
254                Some(ConstIntOp::Mul)
255            } else if self.take(b'/').is_ok() {
256                Some(ConstIntOp::Div)
257            } else if self.take(b'%').is_ok() {
258                Some(ConstIntOp::Mod)
259            } else {
260                None
261            };
262            let Some(op) = op else { break };
263            let right = self.const_type_param_primary()?;
264            left = Self::fold_const_type_binary(op, left, right)?;
265        }
266        Ok(left)
267    }
268
269    fn const_type_param_primary(&mut self) -> Result<Type> {
270        self.whitespace()?;
271        if self.take(b'(').is_ok() {
272            let ty = self.get_type_param()?;
273            self.until(b')')?;
274            return Ok(ty);
275        }
276        if self.get()?.is_ascii_digit() {
277            let value = self.number()?;
278            if let Some(value) = value.as_uint() {
279                let value = i64::try_from(value).map_err(|_| anyhow!("模板数字参数超出 i64 范围"))?;
280                Ok(Type::ConstInt(value))
281            } else if let Some(value) = value.as_int() {
282                Ok(Type::ConstInt(value))
283            } else {
284                Err(anyhow!("模板数字参数必须是整数"))
285            }
286        } else {
287            self.get_type()
288        }
289    }
290
291    fn fold_const_type_binary(op: ConstIntOp, left: Type, right: Type) -> Result<Type> {
292        if let (Type::ConstInt(left), Type::ConstInt(right)) = (&left, &right) {
293            let value = match op {
294                ConstIntOp::Add => left + right,
295                ConstIntOp::Sub => left - right,
296                ConstIntOp::Mul => left * right,
297                ConstIntOp::Div => {
298                    if *right == 0 {
299                        return Err(anyhow!("模板整数除以 0"));
300                    }
301                    left / right
302                }
303                ConstIntOp::Mod => {
304                    if *right == 0 {
305                        return Err(anyhow!("模板整数取模 0"));
306                    }
307                    left % right
308                }
309            };
310            Ok(Type::ConstInt(value))
311        } else {
312            Ok(Type::ConstBinary { op, left: std::rc::Rc::new(left), right: std::rc::Rc::new(right) })
313        }
314    }
315
316    pub fn comment(&mut self) -> Result<()> {
317        if self.get()? == b'/' && self.ahead()? == b'/' {
318            self.pos += 2;
319            while self.pos < self.buf.len() && self.buf[self.pos] != b'\n' {
320                self.pos += 1;
321            }
322            Ok(())
323        } else if self.get()? == b'/' && self.ahead()? == b'*' {
324            self.pos += 2;
325            while self.pos + 1 < self.buf.len() {
326                if self.buf[self.pos] == b'*' && self.buf[self.pos + 1] == b'/' {
327                    self.pos += 2;
328                    return Ok(());
329                }
330                self.pos += 1;
331            }
332            Err(ParserErr::UncloseComment.into())
333        } else {
334            Ok(())
335        }
336    }
337
338    pub fn whitespace(&mut self) -> Result<()> {
339        while self.pos < self.buf.len() {
340            self.comment()?;
341            if self.pos >= self.buf.len() || !WHITE_SPACE.contains(&self.buf[self.pos]) {
342                break;
343            }
344            self.pos += 1;
345        }
346        Ok(())
347    }
348
349    pub fn ident(&mut self) -> Result<SmolStr> {
350        let (start, mut stop) = self.collect(|ch| !NOT_IDENT.contains(&ch))?;
351        loop {
352            let save_pos = self.pos;
353            if self.just("::").is_err() {
354                break;
355            }
356            match self.collect(|ch| !NOT_IDENT.contains(&ch)) {
357                Ok((_, next_stop)) => {
358                    stop = next_stop;
359                }
360                Err(_) => {
361                    self.pos = save_pos;
362                    break;
363                }
364            }
365        }
366        if KEYWORDS.iter().position(|k| k.as_bytes() == &self.buf[start..stop]).is_some() {
367            return Err(anyhow!("发现关键字{}", String::from_utf8_lossy(&self.buf[start..stop])));
368        }
369        Ok(self.get_str(start, stop))
370    }
371
372    pub fn string(&mut self) -> Result<SmolStr> {
373        if self.buf[self.pos] == b'"' {
374            self.pos += 1;
375            let mut text_buf = Vec::new();
376            while self.pos < self.buf.len() {
377                if self.buf[self.pos] == b'\\' {
378                    //转义字符
379                    self.pos += 1;
380                    match self.buf[self.pos] {
381                        ch @ (b'n' | b'r' | b't' | b'\\' | b'"') => {
382                            text_buf.push(ch);
383                            self.pos += 1;
384                        }
385                        b'u' => {
386                            self.pos += 1;
387                            let unicode = if self.take(b'{').is_ok() {
388                                let code = self.hex()?;
389                                self.pos += 1;
390                                code
391                            } else {
392                                self.hex()?
393                            };
394                            let ch = char::from_u32(unicode as u32).ok_or(anyhow!("非法 unicode {}", unicode))?;
395                            let mut utf8_buf = [0u8; 4];
396                            let s = ch.encode_utf8(&mut utf8_buf);
397                            text_buf.extend_from_slice(s.as_bytes());
398                        }
399                        b'x' => {
400                            self.pos += 1;
401                            if self.pos + 2 < self.buf.len() {
402                                let start = self.pos;
403                                self.pos += 2;
404                                let hex = &self.buf[start..self.pos];
405                                let code = u32::from_str_radix(String::from_utf8_lossy(hex).as_ref(), 16)?;
406                                text_buf.push(code as u8);
407                            }
408                        }
409                        other => {
410                            return Err(anyhow!("invalid escape character: {}", other as char));
411                        }
412                    }
413                } else {
414                    if self.buf[self.pos] == b'"' {
415                        self.pos += 1;
416                        return Ok(String::from_utf8(text_buf)?.into());
417                    }
418                    text_buf.push(self.buf[self.pos]);
419                    self.pos += 1;
420                }
421            }
422            Err(ParserErr::UnclosedString.into())
423        } else {
424            Err(ParserErr::NotString.into())
425        }
426    }
427
428    pub fn text(&mut self) -> Result<SmolStr> {
429        if self.get()? == b'r' && [b'#', b'"'].contains(&self.ahead()?) {
430            self.pos += 1;
431            let mut end = String::new();
432            while self.buf[self.pos] == b'#' {
433                end.push('#');
434                self.pos += 1;
435            }
436            if self.get()? != b'"' {
437                return Err(ParserErr::IllegalRawString.into());
438            }
439            self.pos += 1;
440            let start_pos = self.pos;
441            while self.pos < self.buf.len() {
442                if self.just(&end).is_ok() {
443                    break;
444                }
445                self.pos += 1;
446            }
447            Ok(self.get_str(start_pos, self.pos - end.len()))
448        } else {
449            self.string()
450        }
451    }
452
453    fn hex(&mut self) -> Result<i32> {
454        //注意 hex 会消耗当前字符 设置新的 self.pos
455        let (start, stop) = self.collect(|ch| (ch >= b'0' && ch <= b'9') || (ch >= b'a' && ch <= b'f') || (ch >= b'A' && ch <= b'F'))?;
456        Ok(i32::from_str_radix(&String::from_utf8_lossy(&self.buf[start..stop]), 16)?)
457    }
458
459    fn numeric_suffix(&mut self) -> Option<Type> {
460        let save = self.pos;
461        for (name, ty) in TYPES {
462            if !ty.is_native() || *ty == Type::F16 {
463                continue;
464            }
465            if self.buf.len() >= self.pos + name.len() && self.buf[self.pos..self.pos + name.len()].eq(name.as_bytes()) {
466                self.pos += name.len();
467                return Some(ty.clone());
468            }
469        }
470        self.pos = save;
471        None
472    }
473
474    fn int_literal(&mut self, digits: &str, radix: u32, suffix: Option<Type>) -> Result<Dynamic> {
475        Ok(match suffix.unwrap_or(Type::I32) {
476            Type::I8 => Dynamic::I8(i128::from_str_radix(digits, radix)? as i8),
477            Type::I16 => Dynamic::I16(i128::from_str_radix(digits, radix)? as i16),
478            Type::I32 => Dynamic::I32(i128::from_str_radix(digits, radix)? as i32),
479            Type::I64 => Dynamic::I64(i128::from_str_radix(digits, radix)? as i64),
480            Type::U8 => Dynamic::U8(u128::from_str_radix(digits, radix)? as u8),
481            Type::U16 => Dynamic::U16(u128::from_str_radix(digits, radix)? as u16),
482            Type::U32 => Dynamic::U32(u128::from_str_radix(digits, radix)? as u32),
483            Type::U64 => Dynamic::U64(u128::from_str_radix(digits, radix)? as u64),
484            Type::F32 => Dynamic::F32(u128::from_str_radix(digits, radix)? as f32),
485            Type::F64 => Dynamic::F64(u128::from_str_radix(digits, radix)? as f64),
486            ty => return Err(anyhow!("{:?} 不能作为数字后缀", ty)),
487        })
488    }
489
490    fn float_literal(&mut self, digits: &str, suffix: Option<Type>) -> Result<Dynamic> {
491        let value: f64 = digits.parse()?;
492        Ok(match suffix.unwrap_or(Type::F32) {
493            Type::I8 => Dynamic::I8(value as i8),
494            Type::I16 => Dynamic::I16(value as i16),
495            Type::I32 => Dynamic::I32(value as i32),
496            Type::I64 => Dynamic::I64(value as i64),
497            Type::U8 => Dynamic::U8(value as u8),
498            Type::U16 => Dynamic::U16(value as u16),
499            Type::U32 => Dynamic::U32(value as u32),
500            Type::U64 => Dynamic::U64(value as u64),
501            Type::F32 => Dynamic::F32(value as f32),
502            Type::F64 => Dynamic::F64(value),
503            ty => return Err(anyhow!("{:?} 不能作为浮点数字后缀", ty)),
504        })
505    }
506
507    pub fn number(&mut self) -> Result<Dynamic> {
508        if self.get()? == b'0' {
509            if [b'b', b'B'].contains(&self.ahead()?) {
510                self.pos += 2;
511                let (start, stop) = self.collect(|ch| ch == b'0' || ch == b'1')?;
512                let s = String::from_utf8_lossy(&self.buf[start..stop]).to_string();
513                let suffix = self.numeric_suffix();
514                return self.int_literal(&s, 2, suffix);
515            } else if [b'o', b'O'].contains(&self.ahead()?) {
516                self.pos += 2;
517                let (start, stop) = self.collect(|ch| ch >= b'0' && ch <= b'7')?;
518                let s = String::from_utf8_lossy(&self.buf[start..stop]).to_string();
519                let suffix = self.numeric_suffix();
520                return self.int_literal(&s, 8, suffix);
521            } else if [b'x', b'X'].contains(&self.ahead()?) {
522                self.pos += 2;
523                let (start, stop) = self.collect(|ch| (ch >= b'0' && ch <= b'9') || (ch >= b'a' && ch <= b'f') || (ch >= b'A' && ch <= b'F'))?;
524                let s = String::from_utf8_lossy(&self.buf[start..stop]).to_string();
525                let suffix = self.numeric_suffix();
526                return self.int_literal(&s, 16, suffix);
527            }
528        }
529        let start = self.pos;
530        while self.pos < self.buf.len() && self.buf[self.pos] <= b'9' && self.buf[self.pos] >= b'0' {
531            self.pos += 1;
532        }
533        if self.pos < self.buf.len() && self.buf[self.pos] == b'.' && self.ahead().map(|ch| ch <= b'9' && ch >= b'0').unwrap_or(false) {
534            self.pos += 1;
535            while self.pos < self.buf.len() && self.buf[self.pos] <= b'9' && self.buf[self.pos] >= b'0' {
536                self.pos += 1;
537            }
538            if self.pos < self.buf.len() && (self.buf[self.pos] == b'e' || self.buf[self.pos] == b'E') && self.ahead().map(|ch| ch <= b'9' && ch >= b'0').unwrap_or(false) {
539                while self.pos < self.buf.len() && self.buf[self.pos] <= b'9' && self.buf[self.pos] >= b'0' {
540                    self.pos += 1;
541                }
542            }
543            if self.pos > start {
544                let text = String::from_utf8_lossy(&self.buf[start..self.pos]).to_string();
545                let suffix = self.numeric_suffix();
546                return self.float_literal(&text, suffix);
547            }
548        } else {
549            if self.pos > start {
550                let text = String::from_utf8_lossy(&self.buf[start..self.pos]).to_string();
551                let suffix = self.numeric_suffix();
552                return self.int_literal(&text, 10, suffix);
553            }
554        }
555        Err(ParserErr::NotNumber.into())
556    }
557}