// parser/lib.rs

1use std::fmt::Debug;
2
3use anyhow::{Result, anyhow};
4use dynamic::{ConstIntOp, Dynamic, Type};
5use smol_str::SmolStr;
6
7mod expr;
8pub use expr::{BinaryOp, Expr, ExprKind, UnaryOp};
9
10mod pattern;
11pub use pattern::{Pattern, PatternKind};
12
13mod stmt;
14pub use stmt::{Stmt, StmtKind};
15
/// A pair of `start`/`end` offsets describing a region of the parser input.
///
/// `Parser::span_from` fills `end` from the parser's byte cursor, so the
/// offsets are presumably byte offsets into the source buffer.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct Span {
    /// Offset where the region begins.
    pub start: usize,
    /// Offset where the region ends.
    pub end: usize,
}

impl Span {
    /// Builds a span from explicit `start` and `end` offsets.
    pub const fn new(start: usize, end: usize) -> Self {
        Span { start, end }
    }

    /// Builds a zero-length span whose `start` and `end` are both `pos`.
    pub const fn empty(pos: usize) -> Self {
        Self::new(pos, pos)
    }

    /// Returns the smallest span covering both `self` and `other`.
    pub fn merge(self, other: Self) -> Self {
        let start = std::cmp::min(self.start, other.start);
        let end = std::cmp::max(self.end, other.end);
        Self::new(start, end)
    }
}
35
/// Byte-oriented cursor over a source buffer, used by the parsing methods below.
#[derive(Debug)]
pub struct Parser {
    pos: usize,   // current parse position (index into `buf`)
    buf: Vec<u8>, // the input being parsed, as raw bytes
    // Offsets read by `error_stmt`: its last entry is the start of the text
    // quoted in error reports. NOTE(review): nothing in this part of the file
    // pushes to it — presumably statement parsing does; confirm.
    spans: Vec<usize>,
}
42
/// Bytes that can never be part of an identifier; `ident` collects bytes until it
/// meets one of these, and `keyword` uses the same set as its word boundary.
const NOT_IDENT: &[u8] = &[b' ', b'\t', b'\n', b'\r', b'/', b'*', b'+', b'-', b'=', b'(', b')', b'{', b'}', b'[', b']', b';', b':', b',', b'.', b'<', b'>', b'!', b'#', b'$', b'%', b'^', b'&', b'|', b'\\', b'"', b'\''];
/// Bytes skipped as whitespace by `whitespace`.
const WHITE_SPACE: &[u8] = &[b' ', b'\t', b'\n', b'\r'];
/// Built-in type names and the `Type` each one maps to; consulted by `get_type`
/// and, for literal suffixes, by `numeric_suffix`.
const TYPES: &[(&str, Type)] = &[
    ("bool", Type::Bool),
    ("string", Type::Str),
    ("i8", Type::I8),
    ("i16", Type::I16),
    ("i32", Type::I32),
    ("i64", Type::I64),
    ("u8", Type::U8),
    ("u16", Type::U16),
    ("u32", Type::U32),
    ("u64", Type::U64),
    ("f16", Type::F16),
    ("f32", Type::F32),
    ("f64", Type::F64),
];
/// Reserved words; `ident` refuses to return any of these as an identifier.
const KEYWORDS: &[&str] = &["true", "false", "null", "let", "if", "else", "for", "in", "while", "pub", "fn", "struct", "impl", "const", "static", "continue", "return", "break"];
61
/// Parses a delimited list and evaluates to the filled collection.
///
/// * `$self` — parser value exposing `whitespace()`, `get()` and a `pos` field.
/// * `$start` — initial (usually empty) collection; must support `push`.
/// * `$end` — byte that terminates the list; it is consumed.
/// * `$sep` — separator byte; consumed when present after an item, but not required.
/// * `$item_expr` — expression that parses one item.
///
/// Uses `?`, so it must be expanded inside a function returning a compatible `Result`.
#[macro_export]
macro_rules! parse_list {
    ($self: ident, $start: expr, $end: expr, $sep: expr, $item_expr: expr) => {{
        let mut items = $start;
        loop {
            $self.whitespace()?;
            if $self.get()? == $end {
                // Consume the terminator and hand the finished list out of the loop.
                $self.pos += 1;
                break items;
            }
            let item = $item_expr;
            items.push(item);
            $self.whitespace()?;
            if $self.get()? == $sep {
                $self.pos += 1;
            }
        }
    }};
}
82
/// Evaluates `$method` and, if it yields `Err`, rewinds `$self.pos` to the value
/// it had before the attempt. The `Result` is passed through unchanged either way.
#[macro_export]
macro_rules! try_parse {
    ($self: ident, $method: expr) => {{
        // Remember the cursor so a failed attempt leaves no trace.
        let save_pos = $self.pos;
        let outcome = $method;
        if outcome.is_err() {
            $self.pos = save_pos;
        }
        outcome
    }};
}
96
/// Errors produced by the low-level `Parser` primitives.
#[derive(Debug, thiserror::Error)]
pub enum ParserErr {
    /// A specific character was expected (first field) but another was found
    /// (second field; `take` reports `'\0'` when the input has ended).
    #[error("期望字符 {0} 实际字符 {1}")]
    ExpectChar(char, char),
    /// `collect` matched zero characters.
    #[error("未发现期望字符")]
    NoCharCollect,
    /// The given literal text was expected at the cursor (`just` / `keyword`).
    #[error("期望字符串 {0}")]
    ExpectedString(SmolStr),
    /// The cursor is at or past the end of the input.
    #[error("输入结束")]
    EndofInput,
    /// A `/* ... */` comment was never closed.
    #[error("未关闭的注释")]
    UncloseComment,
    /// A raw string prefix (`r`, `r#`, ...) was not followed by `"`.
    #[error("非法的原始字符串")]
    IllegalRawString,
    /// A string literal was never closed.
    #[error("未关闭字符串")]
    UnclosedString,
    /// A string literal was expected but the cursor is not at `"`.
    #[error("非字符串")]
    NotString,
    /// A numeric literal was expected but no digits were found.
    #[error("非数字")]
    NotNumber,
}
118
119impl Parser {
120    pub fn new(buf: Vec<u8>) -> Self {
121        Self { pos: 0, buf, spans: Vec::new() }
122    }
123
124    pub fn is_eof(&self) -> bool {
125        self.pos >= self.buf.len()
126    }
127
128    pub fn get(&self) -> Result<u8> {
129        //查看当前字符
130        self.buf.get(self.pos).cloned().ok_or(ParserErr::EndofInput.into())
131    }
132
133    pub fn take(&mut self, ch: u8) -> Result<()> {
134        //如果当前字符为 ch 消费该字符 返回 Ok(())
135        if self.buf.get(self.pos).map(|b| *b == ch).unwrap_or(false) {
136            self.pos += 1;
137            Ok(())
138        } else {
139            Err(ParserErr::ExpectChar(ch as char, self.buf.get(self.pos as usize).cloned().unwrap_or(0) as char).into())
140        }
141    }
142
143    pub fn until(&mut self, ch: u8) -> Result<()> {
144        //消费直到指定字符 ch 忽略空白和注释
145        self.whitespace()?;
146        self.take(ch)
147    }
148
149    pub fn ahead(&self) -> Result<u8> {
150        //朝前看
151        self.buf.get(self.pos + 1).cloned().ok_or(ParserErr::EndofInput.into())
152    }
153
154    pub fn get_str(&self, start: usize, stop: usize) -> SmolStr {
155        SmolStr::from(String::from_utf8_lossy(&self.buf[start..stop]))
156    }
157
158    pub fn error_stmt(&self) -> SmolStr {
159        SmolStr::from(String::from_utf8_lossy(&self.buf[self.spans.last().cloned().unwrap_or(0)..self.pos]))
160    }
161
    /// Returns the cursor's current offset into the buffer.
    pub fn current_pos(&self) -> usize {
        self.pos
    }
165
166    pub fn span_from(&self, start: usize) -> Span {
167        Span::new(start, self.pos)
168    }
169
170    pub fn collect<F: Fn(u8) -> bool>(&mut self, f: F) -> Result<(usize, usize)> {
171        let start = self.pos;
172        while self.pos < self.buf.len() && f(self.buf[self.pos]) {
173            self.pos += 1;
174        }
175        if self.pos > start { Ok((start, self.pos)) } else { Err(ParserErr::NoCharCollect.into()) }
176    }
177
178    pub fn just(&mut self, pattern: &str) -> Result<()> {
179        if self.buf.len() - self.pos >= pattern.len() && self.buf[self.pos..self.pos + pattern.len()].eq(pattern.as_bytes()) {
180            self.pos += pattern.len();
181            Ok(())
182        } else {
183            Err(ParserErr::ExpectedString(SmolStr::new(pattern)).into())
184        }
185    }
186
187    pub fn keyword(&mut self, pattern: &str) -> Result<()> {
188        self.just(pattern)?;
189        if self.pos < self.buf.len() && !NOT_IDENT.contains(&self.buf[self.pos]) {
190            self.pos -= pattern.len();
191            return Err(ParserErr::ExpectedString(SmolStr::new(pattern)).into());
192        }
193        Ok(())
194    }
195
196    pub fn get_type(&mut self) -> Result<Type> {
197        self.whitespace()?;
198        if self.get()? == b'[' {
199            self.pos += 1;
200            let ty = self.get_type()?;
201            self.until(b';')?;
202            self.whitespace()?;
203            let len = self.get_type_param()?;
204            self.until(b']')?;
205            if let Type::ConstInt(number) = len {
206                let number = u32::try_from(number).map_err(|_| anyhow!("数组长度超出 u32 范围"))?;
207                Ok(Type::Array(std::rc::Rc::new(ty), number))
208            } else {
209                Ok(Type::ArrayParam(std::rc::Rc::new(ty), std::rc::Rc::new(len)))
210            }
211        } else {
212            for ty in TYPES {
213                if self.just(ty.0).is_ok() {
214                    return Ok(ty.1.clone());
215                }
216            }
217            let name = self.ident()?;
218            if self.take(b'<').is_ok() {
219                let params = crate::parse_list!(self, Vec::new(), b'>', b',', self.get_type_param()?);
220                Ok(Type::Ident { name, params })
221            } else {
222                Ok(Type::Ident { name, params: Vec::new() })
223            }
224        }
225    }
226
    /// Parses one type parameter expression; entry point of the constant
    /// expression grammar, delegating to the additive level.
    pub fn get_type_param(&mut self) -> Result<Type> {
        self.const_type_param_add()
    }
230
231    fn const_type_param_add(&mut self) -> Result<Type> {
232        let mut left = self.const_type_param_mul()?;
233        loop {
234            self.whitespace()?;
235            let op = if self.take(b'+').is_ok() {
236                Some(ConstIntOp::Add)
237            } else if self.take(b'-').is_ok() {
238                Some(ConstIntOp::Sub)
239            } else {
240                None
241            };
242            let Some(op) = op else { break };
243            let right = self.const_type_param_mul()?;
244            left = Self::fold_const_type_binary(op, left, right)?;
245        }
246        Ok(left)
247    }
248
249    fn const_type_param_mul(&mut self) -> Result<Type> {
250        let mut left = self.const_type_param_primary()?;
251        loop {
252            self.whitespace()?;
253            let op = if self.take(b'*').is_ok() {
254                Some(ConstIntOp::Mul)
255            } else if self.take(b'/').is_ok() {
256                Some(ConstIntOp::Div)
257            } else if self.take(b'%').is_ok() {
258                Some(ConstIntOp::Mod)
259            } else {
260                None
261            };
262            let Some(op) = op else { break };
263            let right = self.const_type_param_primary()?;
264            left = Self::fold_const_type_binary(op, left, right)?;
265        }
266        Ok(left)
267    }
268
269    fn const_type_param_primary(&mut self) -> Result<Type> {
270        self.whitespace()?;
271        if self.take(b'(').is_ok() {
272            let ty = self.get_type_param()?;
273            self.until(b')')?;
274            return Ok(ty);
275        }
276        if self.get()?.is_ascii_digit() {
277            let value = self.number()?;
278            if let Some(value) = value.as_uint() {
279                let value = i64::try_from(value).map_err(|_| anyhow!("模板数字参数超出 i64 范围"))?;
280                Ok(Type::ConstInt(value))
281            } else if let Some(value) = value.as_int() {
282                Ok(Type::ConstInt(value))
283            } else {
284                Err(anyhow!("模板数字参数必须是整数"))
285            }
286        } else {
287            self.get_type()
288        }
289    }
290
291    fn fold_const_type_binary(op: ConstIntOp, left: Type, right: Type) -> Result<Type> {
292        if let (Type::ConstInt(left), Type::ConstInt(right)) = (&left, &right) {
293            let value = match op {
294                ConstIntOp::Add => left + right,
295                ConstIntOp::Sub => left - right,
296                ConstIntOp::Mul => left * right,
297                ConstIntOp::Div => {
298                    if *right == 0 {
299                        return Err(anyhow!("模板整数除以 0"));
300                    }
301                    left / right
302                }
303                ConstIntOp::Mod => {
304                    if *right == 0 {
305                        return Err(anyhow!("模板整数取模 0"));
306                    }
307                    left % right
308                }
309            };
310            Ok(Type::ConstInt(value))
311        } else {
312            Ok(Type::ConstBinary { op, left: std::rc::Rc::new(left), right: std::rc::Rc::new(right) })
313        }
314    }
315
316    pub fn comment(&mut self) -> Result<()> {
317        if self.get()? == b'/' && self.ahead()? == b'/' {
318            self.pos += 2;
319            while self.pos < self.buf.len() && self.buf[self.pos] != b'\n' {
320                self.pos += 1;
321            }
322            Ok(())
323        } else if self.get()? == b'/' && self.ahead()? == b'*' {
324            self.pos += 2;
325            while self.pos + 1 < self.buf.len() {
326                if self.buf[self.pos] == b'*' && self.buf[self.pos + 1] == b'/' {
327                    self.pos += 2;
328                    return Ok(());
329                }
330                self.pos += 1;
331            }
332            Err(ParserErr::UncloseComment.into())
333        } else {
334            Ok(())
335        }
336    }
337
338    pub fn whitespace(&mut self) -> Result<()> {
339        while self.pos < self.buf.len() {
340            self.comment()?;
341            if self.pos >= self.buf.len() || !WHITE_SPACE.contains(&self.buf[self.pos]) {
342                break;
343            }
344            self.pos += 1;
345        }
346        Ok(())
347    }
348
349    pub fn ident(&mut self) -> Result<SmolStr> {
350        let (start, mut stop) = self.collect(|ch| !NOT_IDENT.contains(&ch))?;
351        while self.just("::").is_ok() {
352            (_, stop) = self.collect(|ch| !NOT_IDENT.contains(&ch))?;
353        }
354        if KEYWORDS.iter().position(|k| k.as_bytes() == &self.buf[start..stop]).is_some() {
355            return Err(anyhow!("发现关键字{}", String::from_utf8_lossy(&self.buf[start..stop])));
356        }
357        Ok(self.get_str(start, stop))
358    }
359
360    pub fn string(&mut self) -> Result<SmolStr> {
361        if self.buf[self.pos] == b'"' {
362            self.pos += 1;
363            let mut text_buf = Vec::new();
364            while self.pos < self.buf.len() {
365                if self.buf[self.pos] == b'\\' {
366                    //转义字符
367                    self.pos += 1;
368                    match self.buf[self.pos] {
369                        ch @ (b'n' | b'r' | b't' | b'\\' | b'"') => {
370                            text_buf.push(ch);
371                            self.pos += 1;
372                        }
373                        b'u' => {
374                            self.pos += 1;
375                            let unicode = if self.take(b'{').is_ok() {
376                                let code = self.hex()?;
377                                self.pos += 1;
378                                code
379                            } else {
380                                self.hex()?
381                            };
382                            let ch = char::from_u32(unicode as u32).ok_or(anyhow!("非法 unicode {}", unicode))?;
383                            let mut utf8_buf = [0u8; 4];
384                            let s = ch.encode_utf8(&mut utf8_buf);
385                            text_buf.extend_from_slice(s.as_bytes());
386                        }
387                        b'x' => {
388                            self.pos += 1;
389                            if self.pos + 2 < self.buf.len() {
390                                let start = self.pos;
391                                self.pos += 2;
392                                let hex = &self.buf[start..self.pos];
393                                let code = u32::from_str_radix(String::from_utf8_lossy(hex).as_ref(), 16)?;
394                                text_buf.push(code as u8);
395                            }
396                        }
397                        other => {
398                            return Err(anyhow!("invalid escape character: {}", other as char));
399                        }
400                    }
401                } else {
402                    if self.buf[self.pos] == b'"' {
403                        self.pos += 1;
404                        return Ok(String::from_utf8(text_buf)?.into());
405                    }
406                    text_buf.push(self.buf[self.pos]);
407                    self.pos += 1;
408                }
409            }
410            Err(ParserErr::UnclosedString.into())
411        } else {
412            Err(ParserErr::NotString.into())
413        }
414    }
415
416    pub fn text(&mut self) -> Result<SmolStr> {
417        if self.get()? == b'r' && [b'#', b'"'].contains(&self.ahead()?) {
418            self.pos += 1;
419            let mut end = String::new();
420            while self.buf[self.pos] == b'#' {
421                end.push('#');
422                self.pos += 1;
423            }
424            if self.get()? != b'"' {
425                return Err(ParserErr::IllegalRawString.into());
426            }
427            self.pos += 1;
428            let start_pos = self.pos;
429            while self.pos < self.buf.len() {
430                if self.just(&end).is_ok() {
431                    break;
432                }
433                self.pos += 1;
434            }
435            Ok(self.get_str(start_pos, self.pos - end.len()))
436        } else {
437            self.string()
438        }
439    }
440
441    fn hex(&mut self) -> Result<i32> {
442        //注意 hex 会消耗当前字符 设置新的 self.pos
443        let (start, stop) = self.collect(|ch| (ch >= b'0' && ch <= b'9') || (ch >= b'a' && ch <= b'f') || (ch >= b'A' && ch <= b'F'))?;
444        Ok(i32::from_str_radix(&String::from_utf8_lossy(&self.buf[start..stop]), 16)?)
445    }
446
447    fn numeric_suffix(&mut self) -> Option<Type> {
448        let save = self.pos;
449        for (name, ty) in TYPES {
450            if !ty.is_native() || *ty == Type::F16 {
451                continue;
452            }
453            if self.buf.len() >= self.pos + name.len() && self.buf[self.pos..self.pos + name.len()].eq(name.as_bytes()) {
454                self.pos += name.len();
455                return Some(ty.clone());
456            }
457        }
458        self.pos = save;
459        None
460    }
461
462    fn int_literal(&mut self, digits: &str, radix: u32, suffix: Option<Type>) -> Result<Dynamic> {
463        Ok(match suffix.unwrap_or(Type::I32) {
464            Type::I8 => Dynamic::I8(i128::from_str_radix(digits, radix)? as i8),
465            Type::I16 => Dynamic::I16(i128::from_str_radix(digits, radix)? as i16),
466            Type::I32 => Dynamic::I32(i128::from_str_radix(digits, radix)? as i32),
467            Type::I64 => Dynamic::I64(i128::from_str_radix(digits, radix)? as i64),
468            Type::U8 => Dynamic::U8(u128::from_str_radix(digits, radix)? as u8),
469            Type::U16 => Dynamic::U16(u128::from_str_radix(digits, radix)? as u16),
470            Type::U32 => Dynamic::U32(u128::from_str_radix(digits, radix)? as u32),
471            Type::U64 => Dynamic::U64(u128::from_str_radix(digits, radix)? as u64),
472            Type::F32 => Dynamic::F32(u128::from_str_radix(digits, radix)? as f32),
473            Type::F64 => Dynamic::F64(u128::from_str_radix(digits, radix)? as f64),
474            ty => return Err(anyhow!("{:?} 不能作为数字后缀", ty)),
475        })
476    }
477
478    fn float_literal(&mut self, digits: &str, suffix: Option<Type>) -> Result<Dynamic> {
479        let value: f64 = digits.parse()?;
480        Ok(match suffix.unwrap_or(Type::F32) {
481            Type::I8 => Dynamic::I8(value as i8),
482            Type::I16 => Dynamic::I16(value as i16),
483            Type::I32 => Dynamic::I32(value as i32),
484            Type::I64 => Dynamic::I64(value as i64),
485            Type::U8 => Dynamic::U8(value as u8),
486            Type::U16 => Dynamic::U16(value as u16),
487            Type::U32 => Dynamic::U32(value as u32),
488            Type::U64 => Dynamic::U64(value as u64),
489            Type::F32 => Dynamic::F32(value as f32),
490            Type::F64 => Dynamic::F64(value),
491            ty => return Err(anyhow!("{:?} 不能作为浮点数字后缀", ty)),
492        })
493    }
494
495    pub fn number(&mut self) -> Result<Dynamic> {
496        if self.get()? == b'0' {
497            if [b'b', b'B'].contains(&self.ahead()?) {
498                self.pos += 2;
499                let (start, stop) = self.collect(|ch| ch == b'0' || ch == b'1')?;
500                let s = String::from_utf8_lossy(&self.buf[start..stop]).to_string();
501                let suffix = self.numeric_suffix();
502                return self.int_literal(&s, 2, suffix);
503            } else if [b'o', b'O'].contains(&self.ahead()?) {
504                self.pos += 2;
505                let (start, stop) = self.collect(|ch| ch >= b'0' && ch <= b'7')?;
506                let s = String::from_utf8_lossy(&self.buf[start..stop]).to_string();
507                let suffix = self.numeric_suffix();
508                return self.int_literal(&s, 8, suffix);
509            } else if [b'x', b'X'].contains(&self.ahead()?) {
510                self.pos += 2;
511                let (start, stop) = self.collect(|ch| (ch >= b'0' && ch <= b'9') || (ch >= b'a' && ch <= b'f') || (ch >= b'A' && ch <= b'F'))?;
512                let s = String::from_utf8_lossy(&self.buf[start..stop]).to_string();
513                let suffix = self.numeric_suffix();
514                return self.int_literal(&s, 16, suffix);
515            }
516        }
517        let start = self.pos;
518        while self.pos < self.buf.len() && self.buf[self.pos] <= b'9' && self.buf[self.pos] >= b'0' {
519            self.pos += 1;
520        }
521        if self.pos < self.buf.len() && self.buf[self.pos] == b'.' && self.ahead().map(|ch| ch <= b'9' && ch >= b'0').unwrap_or(false) {
522            self.pos += 1;
523            while self.pos < self.buf.len() && self.buf[self.pos] <= b'9' && self.buf[self.pos] >= b'0' {
524                self.pos += 1;
525            }
526            if self.pos < self.buf.len() && (self.buf[self.pos] == b'e' || self.buf[self.pos] == b'E') && self.ahead().map(|ch| ch <= b'9' && ch >= b'0').unwrap_or(false) {
527                while self.pos < self.buf.len() && self.buf[self.pos] <= b'9' && self.buf[self.pos] >= b'0' {
528                    self.pos += 1;
529                }
530            }
531            if self.pos > start {
532                let text = String::from_utf8_lossy(&self.buf[start..self.pos]).to_string();
533                let suffix = self.numeric_suffix();
534                return self.float_literal(&text, suffix);
535            }
536        } else {
537            if self.pos > start {
538                let text = String::from_utf8_lossy(&self.buf[start..self.pos]).to_string();
539                let suffix = self.numeric_suffix();
540                return self.int_literal(&text, 10, suffix);
541            }
542        }
543        Err(ParserErr::NotNumber.into())
544    }
545}