libperl_macrogen/
lexer.rs

1use crate::error::{CompileError, LexError, Result};
2use crate::intern::{InternedStr, StringInterner};
3use crate::source::{FileId, SourceLocation};
4use crate::token::{Comment, CommentKind, Token, TokenKind};
5
6/// 識別子解決トレイト
7///
8/// Lexer が識別子をトークン化する際に使用する。
9/// 通常モード（Interning）では新しい識別子を intern し、
10/// 読み取り専用モード（LookupOnly）では既存の識別子のみを lookup する。
11pub trait IdentResolver {
12    /// 識別子文字列を InternedStr に解決
13    ///
14    /// 通常モード: intern して常に成功（Some を返す）
15    /// 読み取り専用モード: lookup のみ、見つからなければ None
16    fn resolve_ident(&mut self, s: &str) -> Option<InternedStr>;
17}
18
19/// 通常の intern を行うラッパー
20pub struct Interning<'a>(pub &'a mut StringInterner);
21
22impl IdentResolver for Interning<'_> {
23    fn resolve_ident(&mut self, s: &str) -> Option<InternedStr> {
24        Some(self.0.intern(s)) // 常に成功
25    }
26}
27
28/// lookup のみ行うラッパー（読み取り専用）
29pub struct LookupOnly<'a>(pub &'a StringInterner);
30
31impl IdentResolver for LookupOnly<'_> {
32    fn resolve_ident(&mut self, s: &str) -> Option<InternedStr> {
33        self.0.lookup(s) // 見つからなければ None
34    }
35}
36
37/// Lexer（ジェネリック版）
38pub struct Lexer<'a, R: IdentResolver> {
39    source: &'a [u8],
40    pos: usize,
41    line: u32,
42    column: u32,
43    file_id: FileId,
44    resolver: R,
45    /// スペース/タブをトークンとして返すかどうか（TinyCC の PARSE_FLAG_SPACES 相当）
46    return_spaces: bool,
47    /// ライフタイムマーカー（'a を使用するため）
48    _marker: std::marker::PhantomData<&'a ()>,
49}
50
51/// 通常の Lexer（mutable interner を使用）
52pub type MutableLexer<'a> = Lexer<'a, Interning<'a>>;
53
54/// 読み取り専用 Lexer（lookup のみ）
55pub type ReadOnlyLexer<'a> = Lexer<'a, LookupOnly<'a>>;
56
57impl<'a> Lexer<'a, Interning<'a>> {
58    /// 新しいLexerを作成（通常モード）
59    pub fn new(source: &'a [u8], file_id: FileId, interner: &'a mut StringInterner) -> Self {
60        Self {
61            source,
62            pos: 0,
63            line: 1,
64            column: 1,
65            file_id,
66            resolver: Interning(interner),
67            return_spaces: false,
68            _marker: std::marker::PhantomData,
69        }
70    }
71}
72
73impl<'a> Lexer<'a, LookupOnly<'a>> {
74    /// 新しいLexerを作成（読み取り専用モード）
75    ///
76    /// このモードでは、既に intern 済みの識別子のみを認識する。
77    /// 未知の識別子を検出した場合は LexError::UnknownIdentifier を返す。
78    pub fn new_readonly(source: &'a [u8], file_id: FileId, interner: &'a StringInterner) -> Self {
79        Self {
80            source,
81            pos: 0,
82            line: 1,
83            column: 1,
84            file_id,
85            resolver: LookupOnly(interner),
86            return_spaces: false,
87            _marker: std::marker::PhantomData,
88        }
89    }
90}
91
92impl<'a, R: IdentResolver> Lexer<'a, R> {
93
94    /// スペースをトークンとして返すかどうかを設定
95    pub fn set_return_spaces(&mut self, enabled: bool) {
96        self.return_spaces = enabled;
97    }
98
99    /// 現在のスペース返却モードを取得
100    pub fn return_spaces(&self) -> bool {
101        self.return_spaces
102    }
103
104    /// 現在位置を取得
105    pub fn current_location(&self) -> SourceLocation {
106        SourceLocation::new(self.file_id, self.line, self.column)
107    }
108
109    /// ファイルIDを取得
110    pub fn file_id(&self) -> FileId {
111        self.file_id
112    }
113
114    /// 次のトークンを取得
115    pub fn next_token(&mut self) -> Result<Token> {
116        let mut leading_comments = Vec::new();
117
118        loop {
119            // return_spaces モードの場合、空白をトークンとして返す
120            if self.return_spaces {
121                if let Some(c) = self.peek() {
122                    if c == b' ' || c == b'\t' {
123                        let loc = self.current_location();
124                        self.advance();
125                        // 連続する空白は1つのSpaceトークンにまとめる
126                        while let Some(c) = self.peek() {
127                            if c == b' ' || c == b'\t' {
128                                self.advance();
129                            } else {
130                                break;
131                            }
132                        }
133                        return Ok(Token::with_comments(TokenKind::Space, loc, leading_comments));
134                    }
135                }
136            } else {
137                self.skip_whitespace();
138            }
139
140            match (self.peek(), self.peek_n(1)) {
141                (Some(b'/'), Some(b'/')) => {
142                    let comment = self.scan_line_comment();
143                    leading_comments.push(comment);
144                }
145                (Some(b'/'), Some(b'*')) => {
146                    let comment = self.scan_block_comment()?;
147                    leading_comments.push(comment);
148                }
149                _ => break,
150            }
151        }
152
153        let loc = self.current_location();
154        let kind = self.scan_token_kind()?;
155
156        Ok(Token::with_comments(kind, loc, leading_comments))
157    }
158
159    /// 現在の文字をピーク
160    fn peek(&self) -> Option<u8> {
161        self.source.get(self.pos).copied()
162    }
163
164    /// n文字先をピーク
165    fn peek_n(&self, n: usize) -> Option<u8> {
166        self.source.get(self.pos + n).copied()
167    }
168
169    /// 1文字進める
170    fn advance(&mut self) -> Option<u8> {
171        let c = self.peek()?;
172        self.pos += 1;
173        if c == b'\n' {
174            self.line += 1;
175            self.column = 1;
176        } else {
177            self.column += 1;
178        }
179        Some(c)
180    }
181
182    /// 空白をスキップ（改行は含まない - プリプロセッサのため）
183    fn skip_whitespace(&mut self) {
184        while let Some(c) = self.peek() {
185            if c == b' ' || c == b'\t' || c == b'\r' {
186                self.advance();
187            } else {
188                break;
189            }
190        }
191    }
192
193    /// 行コメントをスキャン
194    fn scan_line_comment(&mut self) -> Comment {
195        let loc = self.current_location();
196        self.advance(); // /
197        self.advance(); // /
198
199        let start = self.pos;
200        while self.peek().is_some_and(|c| c != b'\n') {
201            self.advance();
202        }
203        let text = String::from_utf8_lossy(&self.source[start..self.pos]).to_string();
204
205        Comment::new(CommentKind::Line, text, loc)
206    }
207
208    /// ブロックコメントをスキャン
209    fn scan_block_comment(&mut self) -> Result<Comment> {
210        let loc = self.current_location();
211        self.advance(); // /
212        self.advance(); // *
213
214        let start = self.pos;
215        loop {
216            match (self.peek(), self.peek_n(1)) {
217                (Some(b'*'), Some(b'/')) => {
218                    let text = String::from_utf8_lossy(&self.source[start..self.pos]).to_string();
219                    self.advance(); // *
220                    self.advance(); // /
221                    return Ok(Comment::new(CommentKind::Block, text, loc));
222                }
223                (Some(_), _) => {
224                    self.advance();
225                }
226                (None, _) => {
227                    return Err(CompileError::Lex {
228                        loc,
229                        kind: LexError::UnterminatedComment,
230                    });
231                }
232            }
233        }
234    }
235
236    /// トークン種別をスキャン
237    fn scan_token_kind(&mut self) -> Result<TokenKind> {
238        let Some(c) = self.peek() else {
239            return Ok(TokenKind::Eof);
240        };
241
242        match c {
243            // 改行（プリプロセッサのために独立したトークンとして扱う）
244            b'\n' => {
245                self.advance();
246                Ok(TokenKind::Newline)
247            }
248            // ワイド文字列/文字リテラル（識別子より先にチェック）
249            b'L' if matches!(self.peek_n(1), Some(b'"') | Some(b'\'')) => {
250                self.advance(); // L
251                if self.peek() == Some(b'"') {
252                    self.scan_wide_string()
253                } else {
254                    self.scan_wide_char()
255                }
256            }
257
258            // 識別子またはキーワード
259            b'a'..=b'z' | b'A'..=b'Z' | b'_' => self.scan_identifier(),
260
261            // 数値リテラル
262            b'0'..=b'9' => self.scan_number(),
263
264            // 文字列リテラル
265            b'"' => self.scan_string(),
266
267            // 文字リテラル
268            b'\'' => self.scan_char(),
269
270            // 演算子・区切り記号
271            b'+' => self.scan_plus(),
272            b'-' => self.scan_minus(),
273            b'*' => self.scan_star(),
274            b'/' => self.scan_slash(),
275            b'%' => self.scan_percent(),
276            b'&' => self.scan_amp(),
277            b'|' => self.scan_pipe(),
278            b'^' => self.scan_caret(),
279            b'~' => {
280                self.advance();
281                Ok(TokenKind::Tilde)
282            }
283            b'!' => self.scan_bang(),
284            b'<' => self.scan_lt(),
285            b'>' => self.scan_gt(),
286            b'=' => self.scan_eq(),
287            b'?' => {
288                self.advance();
289                Ok(TokenKind::Question)
290            }
291            b':' => {
292                self.advance();
293                Ok(TokenKind::Colon)
294            }
295            b'.' => self.scan_dot(),
296            b',' => {
297                self.advance();
298                Ok(TokenKind::Comma)
299            }
300            b';' => {
301                self.advance();
302                Ok(TokenKind::Semi)
303            }
304            b'(' => {
305                self.advance();
306                Ok(TokenKind::LParen)
307            }
308            b')' => {
309                self.advance();
310                Ok(TokenKind::RParen)
311            }
312            b'[' => {
313                self.advance();
314                Ok(TokenKind::LBracket)
315            }
316            b']' => {
317                self.advance();
318                Ok(TokenKind::RBracket)
319            }
320            b'{' => {
321                self.advance();
322                Ok(TokenKind::LBrace)
323            }
324            b'}' => {
325                self.advance();
326                Ok(TokenKind::RBrace)
327            }
328            b'#' => self.scan_hash(),
329
330            _ => {
331                let loc = self.current_location();
332                self.advance();
333                Err(CompileError::Lex {
334                    loc,
335                    kind: LexError::InvalidChar(c as char),
336                })
337            }
338        }
339    }
340
341    /// 識別子またはキーワードをスキャン
342    fn scan_identifier(&mut self) -> Result<TokenKind> {
343        let loc = self.current_location();
344        let start = self.pos;
345        while let Some(c) = self.peek() {
346            if c.is_ascii_alphanumeric() || c == b'_' {
347                self.advance();
348            } else {
349                break;
350            }
351        }
352
353        let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
354
355        // キーワードなら対応するTokenKindを返す
356        if let Some(kw) = TokenKind::from_keyword(text) {
357            Ok(kw)
358        } else {
359            // resolver を使って識別子を解決
360            match self.resolver.resolve_ident(text) {
361                Some(interned) => Ok(TokenKind::Ident(interned)),
362                None => Err(CompileError::Lex {
363                    loc,
364                    kind: LexError::UnknownIdentifier(text.to_string()),
365                }),
366            }
367        }
368    }
369
370    /// 数値リテラルをスキャン
371    fn scan_number(&mut self) -> Result<TokenKind> {
372        let loc = self.current_location();
373        let start = self.pos;
374
375        // 16進数、8進数、2進数の判定
376        if self.peek() == Some(b'0') {
377            self.advance();
378            match self.peek() {
379                Some(b'x') | Some(b'X') => return self.scan_hex_number(start, loc),
380                Some(b'b') | Some(b'B') => return self.scan_binary_number(start, loc),
381                Some(b'0'..=b'7') => return self.scan_octal_number(start, loc),
382                Some(b'.') | Some(b'e') | Some(b'E') => {
383                    // 浮動小数点
384                    return self.scan_float_number(start, loc);
385                }
386                _ => {
387                    // 単なる 0
388                    return self.finish_integer(start, loc);
389                }
390            }
391        }
392
393        // 10進数
394        while self.peek().is_some_and(|c| c.is_ascii_digit()) {
395            self.advance();
396        }
397
398        // 浮動小数点チェック
399        if matches!(self.peek(), Some(b'.') | Some(b'e') | Some(b'E')) {
400            return self.scan_float_number(start, loc);
401        }
402
403        self.finish_integer(start, loc)
404    }
405
406    /// 16進数をスキャン
407    fn scan_hex_number(&mut self, start: usize, loc: SourceLocation) -> Result<TokenKind> {
408        self.advance(); // x or X
409
410        let hex_start = self.pos;
411        while self.peek().is_some_and(|c| c.is_ascii_hexdigit()) {
412            self.advance();
413        }
414
415        if self.pos == hex_start {
416            return Err(CompileError::Lex {
417                loc,
418                kind: LexError::InvalidNumber("0x".to_string()),
419            });
420        }
421
422        self.finish_integer(start, loc)
423    }
424
425    /// 2進数をスキャン
426    fn scan_binary_number(&mut self, start: usize, loc: SourceLocation) -> Result<TokenKind> {
427        self.advance(); // b or B
428
429        let bin_start = self.pos;
430        while matches!(self.peek(), Some(b'0') | Some(b'1')) {
431            self.advance();
432        }
433
434        if self.pos == bin_start {
435            return Err(CompileError::Lex {
436                loc,
437                kind: LexError::InvalidNumber("0b".to_string()),
438            });
439        }
440
441        self.finish_integer(start, loc)
442    }
443
444    /// 8進数をスキャン
445    fn scan_octal_number(&mut self, start: usize, loc: SourceLocation) -> Result<TokenKind> {
446        while self.peek().is_some_and(|c| matches!(c, b'0'..=b'7')) {
447            self.advance();
448        }
449
450        self.finish_integer(start, loc)
451    }
452
453    /// 浮動小数点数をスキャン
454    fn scan_float_number(&mut self, start: usize, loc: SourceLocation) -> Result<TokenKind> {
455        // 小数部
456        if self.peek() == Some(b'.') {
457            self.advance();
458            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
459                self.advance();
460            }
461        }
462
463        // 指数部
464        if matches!(self.peek(), Some(b'e') | Some(b'E')) {
465            self.advance();
466            if matches!(self.peek(), Some(b'+') | Some(b'-')) {
467                self.advance();
468            }
469            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
470                self.advance();
471            }
472        }
473
474        // サフィックス
475        let _is_float = matches!(self.peek(), Some(b'f') | Some(b'F'));
476        let _is_long = matches!(self.peek(), Some(b'l') | Some(b'L'));
477        if _is_float || _is_long {
478            self.advance();
479        }
480
481        let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
482        let value: f64 = text
483            .trim_end_matches(|c| c == 'f' || c == 'F' || c == 'l' || c == 'L')
484            .parse()
485            .map_err(|_| CompileError::Lex {
486                loc: loc.clone(),
487                kind: LexError::InvalidNumber(text.to_string()),
488            })?;
489
490        Ok(TokenKind::FloatLit(value))
491    }
492
493    /// 整数リテラルの仕上げ（サフィックス処理）
494    fn finish_integer(&mut self, start: usize, loc: SourceLocation) -> Result<TokenKind> {
495        // サフィックス: u/U, l/L, ll/LL
496        let mut is_unsigned = false;
497        let mut is_long = false;
498        let mut is_longlong = false;
499
500        loop {
501            match self.peek() {
502                Some(b'u') | Some(b'U') => {
503                    is_unsigned = true;
504                    self.advance();
505                }
506                Some(b'l') | Some(b'L') => {
507                    if is_long {
508                        is_longlong = true;
509                    }
510                    is_long = true;
511                    self.advance();
512                }
513                _ => break,
514            }
515        }
516
517        let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
518        let num_text = text
519            .trim_start_matches("0x")
520            .trim_start_matches("0X")
521            .trim_start_matches("0b")
522            .trim_start_matches("0B")
523            .trim_end_matches(|c: char| c == 'u' || c == 'U' || c == 'l' || c == 'L');
524
525        let radix = if text.starts_with("0x") || text.starts_with("0X") {
526            16
527        } else if text.starts_with("0b") || text.starts_with("0B") {
528            2
529        } else if text.starts_with('0') && text.len() > 1 && !text.contains('.') {
530            8
531        } else {
532            10
533        };
534
535        if is_unsigned || is_longlong {
536            let value = u64::from_str_radix(num_text, radix).map_err(|_| CompileError::Lex {
537                loc: loc.clone(),
538                kind: LexError::InvalidNumber(text.to_string()),
539            })?;
540            Ok(TokenKind::UIntLit(value))
541        } else {
542            let value = i64::from_str_radix(num_text, radix).map_err(|_| CompileError::Lex {
543                loc: loc.clone(),
544                kind: LexError::InvalidNumber(text.to_string()),
545            })?;
546            Ok(TokenKind::IntLit(value))
547        }
548    }
549
550    /// 文字列リテラルをスキャン
551    fn scan_string(&mut self) -> Result<TokenKind> {
552        let loc = self.current_location();
553        self.advance(); // "
554
555        let mut bytes = Vec::new();
556        loop {
557            match self.peek() {
558                Some(b'"') => {
559                    self.advance();
560                    return Ok(TokenKind::StringLit(bytes));
561                }
562                Some(b'\\') => {
563                    self.advance();
564                    let escaped = self.scan_escape_sequence(&loc)?;
565                    bytes.push(escaped);
566                }
567                Some(b'\n') | None => {
568                    return Err(CompileError::Lex {
569                        loc,
570                        kind: LexError::UnterminatedString,
571                    });
572                }
573                Some(c) => {
574                    self.advance();
575                    bytes.push(c);
576                }
577            }
578        }
579    }
580
581    /// ワイド文字列リテラルをスキャン
582    fn scan_wide_string(&mut self) -> Result<TokenKind> {
583        let loc = self.current_location();
584        self.advance(); // "
585
586        let mut chars = Vec::new();
587        loop {
588            match self.peek() {
589                Some(b'"') => {
590                    self.advance();
591                    return Ok(TokenKind::WideStringLit(chars));
592                }
593                Some(b'\\') => {
594                    self.advance();
595                    let escaped = self.scan_escape_sequence(&loc)?;
596                    chars.push(escaped as u32);
597                }
598                Some(b'\n') | None => {
599                    return Err(CompileError::Lex {
600                        loc,
601                        kind: LexError::UnterminatedString,
602                    });
603                }
604                Some(c) => {
605                    self.advance();
606                    chars.push(c as u32);
607                }
608            }
609        }
610    }
611
612    /// 文字リテラルをスキャン
613    fn scan_char(&mut self) -> Result<TokenKind> {
614        let loc = self.current_location();
615        self.advance(); // '
616
617        let value = match self.peek() {
618            Some(b'\'') => {
619                return Err(CompileError::Lex {
620                    loc,
621                    kind: LexError::EmptyCharLit,
622                });
623            }
624            Some(b'\\') => {
625                self.advance();
626                self.scan_escape_sequence(&loc)?
627            }
628            Some(c) => {
629                self.advance();
630                c
631            }
632            None => {
633                return Err(CompileError::Lex {
634                    loc,
635                    kind: LexError::UnterminatedChar,
636                });
637            }
638        };
639
640        if self.peek() != Some(b'\'') {
641            return Err(CompileError::Lex {
642                loc,
643                kind: LexError::UnterminatedChar,
644            });
645        }
646        self.advance(); // '
647
648        Ok(TokenKind::CharLit(value))
649    }
650
651    /// ワイド文字リテラルをスキャン
652    fn scan_wide_char(&mut self) -> Result<TokenKind> {
653        let loc = self.current_location();
654        self.advance(); // '
655
656        let value = match self.peek() {
657            Some(b'\'') => {
658                return Err(CompileError::Lex {
659                    loc,
660                    kind: LexError::EmptyCharLit,
661                });
662            }
663            Some(b'\\') => {
664                self.advance();
665                self.scan_escape_sequence(&loc)? as u32
666            }
667            Some(c) => {
668                self.advance();
669                c as u32
670            }
671            None => {
672                return Err(CompileError::Lex {
673                    loc,
674                    kind: LexError::UnterminatedChar,
675                });
676            }
677        };
678
679        if self.peek() != Some(b'\'') {
680            return Err(CompileError::Lex {
681                loc,
682                kind: LexError::UnterminatedChar,
683            });
684        }
685        self.advance(); // '
686
687        Ok(TokenKind::WideCharLit(value))
688    }
689
690    /// エスケープシーケンスをスキャン
691    fn scan_escape_sequence(&mut self, loc: &SourceLocation) -> Result<u8> {
692        match self.peek() {
693            Some(b'n') => {
694                self.advance();
695                Ok(b'\n')
696            }
697            Some(b't') => {
698                self.advance();
699                Ok(b'\t')
700            }
701            Some(b'r') => {
702                self.advance();
703                Ok(b'\r')
704            }
705            Some(b'\\') => {
706                self.advance();
707                Ok(b'\\')
708            }
709            Some(b'\'') => {
710                self.advance();
711                Ok(b'\'')
712            }
713            Some(b'"') => {
714                self.advance();
715                Ok(b'"')
716            }
717            Some(b'0') => {
718                self.advance();
719                Ok(0)
720            }
721            Some(b'a') => {
722                self.advance();
723                Ok(0x07) // bell
724            }
725            Some(b'b') => {
726                self.advance();
727                Ok(0x08) // backspace
728            }
729            Some(b'f') => {
730                self.advance();
731                Ok(0x0C) // form feed
732            }
733            Some(b'v') => {
734                self.advance();
735                Ok(0x0B) // vertical tab
736            }
737            Some(b'x') => {
738                self.advance();
739                self.scan_hex_escape(loc)
740            }
741            Some(c @ b'0'..=b'7') => self.scan_octal_escape(c),
742            Some(c) => Err(CompileError::Lex {
743                loc: loc.clone(),
744                kind: LexError::InvalidEscape(c as char),
745            }),
746            None => Err(CompileError::Lex {
747                loc: loc.clone(),
748                kind: LexError::UnterminatedString,
749            }),
750        }
751    }
752
753    /// 16進エスケープをスキャン
754    fn scan_hex_escape(&mut self, loc: &SourceLocation) -> Result<u8> {
755        let mut value = 0u8;
756        let mut count = 0;
757
758        while let Some(c) = self.peek() {
759            if let Some(digit) = (c as char).to_digit(16) {
760                value = value.wrapping_mul(16).wrapping_add(digit as u8);
761                self.advance();
762                count += 1;
763                if count >= 2 {
764                    break;
765                }
766            } else {
767                break;
768            }
769        }
770
771        if count == 0 {
772            return Err(CompileError::Lex {
773                loc: loc.clone(),
774                kind: LexError::InvalidEscape('x'),
775            });
776        }
777
778        Ok(value)
779    }
780
781    /// 8進エスケープをスキャン
782    fn scan_octal_escape(&mut self, first: u8) -> Result<u8> {
783        let mut value = (first - b'0') as u8;
784        self.advance();
785
786        for _ in 0..2 {
787            if let Some(c @ b'0'..=b'7') = self.peek() {
788                value = value * 8 + (c - b'0');
789                self.advance();
790            } else {
791                break;
792            }
793        }
794
795        Ok(value)
796    }
797
    // === Operator scanners ===
799
800    fn scan_plus(&mut self) -> Result<TokenKind> {
801        self.advance();
802        match self.peek() {
803            Some(b'+') => {
804                self.advance();
805                Ok(TokenKind::PlusPlus)
806            }
807            Some(b'=') => {
808                self.advance();
809                Ok(TokenKind::PlusEq)
810            }
811            _ => Ok(TokenKind::Plus),
812        }
813    }
814
815    fn scan_minus(&mut self) -> Result<TokenKind> {
816        self.advance();
817        match self.peek() {
818            Some(b'-') => {
819                self.advance();
820                Ok(TokenKind::MinusMinus)
821            }
822            Some(b'=') => {
823                self.advance();
824                Ok(TokenKind::MinusEq)
825            }
826            Some(b'>') => {
827                self.advance();
828                Ok(TokenKind::Arrow)
829            }
830            _ => Ok(TokenKind::Minus),
831        }
832    }
833
834    fn scan_star(&mut self) -> Result<TokenKind> {
835        self.advance();
836        if self.peek() == Some(b'=') {
837            self.advance();
838            Ok(TokenKind::StarEq)
839        } else {
840            Ok(TokenKind::Star)
841        }
842    }
843
844    fn scan_slash(&mut self) -> Result<TokenKind> {
845        self.advance();
846        if self.peek() == Some(b'=') {
847            self.advance();
848            Ok(TokenKind::SlashEq)
849        } else {
850            Ok(TokenKind::Slash)
851        }
852    }
853
854    fn scan_percent(&mut self) -> Result<TokenKind> {
855        self.advance();
856        if self.peek() == Some(b'=') {
857            self.advance();
858            Ok(TokenKind::PercentEq)
859        } else {
860            Ok(TokenKind::Percent)
861        }
862    }
863
864    fn scan_amp(&mut self) -> Result<TokenKind> {
865        self.advance();
866        match self.peek() {
867            Some(b'&') => {
868                self.advance();
869                Ok(TokenKind::AmpAmp)
870            }
871            Some(b'=') => {
872                self.advance();
873                Ok(TokenKind::AmpEq)
874            }
875            _ => Ok(TokenKind::Amp),
876        }
877    }
878
879    fn scan_pipe(&mut self) -> Result<TokenKind> {
880        self.advance();
881        match self.peek() {
882            Some(b'|') => {
883                self.advance();
884                Ok(TokenKind::PipePipe)
885            }
886            Some(b'=') => {
887                self.advance();
888                Ok(TokenKind::PipeEq)
889            }
890            _ => Ok(TokenKind::Pipe),
891        }
892    }
893
894    fn scan_caret(&mut self) -> Result<TokenKind> {
895        self.advance();
896        if self.peek() == Some(b'=') {
897            self.advance();
898            Ok(TokenKind::CaretEq)
899        } else {
900            Ok(TokenKind::Caret)
901        }
902    }
903
904    fn scan_bang(&mut self) -> Result<TokenKind> {
905        self.advance();
906        if self.peek() == Some(b'=') {
907            self.advance();
908            Ok(TokenKind::BangEq)
909        } else {
910            Ok(TokenKind::Bang)
911        }
912    }
913
914    fn scan_lt(&mut self) -> Result<TokenKind> {
915        self.advance();
916        match self.peek() {
917            Some(b'<') => {
918                self.advance();
919                if self.peek() == Some(b'=') {
920                    self.advance();
921                    Ok(TokenKind::LtLtEq)
922                } else {
923                    Ok(TokenKind::LtLt)
924                }
925            }
926            Some(b'=') => {
927                self.advance();
928                Ok(TokenKind::LtEq)
929            }
930            _ => Ok(TokenKind::Lt),
931        }
932    }
933
934    fn scan_gt(&mut self) -> Result<TokenKind> {
935        self.advance();
936        match self.peek() {
937            Some(b'>') => {
938                self.advance();
939                if self.peek() == Some(b'=') {
940                    self.advance();
941                    Ok(TokenKind::GtGtEq)
942                } else {
943                    Ok(TokenKind::GtGt)
944                }
945            }
946            Some(b'=') => {
947                self.advance();
948                Ok(TokenKind::GtEq)
949            }
950            _ => Ok(TokenKind::Gt),
951        }
952    }
953
954    fn scan_eq(&mut self) -> Result<TokenKind> {
955        self.advance();
956        if self.peek() == Some(b'=') {
957            self.advance();
958            Ok(TokenKind::EqEq)
959        } else {
960            Ok(TokenKind::Eq)
961        }
962    }
963
964    fn scan_dot(&mut self) -> Result<TokenKind> {
965        self.advance();
966        if self.peek() == Some(b'.') && self.peek_n(1) == Some(b'.') {
967            self.advance();
968            self.advance();
969            Ok(TokenKind::Ellipsis)
970        } else {
971            Ok(TokenKind::Dot)
972        }
973    }
974
975    fn scan_hash(&mut self) -> Result<TokenKind> {
976        self.advance();
977        if self.peek() == Some(b'#') {
978            self.advance();
979            Ok(TokenKind::HashHash)
980        } else {
981            Ok(TokenKind::Hash)
982        }
983    }
984}
985
#[cfg(test)]
mod tests {
    use super::*;

    /// Pulls tokens from `lexer` until `Eof` and returns their kinds
    /// (the `Eof` itself is not included). Panics on any lexing error.
    fn drain_kinds<R: IdentResolver>(lexer: &mut Lexer<'_, R>) -> Vec<TokenKind> {
        let mut kinds = Vec::new();
        loop {
            let kind = lexer.next_token().unwrap().kind;
            if matches!(kind, TokenKind::Eof) {
                return kinds;
            }
            kinds.push(kind);
        }
    }

    /// Lexes `source` with a fresh interner and returns the token kinds.
    fn lex(source: &str) -> Vec<TokenKind> {
        let mut interner = StringInterner::new();
        let mut lexer = Lexer::new(source.as_bytes(), FileId::default(), &mut interner);
        drain_kinds(&mut lexer)
    }

    #[test]
    fn test_operators() {
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Percent,
            TokenKind::PlusPlus,
            TokenKind::MinusMinus,
            TokenKind::PlusEq,
            TokenKind::MinusEq,
            TokenKind::Arrow,
            TokenKind::EqEq,
            TokenKind::BangEq,
            TokenKind::LtEq,
            TokenKind::GtEq,
        ];
        assert_eq!(lex("+ - * / % ++ -- += -= -> == != <= >="), expected);
    }

    #[test]
    fn test_keywords_and_identifiers() {
        // Keywords come back as their dedicated `Kw*` kinds, while any other
        // name comes back as `TokenKind::Ident`.
        let mut interner = StringInterner::new();
        let mut lexer = Lexer::new(
            b"int if else while for return struct foo",
            FileId::default(),
            &mut interner,
        );
        let tokens = drain_kinds(&mut lexer);

        // The first seven tokens are keywords.
        assert!(matches!(tokens[0], TokenKind::KwInt));
        assert!(matches!(tokens[1], TokenKind::KwIf));
        assert!(matches!(tokens[2], TokenKind::KwElse));
        assert!(matches!(tokens[3], TokenKind::KwWhile));
        assert!(matches!(tokens[4], TokenKind::KwFor));
        assert!(matches!(tokens[5], TokenKind::KwReturn));
        assert!(matches!(tokens[6], TokenKind::KwStruct));

        // The last token is an ordinary identifier.
        match tokens[7] {
            TokenKind::Ident(id) => assert_eq!(interner.get(id), "foo"),
            _ => panic!("Expected Ident for 'foo'"),
        }
    }

    #[test]
    fn test_numbers() {
        let expected = vec![
            TokenKind::IntLit(42),
            TokenKind::IntLit(0x1F),
            TokenKind::IntLit(0b101),
            TokenKind::IntLit(0o777),
            TokenKind::FloatLit(3.14),
            TokenKind::FloatLit(1e10),
        ];
        assert_eq!(lex("42 0x1F 0b101 0777 3.14 1e10"), expected);
    }

    #[test]
    fn test_strings() {
        let expected = vec![
            TokenKind::StringLit(b"hello".to_vec()),
            TokenKind::StringLit(b"world\n".to_vec()),
        ];
        assert_eq!(lex(r#""hello" "world\n""#), expected);
    }

    #[test]
    fn test_comments() {
        let mut interner = StringInterner::new();
        let mut lexer = Lexer::new(
            b"// line comment\n42 /* block */ 100",
            FileId::default(),
            &mut interner,
        );

        // A line comment is followed by a newline, so the comment ends up
        // attached to the newline token.
        let first = lexer.next_token().unwrap();
        assert_eq!(first.kind, TokenKind::Newline);
        assert_eq!(first.leading_comments.len(), 1);
        assert_eq!(first.leading_comments[0].kind, CommentKind::Line);

        let second = lexer.next_token().unwrap();
        assert_eq!(second.kind, TokenKind::IntLit(42));

        // The block comment is attached to the token that follows it.
        let third = lexer.next_token().unwrap();
        assert_eq!(third.kind, TokenKind::IntLit(100));
        assert_eq!(third.leading_comments.len(), 1);
        assert_eq!(third.leading_comments[0].kind, CommentKind::Block);
    }

    #[test]
    fn test_ellipsis() {
        assert_eq!(lex("..."), vec![TokenKind::Ellipsis]);
    }
}
libperl_macrogen/lexer.rs

libperl_macrogen/
lexer.rs