oak_csv/lexer/
mod.rs

1use crate::{kind::CsvSyntaxKind, language::CsvLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, SourceText, lexer::LexOutput, source::Source};
3
/// Shorthand for the lexer state over a borrowed source text.
type State<'input> = LexerState<&'input SourceText, CsvLanguage>;
5
/// CSV lexer with a configurable field separator and quote character.
pub struct CsvLexer {
    // Character that separates fields; ',' by default.
    field_separator: char,
    // Character that delimits quoted fields; '"' by default.
    quote_char: char,
}
10
11impl CsvLexer {
12    pub fn new(_config: CsvLanguage) -> Self {
13        Self { field_separator: ',', quote_char: '"' }
14    }
15
16    pub fn with_separator(mut self, separator: char) -> Self {
17        self.field_separator = separator;
18        self
19    }
20
21    pub fn with_quote_char(mut self, quote: char) -> Self {
22        self.quote_char = quote;
23        self
24    }
25
26    /// 跳过空白字符
27    fn skip_whitespace(&self, state: &mut State<'_>) -> bool {
28        let start_pos = state.get_position();
29        let mut found_whitespace = false;
30
31        while let Some(ch) = state.peek() {
32            if ch == ' ' || ch == '\t' {
33                state.advance(ch.len_utf8());
34                found_whitespace = true;
35            }
36            else {
37                break;
38            }
39        }
40
41        if found_whitespace {
42            state.add_token(CsvSyntaxKind::Whitespace, start_pos, state.get_position());
43            true
44        }
45        else {
46            false
47        }
48    }
49
50    /// 处理换行
51    fn lex_newline(&self, state: &mut State<'_>) -> bool {
52        let start_pos = state.get_position();
53
54        if let Some(ch) = state.peek() {
55            if ch == '\r' {
56                state.advance(1);
57                // 检查是否是 CRLF
58                if state.peek() == Some('\n') {
59                    state.advance(1);
60                }
61                state.add_token(CsvSyntaxKind::Newline, start_pos, state.get_position());
62                true
63            }
64            else if ch == '\n' {
65                state.advance(1);
66                state.add_token(CsvSyntaxKind::Newline, start_pos, state.get_position());
67                true
68            }
69            else {
70                false
71            }
72        }
73        else {
74            false
75        }
76    }
77
78    /// 处理带引号的字段
79    fn lex_quoted_field(&self, state: &mut State<'_>) -> bool {
80        let start_pos = state.get_position();
81
82        if let Some(ch) = state.peek() {
83            if ch == self.quote_char {
84                state.advance(ch.len_utf8()); // 跳过开始引
85                while let Some(ch) = state.peek() {
86                    if ch == self.quote_char {
87                        state.advance(ch.len_utf8());
88                        // 检查是否是转义引号(双引号
89                        if state.peek() == Some(self.quote_char) {
90                            state.advance(self.quote_char.len_utf8()); // 跳过转义引号
91                        }
92                        else {
93                            // 结束引号
94                            break;
95                        }
96                    }
97                    else {
98                        state.advance(ch.len_utf8());
99                    }
100                }
101
102                state.add_token(CsvSyntaxKind::QuotedField, start_pos, state.get_position());
103                true
104            }
105            else {
106                false
107            }
108        }
109        else {
110            false
111        }
112    }
113
114    /// 处理不带引号的字
115    fn lex_unquoted_field(&self, state: &mut State<'_>) -> bool {
116        let start_pos = state.get_position();
117        let mut found_content = false;
118
119        while let Some(ch) = state.peek() {
120            if ch == self.field_separator || ch == '\n' || ch == '\r' {
121                break;
122            }
123            state.advance(ch.len_utf8());
124            found_content = true;
125        }
126
127        if found_content {
128            state.add_token(CsvSyntaxKind::UnquotedField, start_pos, state.get_position());
129            true
130        }
131        else {
132            false
133        }
134    }
135
136    /// 处理字段分隔符(逗号
137    fn lex_comma(&self, state: &mut State<'_>) -> bool {
138        let start_pos = state.get_position();
139
140        if let Some(ch) = state.peek() {
141            if ch == self.field_separator {
142                state.advance(ch.len_utf8());
143                state.add_token(CsvSyntaxKind::Comma, start_pos, state.get_position());
144                true
145            }
146            else {
147                false
148            }
149        }
150        else {
151            false
152        }
153    }
154}
155
156impl Lexer<CsvLanguage> for CsvLexer {
157    fn lex(&self, source: impl Source) -> LexOutput<CsvLanguage> {
158        let source_text = SourceText::new(source.get_text_in((0..source.length()).into()));
159        let mut state = LexerState::new(&source_text);
160
161        while state.not_at_end() {
162            // 尝试各种词法规则
163            if self.skip_whitespace(&mut state) {
164                continue;
165            }
166
167            if self.lex_newline(&mut state) {
168                continue;
169            }
170
171            if self.lex_comma(&mut state) {
172                continue;
173            }
174
175            if self.lex_quoted_field(&mut state) {
176                continue;
177            }
178
179            if self.lex_unquoted_field(&mut state) {
180                continue;
181            }
182
183            // 如果所有规则都不匹配,跳过当前字符并标记为错误
184            let start_pos = state.get_position();
185            if let Some(ch) = state.peek() {
186                state.advance(ch.len_utf8());
187                state.add_token(CsvSyntaxKind::Error, start_pos, state.get_position());
188            }
189        }
190
191        // 添加 EOF kind
192        let eof_pos = state.get_position();
193        state.add_token(CsvSyntaxKind::Eof, eof_pos, eof_pos);
194
195        state.finish(Ok(()))
196    }
197
198    fn lex_incremental(
199        &self,
200        source: impl Source,
201        _changed: usize,
202        _cache: IncrementalCache<CsvLanguage>,
203    ) -> LexOutput<CsvLanguage> {
204        self.lex(source)
205    }
206}