oak_csv/lexer/mod.rs

pub mod token_type;
use crate::language::CsvLanguage;
use oak_core::{Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
pub use token_type::CsvTokenType;

type State<'a, S> = LexerState<'a, S, CsvLanguage>;

#[derive(Clone)]
pub struct CsvLexer<'config> {
    _config: &'config CsvLanguage,
    field_separator: char,
    quote_char: char,
}

impl<'config> Lexer<CsvLanguage> for CsvLexer<'config> {
    fn lex<'a, S: Source + ?Sized>(
        &self,
        text: &'a S,
        _edits: &[oak_core::source::TextEdit],
        cache: &'a mut impl oak_core::LexerCache<CsvLanguage>,
    ) -> LexOutput<CsvLanguage> {
        let mut state = State::new(text);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}

impl<'config> CsvLexer<'config> {
    pub fn new(config: &'config CsvLanguage) -> Self {
        Self { _config: config, field_separator: ',', quote_char: '"' }
    }

    pub fn with_separator(mut self, separator: char) -> Self {
        self.field_separator = separator;
        self
    }

    pub fn with_quote_char(mut self, quote: char) -> Self {
        self.quote_char = quote;
        self
    }

    /// Skip whitespace characters
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();
        let mut found_whitespace = false;

        while let Some(ch) = state.peek() {
            if ch == ' ' || ch == '\t' {
                state.advance(ch.len_utf8());
                found_whitespace = true;
            } else {
                break;
            }
        }

        if found_whitespace {
            state.add_token(CsvTokenType::Whitespace, start_pos, state.get_position());
            true
        } else {
            false
        }
    }

    /// Lex a newline (handles LF, CR, and CRLF)
    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch == '\r' {
                state.advance(1);
                // Check for a CRLF sequence
                if state.peek() == Some('\n') {
                    state.advance(1);
                }
                state.add_token(CsvTokenType::Newline, start_pos, state.get_position());
                true
            } else if ch == '\n' {
                state.advance(1);
                state.add_token(CsvTokenType::Newline, start_pos, state.get_position());
                true
            } else {
                false
            }
        } else {
            false
        }
    }

    /// Lex a quoted field
    fn lex_quoted_field<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch == self.quote_char {
                state.advance(ch.len_utf8()); // Skip the opening quote
                while let Some(ch) = state.peek() {
                    if ch == self.quote_char {
                        state.advance(ch.len_utf8());
                        // Check for an escaped quote (doubled quote character)
                        if state.peek() == Some(self.quote_char) {
                            state.advance(self.quote_char.len_utf8()); // Skip the escaped quote
                        } else {
                            // Closing quote
                            break;
                        }
                    } else {
                        state.advance(ch.len_utf8());
                    }
                }
                state.add_token(CsvTokenType::Field, start_pos, state.get_position());
                true
            } else {
                false
            }
        } else {
            false
        }
    }

    /// Lex an unquoted field
    fn lex_unquoted_field<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();
        let mut found_char = false;

        while let Some(ch) = state.peek() {
            if ch == self.field_separator || ch == '\n' || ch == '\r' {
                break;
            } else {
                state.advance(ch.len_utf8());
                found_char = true;
            }
        }

        if found_char {
            state.add_token(CsvTokenType::Field, start_pos, state.get_position());
            true
        } else {
            false
        }
    }

    /// Lex the field separator (comma by default)
    fn lex_comma<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch == self.field_separator {
                state.advance(ch.len_utf8());
                state.add_token(CsvTokenType::Comma, start_pos, state.get_position());
                true
            } else {
                false
            }
        } else {
            false
        }
    }

    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            // Try each lexing rule in turn
            if self.skip_whitespace(state) {
                continue;
            }

            if self.lex_newline(state) {
                continue;
            }

            if self.lex_comma(state) {
                continue;
            }

            if self.lex_quoted_field(state) {
                continue;
            }

            if self.lex_unquoted_field(state) {
                continue;
            }

            // If no rule matched, skip the current character and mark it as an error
            let start_pos = state.get_position();
            if let Some(ch) = state.peek() {
                state.advance(ch.len_utf8());
                state.add_token(CsvTokenType::Error, start_pos, state.get_position());
            }
        }
        Ok(())
    }
}
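
// A minimal usage sketch, assuming `CsvLanguage` can be constructed via `Default`
// (an assumption; the actual constructor may differ). It shows how the builder
// methods above configure a semicolon-separated, single-quoted dialect:
//
//     let language = CsvLanguage::default();
//     let lexer = CsvLexer::new(&language)
//         .with_separator(';')
//         .with_quote_char('\'');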