
oak_csv/lexer/mod.rs

#![doc = include_str!("readme.md")]
pub mod token_type;
use crate::language::CsvLanguage;
use oak_core::{Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
pub use token_type::CsvTokenType;

type State<'a, S> = LexerState<'a, S, CsvLanguage>;

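/// A CSV lexer parameterised by a field separator and a quote character
/// (defaulting to `,` and `"`).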
#[derive(Clone)]
pub struct CsvLexer<'config> {
    _config: &'config CsvLanguage,
    field_separator: char,
    quote_char: char,
}

impl<'config> Lexer<CsvLanguage> for CsvLexer<'config> {
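    /// Lexes `text` into CSV tokens; the token stream is finished through the provided `cache`.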
    fn lex<'a, S: Source + ?Sized>(
        &self,
        text: &'a S,
        _edits: &[oak_core::source::TextEdit],
        cache: &'a mut impl oak_core::LexerCache<CsvLanguage>,
    ) -> LexOutput<CsvLanguage> {
        let mut state = State::new(text);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof()
        }
        state.finish_with_cache(result, cache)
    }
}

impl<'config> CsvLexer<'config> {
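    /// Creates a lexer with the default separator `,` and quote character `"`.
    /// Both can be changed with the builder methods below, e.g.
    /// `CsvLexer::new(&language).with_separator(';')` for semicolon-separated
    /// input, where `language` is any `CsvLanguage` value.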
    pub fn new(config: &'config CsvLanguage) -> Self {
        Self { _config: config, field_separator: ',', quote_char: '"' }
    }

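    /// Sets the field separator used by the lexer.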
    pub fn with_separator(mut self, separator: char) -> Self {
        self.field_separator = separator;
        self
    }

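    /// Sets the quote character used for quoted fields.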
    pub fn with_quote_char(mut self, quote: char) -> Self {
        self.quote_char = quote;
        self
    }

    /// Skips spaces and tabs, emitting a `Whitespace` token if any were found.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();
        let mut found_whitespace = false;

        while let Some(ch) = state.peek() {
            if ch == ' ' || ch == '\t' {
                state.advance(ch.len_utf8());
                found_whitespace = true
            }
            else {
                break;
            }
        }

        if found_whitespace {
            state.add_token(CsvTokenType::Whitespace, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Lexes a newline (`\n`, `\r`, or `\r\n`).
    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch == '\r' {
                state.advance(1);
                // Check whether this is a CRLF sequence
                if state.peek() == Some('\n') {
                    state.advance(1)
                }
                state.add_token(CsvTokenType::Newline, start_pos, state.get_position());
                true
            }
            else if ch == '\n' {
                state.advance(1);
                state.add_token(CsvTokenType::Newline, start_pos, state.get_position());
                true
            }
            else {
                false
            }
        }
        else {
            false
        }
    }

    /// Lexes a quoted field; a doubled quote character is treated as an escape.
    fn lex_quoted_field<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch == self.quote_char {
                state.advance(ch.len_utf8()); // skip the opening quote
                while let Some(ch) = state.peek() {
                    if ch == self.quote_char {
                        state.advance(ch.len_utf8());
                        // Check for an escaped quote (a doubled quote character)
                        if state.peek() == Some(self.quote_char) {
                            state.advance(self.quote_char.len_utf8()); // skip the escaped quote
                        }
                        else {
                            // closing quote
                            break;
                        }
                    }
                    else {
                        state.advance(ch.len_utf8())
                    }
                }
                state.add_token(CsvTokenType::Field, start_pos, state.get_position());
                true
            }
            else {
                false
            }
        }
        else {
            false
        }
    }

    /// Lexes an unquoted field, stopping at the separator or a line break.
    fn lex_unquoted_field<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();
        let mut found_char = false;

        while let Some(ch) = state.peek() {
            if ch == self.field_separator || ch == '\n' || ch == '\r' {
                break;
            }
            else {
                state.advance(ch.len_utf8());
                found_char = true
            }
        }

        if found_char {
            state.add_token(CsvTokenType::Field, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Lexes the field separator, emitting a `Comma` token.
    fn lex_comma<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch == self.field_separator {
                state.advance(ch.len_utf8());
                state.add_token(CsvTokenType::Comma, start_pos, state.get_position());
                true
            }
            else {
                false
            }
        }
        else {
            false
        }
    }

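    /// The main lexing loop: tries each rule in order and records any
    /// unmatched character as an `Error` token.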
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            // Try each lexical rule in turn
            if self.skip_whitespace(state) {
                continue;
            }

            if self.lex_newline(state) {
                continue;
            }

            if self.lex_comma(state) {
                continue;
            }

            if self.lex_quoted_field(state) {
                continue;
            }

            if self.lex_unquoted_field(state) {
                continue;
            }

            // If no rule matched, skip the current character and mark it as an error
            let start_pos = state.get_position();
            if let Some(ch) = state.peek() {
                state.advance(ch.len_utf8());
                state.add_token(CsvTokenType::Error, start_pos, state.get_position())
            }
        }
        Ok(())
    }
}