1use crate::{kind::CsvSyntaxKind, language::CsvLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, SourceText, lexer::LexOutput, source::Source};
3
4type State<'input> = LexerState<&'input SourceText, CsvLanguage>;
5
/// Lexer for delimiter-separated text (CSV and similar formats).
///
/// Both delimiters are configurable per instance, so the same lexer can be used for
/// variants such as semicolon- or tab-separated files.
#[derive(Debug, Clone)]
pub struct CsvLexer {
    /// Character that separates two fields on the same record line.
    field_separator: char,
    /// Character that opens and closes a quoted field.
    quote_char: char,
}
10
11impl CsvLexer {
12 pub fn new(_config: CsvLanguage) -> Self {
13 Self { field_separator: ',', quote_char: '"' }
14 }
15
16 pub fn with_separator(mut self, separator: char) -> Self {
17 self.field_separator = separator;
18 self
19 }
20
21 pub fn with_quote_char(mut self, quote: char) -> Self {
22 self.quote_char = quote;
23 self
24 }
25
26 fn skip_whitespace(&self, state: &mut State<'_>) -> bool {
28 let start_pos = state.get_position();
29 let mut found_whitespace = false;
30
31 while let Some(ch) = state.peek() {
32 if ch == ' ' || ch == '\t' {
33 state.advance(ch.len_utf8());
34 found_whitespace = true;
35 }
36 else {
37 break;
38 }
39 }
40
41 if found_whitespace {
42 state.add_token(CsvSyntaxKind::Whitespace, start_pos, state.get_position());
43 true
44 }
45 else {
46 false
47 }
48 }
49
50 fn lex_newline(&self, state: &mut State<'_>) -> bool {
52 let start_pos = state.get_position();
53
54 if let Some(ch) = state.peek() {
55 if ch == '\r' {
56 state.advance(1);
57 if state.peek() == Some('\n') {
59 state.advance(1);
60 }
61 state.add_token(CsvSyntaxKind::Newline, start_pos, state.get_position());
62 true
63 }
64 else if ch == '\n' {
65 state.advance(1);
66 state.add_token(CsvSyntaxKind::Newline, start_pos, state.get_position());
67 true
68 }
69 else {
70 false
71 }
72 }
73 else {
74 false
75 }
76 }
77
78 fn lex_quoted_field(&self, state: &mut State<'_>) -> bool {
80 let start_pos = state.get_position();
81
82 if let Some(ch) = state.peek() {
83 if ch == self.quote_char {
84 state.advance(ch.len_utf8()); while let Some(ch) = state.peek() {
86 if ch == self.quote_char {
87 state.advance(ch.len_utf8());
88 if state.peek() == Some(self.quote_char) {
90 state.advance(self.quote_char.len_utf8()); }
92 else {
93 break;
95 }
96 }
97 else {
98 state.advance(ch.len_utf8());
99 }
100 }
101
102 state.add_token(CsvSyntaxKind::QuotedField, start_pos, state.get_position());
103 true
104 }
105 else {
106 false
107 }
108 }
109 else {
110 false
111 }
112 }
113
114 fn lex_unquoted_field(&self, state: &mut State<'_>) -> bool {
116 let start_pos = state.get_position();
117 let mut found_content = false;
118
119 while let Some(ch) = state.peek() {
120 if ch == self.field_separator || ch == '\n' || ch == '\r' {
121 break;
122 }
123 state.advance(ch.len_utf8());
124 found_content = true;
125 }
126
127 if found_content {
128 state.add_token(CsvSyntaxKind::UnquotedField, start_pos, state.get_position());
129 true
130 }
131 else {
132 false
133 }
134 }
135
136 fn lex_comma(&self, state: &mut State<'_>) -> bool {
138 let start_pos = state.get_position();
139
140 if let Some(ch) = state.peek() {
141 if ch == self.field_separator {
142 state.advance(ch.len_utf8());
143 state.add_token(CsvSyntaxKind::Comma, start_pos, state.get_position());
144 true
145 }
146 else {
147 false
148 }
149 }
150 else {
151 false
152 }
153 }
154}
155
156impl Lexer<CsvLanguage> for CsvLexer {
157 fn lex(&self, source: impl Source) -> LexOutput<CsvLanguage> {
158 let source_text = SourceText::new(source.get_text_in((0..source.length()).into()));
159 let mut state = LexerState::new(&source_text);
160
161 while state.not_at_end() {
162 if self.skip_whitespace(&mut state) {
164 continue;
165 }
166
167 if self.lex_newline(&mut state) {
168 continue;
169 }
170
171 if self.lex_comma(&mut state) {
172 continue;
173 }
174
175 if self.lex_quoted_field(&mut state) {
176 continue;
177 }
178
179 if self.lex_unquoted_field(&mut state) {
180 continue;
181 }
182
183 let start_pos = state.get_position();
185 if let Some(ch) = state.peek() {
186 state.advance(ch.len_utf8());
187 state.add_token(CsvSyntaxKind::Error, start_pos, state.get_position());
188 }
189 }
190
191 let eof_pos = state.get_position();
193 state.add_token(CsvSyntaxKind::Eof, eof_pos, eof_pos);
194
195 state.finish(Ok(()))
196 }
197
198 fn lex_incremental(
199 &self,
200 source: impl Source,
201 _changed: usize,
202 _cache: IncrementalCache<CsvLanguage>,
203 ) -> LexOutput<CsvLanguage> {
204 self.lex(source)
205 }
206}