1#![doc = include_str!("readme.md")]
2pub mod token_type;
3use crate::language::CsvLanguage;
4use oak_core::{Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
5pub use token_type::CsvTokenType;
6
7type State<'a, S> = LexerState<'a, S, CsvLanguage>;
8
9#[derive(Clone)]
10pub struct CsvLexer<'config> {
11 _config: &'config CsvLanguage,
12 field_separator: char,
13 quote_char: char,
14}
15
16impl<'config> Lexer<CsvLanguage> for CsvLexer<'config> {
17 fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl oak_core::LexerCache<CsvLanguage>) -> LexOutput<CsvLanguage> {
18 let mut state = State::new(text);
19 let result = self.run(&mut state);
20 if result.is_ok() {
21 state.add_eof()
22 }
23 state.finish_with_cache(result, cache)
24 }
25}
26
27impl<'config> CsvLexer<'config> {
28 pub fn new(config: &'config CsvLanguage) -> Self {
29 Self { _config: config, field_separator: ',', quote_char: '"' }
30 }
31
32 pub fn with_separator(mut self, separator: char) -> Self {
33 self.field_separator = separator;
34 self
35 }
36
37 pub fn with_quote_char(mut self, quote: char) -> Self {
38 self.quote_char = quote;
39 self
40 }
41
42 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
44 let start_pos = state.get_position();
45 let mut found_whitespace = false;
46
47 while let Some(ch) = state.peek() {
48 if ch == ' ' || ch == '\t' {
49 state.advance(ch.len_utf8());
50 found_whitespace = true
51 }
52 else {
53 break;
54 }
55 }
56
57 if found_whitespace {
58 state.add_token(CsvTokenType::Whitespace, start_pos, state.get_position());
59 true
60 }
61 else {
62 false
63 }
64 }
65
66 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
68 let start_pos = state.get_position();
69
70 if let Some(ch) = state.peek() {
71 if ch == '\r' {
72 state.advance(1);
73 if state.peek() == Some('\n') {
75 state.advance(1)
76 }
77 state.add_token(CsvTokenType::Newline, start_pos, state.get_position());
78 true
79 }
80 else if ch == '\n' {
81 state.advance(1);
82 state.add_token(CsvTokenType::Newline, start_pos, state.get_position());
83 true
84 }
85 else {
86 false
87 }
88 }
89 else {
90 false
91 }
92 }
93
94 fn lex_quoted_field<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
96 let start_pos = state.get_position();
97
98 if let Some(ch) = state.peek() {
99 if ch == self.quote_char {
100 state.advance(ch.len_utf8()); while let Some(ch) = state.peek() {
102 if ch == self.quote_char {
103 state.advance(ch.len_utf8());
104 if state.peek() == Some(self.quote_char) {
106 state.advance(self.quote_char.len_utf8()); }
108 else {
109 break;
111 }
112 }
113 else {
114 state.advance(ch.len_utf8())
115 }
116 }
117 state.add_token(CsvTokenType::Field, start_pos, state.get_position());
118 true
119 }
120 else {
121 false
122 }
123 }
124 else {
125 false
126 }
127 }
128
129 fn lex_unquoted_field<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
131 let start_pos = state.get_position();
132 let mut found_char = false;
133
134 while let Some(ch) = state.peek() {
135 if ch == self.field_separator || ch == '\n' || ch == '\r' {
136 break;
137 }
138 else {
139 state.advance(ch.len_utf8());
140 found_char = true
141 }
142 }
143
144 if found_char {
145 state.add_token(CsvTokenType::Field, start_pos, state.get_position());
146 true
147 }
148 else {
149 false
150 }
151 }
152
153 fn lex_comma<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
155 let start_pos = state.get_position();
156
157 if let Some(ch) = state.peek() {
158 if ch == self.field_separator {
159 state.advance(ch.len_utf8());
160 state.add_token(CsvTokenType::Comma, start_pos, state.get_position());
161 true
162 }
163 else {
164 false
165 }
166 }
167 else {
168 false
169 }
170 }
171
172 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
173 while state.not_at_end() {
174 if self.skip_whitespace(state) {
176 continue;
177 }
178
179 if self.lex_newline(state) {
180 continue;
181 }
182
183 if self.lex_comma(state) {
184 continue;
185 }
186
187 if self.lex_quoted_field(state) {
188 continue;
189 }
190
191 if self.lex_unquoted_field(state) {
192 continue;
193 }
194
195 let start_pos = state.get_position();
197 if let Some(ch) = state.peek() {
198 state.advance(ch.len_utf8());
199 state.add_token(CsvTokenType::Error, start_pos, state.get_position())
200 }
201 }
202 Ok(())
203 }
204}