1use std::io::Write;
2
/// Floating-point type used for the numeric value of number literals.
pub type Number = f64;
4
/// Error that aborts a scan.
///
/// Both variants carry two numbers. The scanner fills them with the line
/// number and the character index (within the scanned source) of the token
/// that caused the failure, in that order.
#[derive(Debug, PartialEq)]
pub enum ScanError {
    /// No scanning rule matched the input at the reported position.
    UnknownToken(usize, usize),
    /// The input ended before the current token was complete.
    UnexpectedEof(usize, usize),
}

impl std::fmt::Display for ScanError {
    /// Writes the error as `<first>:<second> : <message>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // One match yields both the position and the message, so the two can
        // never get out of sync when a variant is added.
        let (line, offset, message) = match self {
            ScanError::UnknownToken(line, offset) => (line, offset, "unknown token"),
            ScanError::UnexpectedEof(line, offset) => (line, offset, "unexpected end of file"),
        };
        write!(f, "{}:{} : {}", line, offset, message)
    }
}

// Lets callers box the error as `dyn Error` and propagate it with `?`.
impl std::error::Error for ScanError {}
33
34#[derive(Debug, PartialEq)]
35pub enum TokenType {
36 Symbol(String),
38 Identifier(String),
40 StringLiteral(String),
42 NumberLiteral(String, Number),
44 Keyword(String),
46 Comment(String),
48 Ignore,
50 NewLine,
52 Eof,
53 Unknown,
55}
56
57impl TokenType {
58 pub fn len(&self) -> usize {
59 match self {
60 TokenType::Symbol(s) => s.len(),
61 TokenType::Identifier(s) => s.len(),
62 TokenType::StringLiteral(s) => s.len() + 2,
63 TokenType::Keyword(s) => s.len(),
64 TokenType::NumberLiteral(s, _) => s.len(),
65 TokenType::Comment(s) => s.len(),
66 _ => 0,
67 }
68 }
69}
70
71#[derive(Default)]
72pub struct ScannerData {
73 pub source: Vec<char>,
75 pub token_types: Vec<TokenType>,
77 pub token_lines: Vec<usize>,
79 pub token_start: Vec<usize>,
81 pub token_len: Vec<usize>,
86}
87
88impl ScannerData {
89 pub fn dump(&self, out: &mut dyn Write) {
90 for (i, token) in self.token_types.iter().enumerate() {
91 writeln!(out, "[#{:03} line {}] {:?}", i, self.token_lines[i], *token).ok();
92 }
93 }
94}
95
/// Cursor state of the scanner. All positions are indices into the scanned
/// character vector.
#[derive(Default)]
pub struct Scanner {
    /// Index where the token currently being scanned starts.
    start: usize,
    /// Index of the next character to examine.
    current: usize,
    /// Current line number; set to 1 at the beginning of a scan.
    line: usize,
}
105
/// Language-specific settings that drive the scanner.
pub struct ScannerConfig {
    /// Words reported as keywords when they are not immediately followed by a
    /// letter, digit or `_`.
    pub keywords: &'static [&'static str],
    /// Symbols, tried in the given order; the first one that matches wins.
    pub symbols: &'static [&'static str],
    /// Marker that starts a comment running to the end of the line, if any.
    pub single_line_cmt: Option<&'static str>,
    /// Marker that opens a multi-line comment, if any.
    pub multi_line_cmt_start: Option<&'static str>,
    /// Marker that closes a multi-line comment, if any. Multi-line comments
    /// are only recognised when both markers are set.
    pub multi_line_cmt_end: Option<&'static str>,
}
118
119impl Scanner {
120 pub fn run(
124 &mut self,
125 source: &str,
126 config: &ScannerConfig,
127 data: &mut ScannerData,
128 ) -> Result<(), ScanError> {
129 data.source = source.chars().collect();
130 self.current = 0;
131 self.line = 1;
132 self.start = self.current;
133 let mut exit = false;
134 while !exit {
135 let token = self.scan_token(data, config)?;
136 match token {
137 TokenType::Eof => exit = true,
138 TokenType::Ignore => self.start = self.current,
139 TokenType::NewLine => (),
140 _ => self.add_token(token, data),
141 }
142 }
143 Ok(())
144 }
145 fn add_token(&mut self, token: TokenType, data: &mut ScannerData) {
146 data.token_start.push(self.start);
147 data.token_len.push(self.current - self.start);
148 data.token_types.push(token);
149 data.token_lines.push(self.line);
150 self.start = self.current;
151 }
152 fn scan_token(
153 &mut self,
154 data: &mut ScannerData,
155 config: &ScannerConfig,
156 ) -> Result<TokenType, ScanError> {
157 if self.current >= data.source.len() {
158 return Ok(TokenType::Eof);
159 }
160 if let Some(token) = self.scan_comment(config, data) {
161 return Ok(token);
162 }
163 if let Some(token) = self.scan_newline(data) {
164 return Ok(token);
165 }
166 if let Some(token) = self.scan_space(data) {
167 return Ok(token);
168 }
169 if let Some(token) = self.scan_symbol(data, config) {
170 return Ok(token);
171 }
172 if let Some(token) = self.scan_keyword(data, config) {
173 return Ok(token);
174 }
175 if let Some(token) = self.scan_string(data)? {
176 return Ok(token);
177 }
178 if let Some(token) = self.scan_identifier(data) {
179 return Ok(token);
180 }
181 if let Some(token) = self.scan_number(data) {
182 return Ok(token);
183 }
184 data.token_len.push(1);
185 data.token_start.push(self.current);
186 data.token_types.push(TokenType::Unknown);
187 data.token_lines.push(self.line);
188 let token_id = data.token_len.len() - 1;
189 Err(ScanError::UnknownToken(
190 self.line,
191 data.token_start[token_id],
192 ))
193 }
194 fn scan_comment(
195 &mut self,
196 config: &ScannerConfig,
197 data: &mut ScannerData,
198 ) -> Option<TokenType> {
199 if let Some(multi_start) = config.multi_line_cmt_start {
200 if self.matches(multi_start, data) {
201 if let Some(multi_end) = config.multi_line_cmt_end {
202 return self.scan_multi_line_comment(multi_start, multi_end, data);
203 }
204 }
205 }
206 if let Some(single_start) = config.single_line_cmt {
207 if self.matches(single_start, data) {
208 return self.scan_single_line_comment(data);
209 }
210 }
211 None
212 }
213 fn scan_single_line_comment(&mut self, data: &mut ScannerData) -> Option<TokenType> {
214 let source_len = data.source.len();
215 while self.current < source_len && data.source[self.current] != '\n' {
216 self.current += 1;
217 }
218 let end=self.current;
219 if self.current < source_len {
220 self.current += 1;
221 self.line += 1;
222 }
223 return Some(TokenType::Comment(
224 data.source[self.start..end]
225 .iter()
226 .cloned()
227 .collect::<String>(),
228 ));
229 }
230 fn scan_multi_line_comment(
231 &mut self,
232 multi_start: &str,
233 multi_end: &str,
234 data: &mut ScannerData,
235 ) -> Option<TokenType> {
236 let mut level = 0;
237 let mut in_string = false;
238 let mut escape = false;
239 while self.current < data.source.len() {
240 let c = data.source[self.current];
241 if c == '\n' {
242 self.line += 1;
243 } else if c == '\\' && !escape {
244 escape = true;
245 } else {
246 if c == '\"' && !escape {
247 in_string = !in_string;
248 } else if !in_string {
249 if self.matches(multi_end, data) {
250 level -= 1;
251 self.current += multi_end.len() - 1;
252 if level == 0 {
253 self.current += 1;
254 return Some(TokenType::Comment(
255 data.source[self.start..self.current]
256 .iter()
257 .cloned()
258 .collect::<String>(),
259 ));
260 }
261 } else if self.matches(multi_start, data) {
262 self.current += multi_start.len() - 1;
263 level += 1;
264 }
265 }
266 escape = false;
267 }
268 self.current += 1;
269 }
270 self.add_token(
271 TokenType::Comment(
272 data.source[self.start..self.current - 1]
273 .iter()
274 .cloned()
275 .collect::<String>(),
276 ),
277 data,
278 );
279 Some(TokenType::Eof)
280 }
281 fn scan_number(&mut self, data: &mut ScannerData) -> Option<TokenType> {
282 if is_digit(data.source[self.current]) {
283 let source_len = data.source.len();
284 if self.current < source_len - 2 {
285 if data.source[self.current + 1] == 'x' || data.source[self.current + 1] == 'X' {
286 self.current += 2;
287 return self.scan_hex_number(data);
288 } else if data.source[self.current + 1] == 'b'
289 || data.source[self.current + 1] == 'B'
290 {
291 self.current += 2;
292 return self.scan_binary_number(data);
293 }
294 }
295 let mut number = 0.0;
296 let mut value = String::new();
297 while self.current < source_len && is_digit(data.source[self.current]) {
298 let c = data.source[self.current];
299 value.push(c);
300 number = number * 10.0 + Number::from((c as u8) - b'0');
301 self.current += 1;
302 }
303 if self.current < source_len - 1
304 && data.source[self.current] == '.'
305 && is_digit(data.source[self.current + 1])
306 {
307 self.current += 1;
308 value.push('.');
309 let mut div = 1.0;
310 while self.current < source_len && is_digit(data.source[self.current]) {
311 let c = data.source[self.current];
312 value.push(c);
313 number = number * 10.0 + Number::from((c as u8) - b'0');
314 self.current += 1;
315 div *= 10.0;
316 }
317 number /= div;
318 }
319 return Some(TokenType::NumberLiteral(value, number));
320 }
321 None
322 }
323 fn scan_binary_number(&mut self, data: &mut ScannerData) -> Option<TokenType> {
324 let mut number = 0.0;
325 let mut value = String::new();
326 loop {
327 let c = data.source[self.current];
328 match c {
329 '0' | '1' => {
330 number = number * 2.0 + Number::from((c as u8) - b'0');
331 value.push(c);
332 }
333 _ => break,
334 }
335 self.current += 1;
336 if self.current == data.source.len() {
337 break;
338 }
339 }
340 Some(TokenType::NumberLiteral(format!("0b{}", value), number))
341 }
342 fn scan_hex_number(&mut self, data: &mut ScannerData) -> Option<TokenType> {
343 let mut number = 0.0;
344 let mut value = String::new();
345 loop {
346 let c = data.source[self.current];
347 match c {
348 '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => {
349 number = number * 16.0 + Number::from((c as u8) - b'0');
350 value.push(c);
351 }
352 'a' | 'b' | 'c' | 'd' | 'e' | 'f' => {
353 number = number * 16.0 + Number::from((c as u8) - b'a' + 10);
354 value.push(c);
355 }
356 'A' | 'B' | 'C' | 'D' | 'E' | 'F' => {
357 number = number * 16.0 + Number::from((c as u8) - b'A' + 10);
358 value.push(c);
359 }
360 _ => break,
361 }
362 self.current += 1;
363 if self.current == data.source.len() {
364 break;
365 }
366 }
367 Some(TokenType::NumberLiteral(format!("0x{}", value), number))
368 }
369 fn scan_identifier(&mut self, data: &mut ScannerData) -> Option<TokenType> {
370 if is_alpha(data.source[self.current]) {
371 let mut value = String::new();
372 while self.current < data.source.len() && is_alphanum(data.source[self.current]) {
373 value.push(data.source[self.current]);
374 self.current += 1;
375 }
376 return Some(TokenType::Identifier(value));
377 }
378 None
379 }
380 fn scan_space(&mut self, data: &mut ScannerData) -> Option<TokenType> {
381 let start = self.current;
382 while self.current < data.source.len() && is_space(data.source[self.current]) {
383 self.current += 1;
384 }
385 if start == self.current {
386 return None;
387 }
388 Some(TokenType::Ignore)
389 }
390 fn scan_string(&mut self, data: &mut ScannerData) -> Result<Option<TokenType>, ScanError> {
391 if data.source[self.current] == '\"' {
392 self.current += 1;
393 let mut escape = false;
394 let mut value = String::new();
395 while self.current < data.source.len() {
396 let c = data.source[self.current];
397 if c == '\\' && !escape {
398 escape = true;
399 } else {
400 if c == '\"' && !escape {
401 self.current += 1;
402 return Ok(Some(TokenType::StringLiteral(value)));
403 } else if c == 'n' && escape {
404 value.push('\n');
405 } else if c == 't' && escape {
406 value.push('\t');
407 } else {
408 value.push(c);
409 if c == '\n' {
410 self.line += 1;
411 }
412 }
413 escape = false;
414 }
415 self.current += 1;
416 }
417 data.token_len.push(data.source.len() - self.start + 1);
418 data.token_start.push(self.start);
419 data.token_types.push(TokenType::StringLiteral(value));
420 data.token_lines.push(self.line);
421 let token_id = data.token_len.len() - 1;
422 return Err(ScanError::UnexpectedEof(
423 self.line,
424 data.token_start[token_id],
425 ));
426 }
427 Ok(None)
428 }
429 fn scan_newline(&mut self, data: &ScannerData) -> Option<TokenType> {
430 if data.source[self.current] == '\n' {
431 self.current += 1;
432 self.line += 1;
433 return Some(TokenType::NewLine);
434 }
435 None
436 }
437 fn scan_symbol(&mut self, data: &ScannerData, config: &ScannerConfig) -> Option<TokenType> {
438 for s in config.symbols.iter() {
439 if self.matches(s, data) {
440 self.current += s.len();
441 return Some(TokenType::Symbol((*s).to_owned()));
442 }
443 }
444 None
445 }
446 fn scan_keyword(&mut self, data: &ScannerData, config: &ScannerConfig) -> Option<TokenType> {
447 let source_len = data.source.len();
448 for s in config.keywords.iter() {
449 let keyword_len = s.len();
450 if self.matches(s, data)
451 && (self.current + keyword_len >= source_len
452 || !is_alphanum(data.source[self.current + keyword_len]))
453 {
454 self.current += s.len();
455 return Some(TokenType::Keyword((*s).to_owned()));
456 }
457 }
458 None
459 }
460 fn matches(&self, s: &str, data: &ScannerData) -> bool {
461 let mut check = true;
462 let source_len = data.source.len();
463 for (i, c) in s.chars().enumerate() {
464 if self.current + i >= source_len || data.source[self.current + i] != c {
465 check = false;
466 break;
467 }
468 }
469 check
470 }
471}
472
/// Returns `true` for the ASCII digits `0` through `9`.
fn is_digit(c: char) -> bool {
    c.is_ascii_digit()
}
476
/// Returns `true` for ASCII letters and the underscore.
fn is_alpha(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic()
}
480
/// Returns `true` for ASCII letters, ASCII digits and the underscore.
fn is_alphanum(c: char) -> bool {
    c == '_' || c.is_ascii_alphanumeric()
}
484
/// Returns `true` for a space, a tab or a carriage return. A line feed is
/// not included.
fn is_space(c: char) -> bool {
    match c {
        ' ' | '\t' | '\r' => true,
        _ => false,
    }
}