http_rest_file/
scanner.rs

1pub use regex::Regex;
2
/// Character based scanner over an input string.
///
/// The input is stored as individual `char`s, so every position handled by the scanner is a
/// character index and not a byte offset.
#[derive(Debug, PartialEq)]
pub struct Scanner {
    /// Index into `characters` of the character the scanner currently points at.
    cursor: usize,
    /// The complete input, split into characters.
    characters: Vec<char>,
}
8
/// Errors reported by the scanner's fallible operations.
#[derive(Debug, PartialEq)]
pub enum ScanError {
    /// The end of the input was reached before the operation could complete.
    EndOfLine,
    /// A regex could not be compiled, or one of its matches could not be converted to a string.
    InvalidRegexCaptureConversion,
}
14
15impl From<regex::Error> for ScanError {
16    fn from(_err: regex::Error) -> ScanError {
17        ScanError::InvalidRegexCaptureConversion
18    }
19}
20
/// A scanner position, expressed as a character index into the scanned input.
///
/// Positions are equal and ordered according to their `cursor` value.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct ScannerPos {
    /// Character index into the input.
    pub cursor: usize,
}

/// Unwraps the raw character index of a position.
impl From<ScannerPos> for usize {
    fn from(ScannerPos { cursor }: ScannerPos) -> Self {
        cursor
    }
}
48
/// Describes where in the scanned input an error occurred.
///
/// `Debug`, `Clone`, `PartialEq` and `Eq` are derived so the context can be logged, embedded in
/// other error types and compared in tests, like the other public types of this module.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ErrorContext {
    /// Snippet of the input text belonging to the error position.
    pub context: String,
    /// Line of the error position.
    pub line: u32,
    /// Column of the error position within its line.
    pub column: u32,
}
54
/// Line oriented view over a character buffer, positioned at a character index.
#[derive(Debug)]
pub struct LineIterator<'a> {
    /// Character index at which the next line to be read starts.
    cursor: usize,
    /// The complete character buffer the iterator reads from.
    characters: &'a [char],
}

impl<'a> LineIterator<'a> {
    /// Collects consecutive lines, starting at the iterator's position, for as long as
    /// `predicate` accepts them. The iterator itself is not moved.
    ///
    /// Only lines terminated by `'\n'` are considered: a trailing line without a terminating
    /// newline is neither passed to `predicate` nor returned. The newline is not part of the
    /// line handed to `predicate`.
    ///
    /// Returns the accepted lines together with the character index at which the first line
    /// that was not accepted starts.
    pub fn take_while_peek<P>(&self, predicate: P) -> (Vec<String>, usize)
    where
        Self: Sized,
        P: Fn(&str) -> bool,
    {
        let mut accepted: Vec<String> = Vec::new();
        let mut line_start = self.cursor;

        // A position beyond the end of the buffer simply yields no lines.
        let remaining = self.characters.get(self.cursor..).unwrap_or(&[]);

        for (offset, ch) in remaining.iter().enumerate() {
            if *ch != '\n' {
                continue;
            }
            let newline_pos = self.cursor + offset;
            let candidate: String = self.characters[line_start..newline_pos].iter().collect();
            if !predicate(&candidate) {
                break;
            }
            accepted.push(candidate);
            line_start = newline_pos + 1;
        }

        (accepted, line_start)
    }
}
93
94impl<'a> Iterator for LineIterator<'a> {
95    type Item = String;
96    fn next(&mut self) -> Option<Self::Item> {
97        let len: usize = self.characters.len();
98        if self.cursor >= len {
99            return None;
100        }
101        let mut peek_cursor: usize = self.cursor;
102        loop {
103            if peek_cursor >= len || self.characters[peek_cursor] == '\n' {
104                let result = self.characters[self.cursor..peek_cursor]
105                    .iter()
106                    .collect::<String>();
107                self.cursor = peek_cursor;
108                return Some(result);
109            }
110            peek_cursor += 1;
111        }
112    }
113}
114
/// Whitespace characters that are not newlines: space, tab, carriage return and form feed.
pub const WS_CHARS: [char; 4] = [' ', '\t', '\r', '\x0C'];
117
118impl Scanner {
119    pub fn new(string: &str) -> Scanner {
120        Scanner {
121            cursor: 0,
122            characters: string.chars().collect(),
123        }
124    }
125
126    pub fn iter_at_pos(&mut self) -> LineIterator {
127        LineIterator {
128            characters: &self.characters[..],
129            cursor: self.cursor,
130        }
131    }
132
133    pub fn set_pos<T: Into<usize>>(&mut self, position: T) {
134        self.cursor = position.into();
135    }
136
137    pub fn get_pos(&self) -> ScannerPos {
138        ScannerPos {
139            cursor: self.cursor,
140        }
141    }
142
143    pub fn get_cursor(&self) -> usize {
144        self.cursor
145    }
146
147    pub fn get_error_context(&self, start_pos: usize, end_pos: Option<usize>) -> ErrorContext {
148        let mut line = 0;
149        let mut last_newline_pos = 0;
150        for (index, char) in self.characters[..start_pos].iter().enumerate() {
151            if char == &'\n' {
152                line += 1;
153                last_newline_pos = index;
154            }
155        }
156
157        let column = start_pos - last_newline_pos;
158
159        let context = if let Some(end_pos) = end_pos {
160            self.characters[start_pos..end_pos]
161                .iter()
162                .collect::<String>()
163        } else {
164            self.characters[last_newline_pos..start_pos]
165                .iter()
166                .collect::<String>()
167        };
168
169        ErrorContext {
170            line,
171            column: column as u32,
172            context,
173        }
174    }
175
176    pub fn get_from_to<S: Into<usize>, E: Into<usize>>(&self, start: S, end: E) -> String {
177        let start: usize = start.into();
178        let end = end.into();
179        if start == end {
180            return String::new();
181        }
182        self.characters[start..end].iter().collect::<String>()
183    }
184
185    // Return the character under the cursor without advancing
186    pub fn peek(&self) -> Option<&char> {
187        self.characters.get(self.cursor)
188    }
189
190    // Return the next n characters or none if not enough characters are present
191    pub fn peek_n(&self, num: usize) -> Option<Vec<char>> {
192        if self.cursor + num > self.characters.len() {
193            return None;
194        }
195        Some(self.characters[self.cursor..(self.cursor + num)].to_vec())
196    }
197
198    // Checks if we scanned the whole file
199    pub fn is_done(&self) -> bool {
200        self.cursor >= self.characters.len()
201    }
202
203    // Get the character under the cursor and advance. None is returned if we are at the end of the
204    // file.
205    pub fn next_char(&mut self) -> Option<&char> {
206        match self.characters.get(self.cursor) {
207            Some(character) => {
208                self.cursor += 1;
209                Some(character)
210            }
211            None => None,
212        }
213    }
214
215    // Advance the cursor if the supplied character matches. Returns true if a match and advance
216    // occurred, false otherwise
217    pub fn take(&mut self, character: &char) -> bool {
218        match self.characters.get(self.cursor) {
219            Some(current) => {
220                if current == character {
221                    self.cursor += 1;
222                    true
223                } else {
224                    false
225                }
226            }
227            None => false,
228        }
229    }
230
231    /// Get the next non whitespace character under the cursor without advancing.
232    /// Following characters are skipped: space, tab, form feed, carriage return
233    #[allow(dead_code)]
234    pub fn peek_skip_ws(&self) -> Option<char> {
235        let mut peek_cursor = self.cursor;
236        // whitespace is regular space, tab, carriage return and form feed
237        loop {
238            if peek_cursor >= self.characters.len() {
239                return None;
240            }
241            let char: char = self.characters[peek_cursor];
242            if !WS_CHARS.iter().any(|ch| *ch == char) {
243                return Some(self.characters[peek_cursor]);
244            }
245            peek_cursor += 1;
246        }
247    }
248
249    /// Skip whitespace characters which are not new lines. Following characters are skipped: space, tab, form feed, carriage return
250    pub fn skip_ws(&mut self) {
251        loop {
252            if self.cursor >= self.characters.len() {
253                return;
254            }
255            let char: char = self.characters[self.cursor];
256
257            if !WS_CHARS.iter().any(|ch| *ch == char) {
258                return;
259            }
260            self.cursor += 1;
261        }
262    }
263
264    // Skip empty lines, lines containing whitespace are not skipped
265    pub fn skip_empty_lines(&mut self) {
266        loop {
267            match self.peek() {
268                Some('\n') => {
269                    self.next_char();
270                }
271                _ => return,
272            }
273        }
274    }
275
276    pub fn skip_empty_lines_and_ws(&mut self) {
277        loop {
278            let pos = self.get_pos();
279            self.skip_empty_lines();
280            self.skip_ws();
281            if self.get_pos().cursor == pos.cursor {
282                break;
283            }
284        }
285    }
286
287    /// Tries to match the given string and if successful moves the cursor to the next
288    /// position after the strings and returns if matched or not
289    /// If cursor is at the end of the file, nothing can be matched and always false will be
290    /// returned.
291    /// matching the empty string "" will always return in a match without moving the cursor
292    /// forward.
293    pub fn match_str_forward(&mut self, str: &str) -> bool {
294        let chars = str.chars().collect::<Vec<char>>();
295        let sequence = chars.as_slice();
296
297        let mut peek_cursor = self.cursor;
298        let mut sequence_cursor = 0;
299        let seq_len = sequence.len();
300        let end_index = self.characters.len();
301
302        let matches_str = loop {
303            if sequence_cursor >= seq_len {
304                break true;
305            }
306            if peek_cursor >= end_index {
307                break false;
308            }
309            let current_char: char = self.characters[peek_cursor];
310            if current_char != sequence[sequence_cursor] {
311                break false;
312            }
313            sequence_cursor += 1;
314            peek_cursor += 1;
315        };
316        if matches_str {
317            self.cursor = peek_cursor;
318        }
319        matches_str
320    }
321
322    pub fn seek_return(&mut self, character: &char) -> Result<String, ScanError> {
323        let start: usize = self.cursor;
324        loop {
325            if self.cursor >= self.characters.len() {
326                return Err(ScanError::EndOfLine);
327            }
328            if self.characters[self.cursor] == *character {
329                let string = self.characters[start..self.cursor].iter().collect();
330                self.cursor += 1;
331                return Ok(string);
332            }
333            self.cursor += 1;
334        }
335    }
336
337    // Tries to match a regex from the current position of the scanner (cursor) forward
338    // if it matches Ok result is returned with a list of matches. If the string contained capture groups then
339    // a list of captured strings is returned, an empty list otherwise if the regex matched but no
340    // capture groups were present. If the regex does not contain a regex that starts at the
341    // beginning of the string then the `^` symbol is added. If a match occurs the cursor is moved
342    // forward. If no match occurs None is returned (no matter if capture groups were provided).
343    pub fn match_regex_forward(
344        &mut self,
345        user_regex_str: &str,
346    ) -> Result<Option<Vec<String>>, ScanError> {
347        if self.cursor >= self.characters.len() {
348            return Err(ScanError::EndOfLine);
349        }
350
351        // we only want to match from the current position forward, therefore add regex start of
352        // string symbol ^
353        let mut regex_str: String = user_regex_str.to_owned();
354        if !regex_str.starts_with('^') {
355            regex_str = format!("^{}", user_regex_str);
356        }
357        let regex = regex::bytes::Regex::new(&regex_str)?;
358
359        let string_tmp = self.characters[self.cursor..].iter().collect::<String>();
360        let bytes = string_tmp.as_bytes();
361        return match regex.captures(bytes) {
362            Some(comment_captures) => {
363                let mut str_captures: Vec<String> = Vec::new();
364
365                for (i, capture) in comment_captures.iter().enumerate() {
366                    // first match is full string
367                    // if we got a match we adjust the cursor otherwise we don't
368                    if i == 0 {
369                        let matched_str = std::str::from_utf8(capture.unwrap().as_bytes()).unwrap();
370                        let num_chars = matched_str.chars().count();
371                        self.cursor += num_chars;
372                    } else {
373                        let capture_bytes: Vec<u8> = capture.unwrap().as_bytes().to_owned();
374                        match String::from_utf8(capture_bytes) {
375                            Ok(string) => {
376                                str_captures.push(string);
377                            }
378                            Err(_) => return Err(ScanError::InvalidRegexCaptureConversion),
379                        }
380                    }
381                }
382                return Ok(Some(str_captures));
383            }
384            None => Ok(None),
385        };
386    }
387
388    /// Get the current line (excluding the new line character) and advance to the next.
389    pub fn get_line_and_advance(&mut self) -> Option<String> {
390        let mut peek_cursor = self.cursor;
391        let num_chars = self.characters.len();
392        if self.is_done() {
393            return None;
394        }
395
396        let line = loop {
397            if peek_cursor >= num_chars || self.characters[peek_cursor] == '\n' {
398                break self.characters[self.cursor..peek_cursor]
399                    .iter()
400                    .collect::<String>();
401            }
402            peek_cursor += 1;
403        };
404
405        // skip \n character
406        if peek_cursor < num_chars {
407            peek_cursor += 1;
408        }
409
410        self.cursor = peek_cursor;
411
412        Some(line)
413    }
414
415    pub fn peek_line(&mut self) -> Option<String> {
416        if self.is_done() {
417            return None;
418        }
419
420        let mut peek_cursor = self.cursor;
421        let len = self.characters.len();
422
423        while peek_cursor < len && self.characters[peek_cursor] != '\n' {
424            peek_cursor += 1;
425        }
426
427        Some(
428            self.characters[self.cursor..peek_cursor]
429                .iter()
430                .collect::<String>(),
431        )
432    }
433
434    pub fn skip_to_next_line(&mut self) {
435        loop {
436            if self.is_done() {
437                return;
438            }
439            if self.characters[self.cursor] == '\n' {
440                self.cursor += 1;
441                return;
442            }
443            self.cursor += 1;
444        }
445    }
446
447    pub fn get_tokens(&self) -> Vec<String> {
448        // @TODO check whitespace
449        self.characters
450            .iter()
451            .collect::<String>()
452            .split_whitespace()
453            .map(|s| s.to_string())
454            .collect()
455    }
456
457    /// Return the previous line's bounds (start and end position)
458    fn get_prev_line_bounds(&self) -> Option<(usize, usize)> {
459        if self.cursor == 0 {
460            return None;
461        }
462        let mut line_end = self.cursor - 1;
463        loop {
464            // no previous line found
465            if line_end == 0 {
466                return None;
467            }
468            // found marker for previous line
469            if self.characters[line_end] == '\n' {
470                break;
471            }
472
473            line_end -= 1;
474        }
475
476        let mut line_start = line_end - 1;
477        loop {
478            if line_start == 0 {
479                break;
480            }
481            if self.characters[line_start] == '\n' {
482                // we found the previous line but move cursor after newline character
483                line_start += 1;
484                break;
485            }
486            line_start -= 1;
487        }
488        if line_start > line_end {
489            line_start = line_end;
490        }
491
492        Some((line_start, line_end))
493    }
494
495    /// Return the previous line without moving the cursor position
496    pub fn get_prev_line(&self) -> Option<String> {
497        let (line_start, line_end) = self.get_prev_line_bounds()?;
498        if line_start == line_end {
499            return Some("".to_string());
500        }
501        return Some(
502            self.characters[line_start..line_end]
503                .iter()
504                .collect::<String>(),
505        );
506    }
507
508    /// Change the position to the start of the new line if exists, otherwise do nothing
509    pub fn step_to_previous_line_start(&mut self) {
510        if let Some((line_start, _)) = self.get_prev_line_bounds() {
511            self.cursor = line_start;
512        }
513    }
514}
515
516// only for debugging
517#[allow(dead_code)]
518#[cfg(debug_assertions)]
519impl Scanner {
520    pub fn debug_string(&self) -> String {
521        let before: String = self.characters[..self.cursor].iter().collect();
522
523        let current: String = self
524            .characters
525            .get(self.cursor)
526            .map_or("".to_string(), |c| c.to_string());
527
528        let after: String = if self.cursor >= self.characters.len() - 1 {
529            String::new()
530        } else {
531            self.characters[self.cursor + 1..].iter().collect()
532        };
533        format!("{}[{}]{}", before, current, after)
534    }
535}
536
537#[cfg(test)]
538mod tests {
539
540    use super::*;
541
542    #[test]
543    pub fn seek_return() {
544        let string = "abc def    ghi\n\n next line";
545        let mut scanner = Scanner::new(string);
546
547        match scanner.seek_return(&'\n') {
548            Ok(result) => {
549                assert_eq!(result, "abc def    ghi");
550                assert_eq!(
551                    scanner.cursor, 15,
552                    "position should be right after new line"
553                );
554            }
555            err => panic!("invalid result: {:?}", err),
556        }
557
558        match scanner.seek_return(&'\n') {
559            Ok(result) => {
560                assert_eq!(result, "");
561                assert_eq!(scanner.cursor, 16);
562            }
563            err => panic!("invalid result: {:?}", err),
564        }
565    }
566
567    #[test]
568    pub fn seek_return_missing() {
569        let string = "abc def    ghi";
570        let mut scanner = Scanner::new(string);
571
572        match scanner.seek_return(&'\n') {
573            Ok(_) => panic!("should not have found missing new line"),
574
575            Err(err) => {
576                assert_eq!(err, ScanError::EndOfLine);
577            }
578        }
579    }
580
581    #[test]
582    pub fn get_line_and_advance() {
583        let string = "First line\n    Next Line  \n";
584        let mut scanner = Scanner::new(string);
585
586        let line = scanner.get_line_and_advance();
587        assert_eq!(line, Some("First line".to_string()));
588        assert_eq!(scanner.cursor, 11);
589
590        let next = scanner.get_line_and_advance();
591        assert_eq!(next, Some("    Next Line  ".to_string()));
592        assert!(scanner.is_done());
593        assert_eq!(scanner.cursor, string.len());
594
595        // at the end, None is returned
596        let next = scanner.get_line_and_advance();
597        assert!(next.is_none());
598        assert!(scanner.is_done());
599        assert!(scanner.cursor == string.len());
600    }
601
602    #[test]
603    pub fn skip_to_next_line() {
604        let string = "First line\nSecond Line\n\n";
605        let mut scanner = Scanner::new(string);
606        assert_eq!(scanner.cursor, 0);
607
608        scanner.skip_to_next_line();
609        assert_eq!(scanner.cursor, 11);
610        scanner.skip_to_next_line();
611        assert_eq!(scanner.cursor, 23);
612        scanner.skip_to_next_line();
613        assert_eq!(scanner.cursor, 24);
614        assert_eq!(scanner.cursor, string.len());
615        assert!(scanner.is_done());
616    }
617
618    #[test]
619    pub fn skip_empty_lines() {
620        let string = "0\n\n\n4";
621        let mut scanner = Scanner::new(string);
622
623        scanner.skip_empty_lines();
624        assert_eq!(scanner.cursor, 0);
625
626        scanner.next_char();
627        assert_eq!(scanner.cursor, 1);
628
629        scanner.skip_empty_lines();
630        assert_eq!(scanner.cursor, 4);
631    }
632
633    #[test]
634    pub fn skip_ws() {
635        let string = "0     \r \t \u{000C}  1";
636        let mut scanner = Scanner::new(string);
637
638        // don't skip non whitespace
639        scanner.skip_ws();
640        assert_eq!(scanner.cursor, 0);
641
642        scanner.next_char();
643        scanner.skip_ws();
644        let last_char = scanner.peek().unwrap();
645        assert_eq!(*last_char, '1');
646    }
647
648    #[test]
649    pub fn match_str_forward() {
650        let string = "012   \nTest line";
651        let mut scanner = Scanner::new(string);
652
653        // don't skip non whitespace
654        assert!(scanner.match_str_forward("012"));
655        assert_eq!(scanner.cursor, 3);
656
657        assert!(scanner.match_str_forward("   \n"));
658        assert_eq!(scanner.cursor, 7);
659
660        assert!(!scanner.match_str_forward("No match"));
661        assert_eq!(scanner.cursor, 7);
662
663        assert!(scanner.match_str_forward("Test line"));
664        assert!(scanner.is_done());
665
666        assert!(!scanner.match_str_forward("No match"));
667
668        assert!(scanner.match_str_forward(""));
669    }
670
671    #[test]
672    pub fn take() {
673        let string = "0 \n";
674        let mut scanner = Scanner::new(string);
675
676        assert_eq!(scanner.cursor, 0);
677        assert!(scanner.take(&'0'));
678        assert_eq!(scanner.cursor, 1);
679        assert!(scanner.take(&' '));
680        assert_eq!(scanner.cursor, 2);
681        assert!(scanner.take(&'\n'));
682        assert_eq!(scanner.cursor, 3);
683        assert!(scanner.is_done());
684        assert!(!scanner.take(&' '));
685    }
686
687    #[test]
688    pub fn peek() {
689        let string = "0 \n";
690        let mut scanner = Scanner::new(string);
691
692        assert_eq!(scanner.peek(), Some(&'0'));
693        assert_eq!(scanner.cursor, 0);
694
695        scanner.next_char();
696        assert_eq!(scanner.peek(), Some(&' '));
697        assert_eq!(scanner.cursor, 1);
698
699        scanner.next_char();
700        assert_eq!(scanner.peek(), Some(&'\n'));
701        assert_eq!(scanner.cursor, 2);
702
703        scanner.next_char();
704
705        // we are at the end
706        assert_eq!(scanner.peek(), None);
707        assert!(scanner.is_done());
708    }
709
710    #[test]
711    pub fn match_regex_forward_only_at_start() {
712        let string = "### 000 123 456 ";
713        let mut scanner = Scanner::new(string);
714
715        // the regex should only match from the beginning of the string and not within
716        // no match should return None
717        let matches = scanner.match_regex_forward("123").unwrap();
718        assert_eq!(matches, None);
719        let mut scanner = Scanner::new(string);
720        let matches = scanner.match_regex_forward("^123").unwrap();
721        assert_eq!(matches, None);
722
723        // here we match the regex but no capture group was provided, so return should be
724        // Ok(Some([]))
725        let mut scanner = Scanner::new(string);
726        let matches = scanner.match_regex_forward("###").unwrap().unwrap();
727        assert_eq!(matches.len(), 0);
728
729        let mut scanner = Scanner::new(string);
730        let matches = scanner.match_regex_forward("^###").unwrap().unwrap();
731        assert_eq!(matches.len(), 0);
732
733        // we match and have a capture group
734        let mut scanner = Scanner::new(string);
735        let matches = scanner
736            .match_regex_forward("### (\\d\\d\\d)")
737            .unwrap()
738            .unwrap();
739        assert_eq!(matches, vec!["000"]);
740
741        // we move the cursor forward and should only match from the current position and not the
742        // start!
743        let mut scanner = Scanner::new(string);
744        scanner.match_str_forward("### ");
745        let matches = scanner.match_regex_forward("###").unwrap();
746        assert_eq!(matches, None);
747        // no matches from the start as the cursor has been moved forward
748        let matches = scanner.match_regex_forward("###").unwrap();
749        assert_eq!(matches, None);
750        // now we match from the current cursor forward
751        let matches = scanner.match_regex_forward("(000)").unwrap().unwrap();
752        assert_eq!(matches, vec!["000"]);
753    }
754
755    #[test]
756    pub fn match_regex_forward_no_captures() {
757        let string = "000 123 456 | abc def ghi | \n\t\r\n end";
758        let mut scanner = Scanner::new(string);
759
760        // we should get ok, and an empty list of matches as we have no capture groups
761        let mut matches = scanner
762            .match_regex_forward("[0-9]{3} [0-9]{3} 456")
763            .unwrap()
764            .unwrap();
765        let empty: Vec<String> = Vec::new();
766        assert_eq!(matches, empty);
767
768        _ = scanner.match_regex_forward(" \\| ");
769
770        matches = scanner
771            .match_regex_forward("(abc) [a-z]{3} (ghi)")
772            .unwrap()
773            .unwrap();
774        assert_eq!(matches, vec!["abc", "ghi"]);
775
776        _ = scanner.match_regex_forward(" \\| ");
777
778        matches = scanner.match_regex_forward("\n(\t\r)\n ").unwrap().unwrap();
779
780        assert_eq!(matches, ["\t\r".to_string()]);
781    }
782
783    #[test]
784    pub fn get_prev_line_bounds() {
785        let string = "abc\ndef\n\n\n";
786        let mut scanner = Scanner::new(string);
787        assert_eq!(scanner.get_prev_line_bounds(), None);
788
789        scanner.skip_to_next_line();
790        assert_eq!(scanner.get_prev_line_bounds(), Some((0, 3)));
791        scanner.skip_to_next_line();
792        assert_eq!(scanner.get_prev_line_bounds(), Some((4, 7)));
793    }
794}