// irox_tools/util/scanner.rs
1// SPDX-License-Identifier: MIT
2// Copyright 2025 IROX Contributors
3//
4
5//!
6//! A utility to scan for tokens in a byte stream
7//!
8
9extern crate alloc;
10
11use crate::hex;
12use crate::read::Buffer;
13use alloc::borrow::Cow;
14use alloc::collections::VecDeque;
15use std::io::{BufReader, Read};
16
///
/// What characters are considered "quotes" — a token match is suppressed while
/// the scanner is inside a run delimited by one of these characters.
#[derive(Debug, Copy, Clone, Default)]
pub enum QuotedChars {
    /// Matches " or '
    #[default]
    SingleOrDoubleQuotes,
    /// Matches only "
    DoubleQuotes,
    /// Matches only '
    SingleQuotes,
    /// Matches only the specified character
    Other(u8),
}
31
///
/// A token represents a searching string in the input data stream.  If the
/// sequence of bytes `search` is found, then the response will be `response`.
///
/// Can optionally provide an `escape_char`, which will preclude `search` from
/// matching if it is immediately preceded by that character.
///
/// Can optionally provide a `quote_char` to indicate that `search` should be
/// precluded if wrapped in those characters.
#[derive(Clone)]
pub struct Token<T: Clone> {
    // The byte sequence to search for in the stream.
    search: Vec<u8>,
    // The value handed back to the caller when `search` is found.
    response: T,
    // If set, a match is suppressed when immediately preceded by this byte.
    escape_char: Option<u8>,
    // If set, a match is suppressed while inside a run of these quote chars.
    quote_char: Option<QuotedChars>,
}
48
49impl<T: Clone> Token<T> {
50    pub fn new<S: AsRef<[u8]>>(search: S, response: T) -> Self {
51        Token {
52            search: search.as_ref().to_owned(),
53            response,
54            escape_char: None,
55            quote_char: None,
56        }
57    }
58    #[must_use]
59    pub fn with_escape_char(self, escape: u8) -> Self {
60        Token {
61            search: self.search,
62            response: self.response,
63            quote_char: self.quote_char,
64            escape_char: Some(escape),
65        }
66    }
67    #[must_use]
68    pub fn with_quote_char(self, quote_char: QuotedChars) -> Self {
69        Token {
70            search: self.search,
71            response: self.response,
72            quote_char: Some(quote_char),
73            escape_char: self.escape_char,
74        }
75    }
76
77    #[must_use]
78    pub fn get_search(&self) -> &[u8] {
79        self.search.as_ref()
80    }
81
82    #[must_use]
83    pub fn get_response(&self) -> &T {
84        &self.response
85    }
86}
87
///
/// Used as a return type to provide:
/// `Found` if a token was found, which token, and where
/// `EndOfData` the scanner hit the end of the buffer/EOF before finding the token
/// `NotFound` if there is no more data in the buffer
pub enum FoundToken<'a, T: Clone> {
    /// A token matched; `offset` is the number of bytes preceding the match.
    Found { offset: usize, token: &'a Token<T> },
    /// End of data reached; `remaining_length` bytes were scanned without a match.
    EndOfData { remaining_length: usize },
    /// No data remained to scan.
    NotFound,
}
98
///
/// Used as a return type to provide:
/// `Found` if the token was found, which token, and the data preceding it
/// `EndOfData` if the scanner hit EOF and found no token
/// `NotFound` if there is no more data in the buffer
pub enum ReadToken<'a, T: Clone> {
    /// A token matched; `data` is everything read before the match.
    Found { data: Vec<u8>, token: &'a Token<T> },
    /// EOF was reached; `data` is everything read without finding a token.
    EndOfData { data: Vec<u8> },
    /// No data remained to read.
    NotFound,
}
109impl<T: Clone> ReadToken<'_, T> {
110    pub fn get_data(self) -> Option<Vec<u8>> {
111        match self {
112            ReadToken::Found { data, .. } | ReadToken::EndOfData { data, .. } => Some(data),
113            ReadToken::NotFound => None,
114        }
115    }
116    pub fn as_str(&self) -> Cow<'_, str> {
117        match self {
118            ReadToken::Found { data, .. } | ReadToken::EndOfData { data, .. } => {
119                String::from_utf8_lossy(data.as_slice())
120            }
121            ReadToken::NotFound => Default::default(),
122        }
123    }
124}
125
/// Per-token scratch state used while scanning: a sliding window over the most
/// recent bytes plus escape/quote tracking.
struct TokenWorkingMem<'a, T: Clone> {
    // The token this working memory is matching against.
    token: &'a Token<T>,
    // Sliding window of the most recently seen bytes.
    ringbuf: VecDeque<u8>,
    // True when the byte just before the window was the token's escape char.
    found_escape: bool,
    // Holds the opening quote byte while inside a quoted run.
    last_found_quote_char: Option<u8>,
    // Count of bytes that have been pushed out of the front of the window.
    offset: usize,
}
133impl<'a, T: Clone> TokenWorkingMem<'a, T> {
134    pub fn new(token: &'a Token<T>) -> Self {
135        TokenWorkingMem {
136            token,
137            ringbuf: VecDeque::with_capacity(token.search.len()),
138            found_escape: false,
139            last_found_quote_char: None,
140            offset: 0,
141        }
142    }
143    pub fn reset(&mut self) {
144        self.ringbuf.clear();
145        self.found_escape = false;
146        self.last_found_quote_char = None;
147        self.offset = 0;
148    }
149    /// Is there any unfilled capacity?
150    pub fn is_full(&self) -> bool {
151        self.ringbuf.capacity() - self.ringbuf.len() == 0
152    }
153
154    /// Process the new element
155    pub fn push_back(&mut self, elem: u8) {
156        if !self.is_full() {
157            // it's not full yet, just push the new element in and skip out.
158            self.ringbuf.push_back(elem);
159            return;
160        }
161        // we are full, grab the front element, check it for escape and quotes
162        // and then append the new guy.
163        let ret = self.ringbuf.pop_front();
164
165        if let Some(first) = ret {
166            self.offset += 1;
167            if let Some(esc) = self.token.escape_char {
168                self.found_escape = first == esc;
169            }
170
171            if let Some(quoted) = self.token.quote_char {
172                //
173                // the following logic is a little bit complex, because
174                // SingleOrDoubleQuotes can match either ' or ", but we don't
175                // want to match the opposite character, IE, a ' won't terminate
176                // a " run.
177
178                if let Some(last_char) = self.last_found_quote_char {
179                    // if we're searching for the next quote character
180                    if last_char == first {
181                        // and we've found it - clear the flag
182                        self.last_found_quote_char = None;
183                    }
184                } else if match quoted {
185                    // if we're not searching, check against the possible quote
186                    // chars to see if we've found a start
187                    QuotedChars::SingleOrDoubleQuotes => first == b'\'' || first == b'\"',
188                    QuotedChars::DoubleQuotes => first == b'\"',
189                    QuotedChars::SingleQuotes => first == b'\'',
190                    QuotedChars::Other(o) => first == o,
191                } {
192                    // we've found a start, flag it.
193                    self.last_found_quote_char = Some(first);
194                }
195            }
196        }
197
198        self.ringbuf.push_back(elem);
199    }
200
201    /// Have we found the token?
202    pub fn matches(&self) -> bool {
203        if !self.is_full() {
204            return false;
205        }
206        if self.found_escape {
207            // escape character found, we'll never match the token.
208            return false;
209        }
210        if self.last_found_quote_char.is_some() {
211            // we're in a quoted sequence, we'll never match the token.
212            return false;
213        }
214        self.ringbuf.iter().eq(&self.token.search)
215    }
216}
217
///
/// A Scanner is a forward lookahead struct that scans through a stream of
/// data looking for the indicated tokens.
///
/// The amount of possible forward lookahead is specified by the internal buffer
/// size of the [`BufReader`]
pub struct Scanner<T, R>
where
    T: Read + Sized,
    R: Clone,
{
    // Buffered view over the underlying reader; provides the lookahead window.
    reader: Buffer<BufReader<T>>,
    // The set of tokens being searched for.
    tokens: Vec<Token<R>>,
    // When true, zero-length matches between adjacent tokens are skipped.
    skip_empty_data: bool,
}
233
234impl<T: Read + Sized, R: Clone> Scanner<T, R> {
235    ///
236    /// Creates a scanner with the default buffer capacity, 8KB
237    pub fn new(input: T, delimiters: &[Token<R>]) -> Self {
238        Scanner {
239            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
240            tokens: Vec::from(delimiters),
241            skip_empty_data: false,
242        }
243    }
244
245    ///
246    /// Creates a scanner with the specified buffer capacity
247    pub fn with_max_lookahead(input: T, max_buffer: usize, delimiters: &[Token<R>]) -> Self {
248        Scanner {
249            reader: Buffer::new(BufReader::with_capacity(max_buffer, input)),
250            tokens: Vec::from(delimiters),
251            skip_empty_data: false,
252        }
253    }
254
255    ///
256    /// Scans through the buffer, looking for the specified token.  Returns the
257    /// number of bytes in the stream needed to position the cursor to JUST BEFORE
258    /// the token.  I.E., after calling `read_exact(scan_until())`, the next
259    /// call to `read()` will return the token itself.
260    ///
261    /// Returns `Ok(N)` if it found the token in the input stream, or hit the end of the buffer without finding the token
262    /// Returns `Ok(None)` if there are no additional characters to read in the buffer - we've hit EOF.
263    /// Returns `Err(e)` if there's an error reading from the underlying stream
264    pub fn scan_until_next(&mut self) -> Result<FoundToken<'_, R>, std::io::Error> {
265        let mut workingmem: Vec<TokenWorkingMem<R>> =
266            self.tokens.iter().map(TokenWorkingMem::new).collect();
267        let mut num_read = 0;
268        while let Some(char) = self.reader.next() {
269            let mut reset = false;
270            for mem in &mut workingmem {
271                mem.push_back(char);
272
273                if mem.matches() {
274                    if self.skip_empty_data && (self.reader.is_empty() || mem.offset == 0) {
275                        reset = true;
276                        self.reader.consume_read_buffer();
277                        break;
278                    }
279                    return Ok(FoundToken::Found {
280                        offset: mem.offset,
281                        token: mem.token,
282                    });
283                }
284            }
285            if reset {
286                workingmem.iter_mut().for_each(TokenWorkingMem::reset);
287            }
288            num_read += 1;
289        }
290        Ok(FoundToken::EndOfData {
291            remaining_length: num_read,
292        })
293    }
294
295    pub fn read_next(&mut self) -> Result<ReadToken<'_, R>, std::io::Error> {
296        let mut workingmem: Vec<TokenWorkingMem<R>> =
297            self.tokens.iter().map(TokenWorkingMem::new).collect();
298
299        while let Some(char) = self.reader.next() {
300            let mut reset = false;
301            for mem in &mut workingmem {
302                mem.push_back(char);
303                if mem.matches() {
304                    if self.skip_empty_data && (self.reader.is_empty() || mem.offset == 0) {
305                        reset = true;
306                        self.reader.consume_read_buffer();
307                        break;
308                    }
309                    let buf = self.reader.consume_read_buffer();
310                    let mut data: Vec<u8> = buf.into();
311                    data.truncate(mem.offset);
312                    return Ok(ReadToken::Found {
313                        data,
314                        token: mem.token,
315                    });
316                }
317            }
318            if reset {
319                workingmem.iter_mut().for_each(TokenWorkingMem::reset);
320            }
321        }
322        let buf = self.reader.consume_read_buffer();
323        if !buf.is_empty() {
324            let data: Vec<u8> = buf.into();
325            return Ok(ReadToken::EndOfData { data });
326        }
327        Ok(ReadToken::NotFound)
328    }
329
330    pub fn consume(&mut self, len: usize) {
331        self.reader.drain(..len);
332    }
333
334    pub fn take_back(self) -> Buffer<BufReader<T>> {
335        self.reader
336    }
337
338    pub fn skip_empty_data(&mut self) {
339        self.skip_empty_data = true;
340    }
341}
342impl<T: Read + Sized> Scanner<T, LineEnding> {
343    pub fn new_lf(input: T) -> Self {
344        Scanner {
345            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
346            tokens: vec![Token::new("\n", LineEnding::LineFeed)],
347            skip_empty_data: false,
348        }
349    }
350    pub fn new_crlf(input: T) -> Self {
351        Scanner {
352            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
353            tokens: vec![Token::new("\r\n", LineEnding::CarriageReturnLineFeed)],
354            skip_empty_data: false,
355        }
356    }
357    pub fn new_cr(input: T) -> Self {
358        Scanner {
359            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
360            tokens: vec![Token::new("\r", LineEnding::CarriageReturn)],
361            skip_empty_data: false,
362        }
363    }
364}
/// A single whitespace byte recognized by `Scanner::new_whitespace`.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum WhitespaceCharacter {
    /// 0x09, horizontal tab
    Tab,
    /// 0x0A, `\n`
    LineFeed,
    /// 0x0B, vertical tab
    VerticalTab,
    /// 0x0C, form feed
    FormFeed,
    /// 0x0D, `\r`
    CarriageReturn,
    /// 0x20, space
    Space,
    /// 0x85 byte
    NextLine,
    /// 0xA0 byte (non-breaking space)
    NBSP,
}
376impl<T: Read + Sized> Scanner<T, WhitespaceCharacter> {
377    pub fn new_whitespace(input: T) -> Self {
378        Scanner {
379            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
380            skip_empty_data: true,
381            tokens: vec![
382                Token::new(hex!("09"), WhitespaceCharacter::Tab),
383                Token::new(hex!("0A"), WhitespaceCharacter::LineFeed),
384                Token::new(hex!("0B"), WhitespaceCharacter::VerticalTab),
385                Token::new(hex!("0C"), WhitespaceCharacter::FormFeed),
386                Token::new(hex!("0D"), WhitespaceCharacter::CarriageReturn),
387                Token::new(hex!("20"), WhitespaceCharacter::Space),
388                Token::new(hex!("85"), WhitespaceCharacter::NextLine),
389                Token::new(hex!("A0"), WhitespaceCharacter::NBSP),
390            ],
391        }
392    }
393}
394
/// The line-ending convention a line-splitting [`Scanner`] matched on.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum LineEnding {
    /// `\n`
    LineFeed,
    /// `\r\n`
    CarriageReturnLineFeed,
    /// `\r`
    CarriageReturn,
}
401
#[cfg(test)]
mod tests {
    use crate::scanner::*;

    // Simple token classifier used by the space-splitting tests.
    #[derive(Copy, Clone, Eq, PartialEq, Debug)]
    enum Tokens {
        Space,
        Other,
    }

    // Splits a two-line sentence on spaces, checking the byte offset reported
    // before each space; the final iteration exercises `EndOfData`.
    #[test]
    pub fn test_scan_until() -> Result<(), std::io::Error> {
        let data = "this is a basic test\nthis is a second line";

        let delims = &[Token::new(b" ", Tokens::Space)];

        let mut scanner = Scanner::new(data.as_bytes(), delims);

        // Expected distance to each successive space (the "\n" is not a token
        // here, so "test\nthis" scans as one 9-byte word).
        for exp in [4, 2, 1, 5, 9, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { remaining_length } => {
                    assert_eq!(remaining_length, exp);
                }
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // A space immediately preceded by the escape char '\\' must not match, so
    // "escaped\\ test\nthis" scans as a single 20-byte run.
    #[test]
    pub fn test_scan_escaped() -> Result<(), std::io::Error> {
        let data = "this is a basic \"escaped\\ test\nthis\" is a second line";

        let delims = &[Token::new(b" ", Tokens::Space).with_escape_char(b'\\')];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // Spaces inside a double-quoted run are suppressed; the quoted span scans
    // as one 20-byte segment.
    #[test]
    pub fn test_scan_quoted_double() -> Result<(), std::io::Error> {
        let data = "this is a basic \"escaped\\ test\nthis\" is a second line";
        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::DoubleQuotes)];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }
    // Same as above but for single quotes.
    #[test]
    pub fn test_scan_quoted_single() -> Result<(), std::io::Error> {
        let data = "this is a basic \'escaped\\ test\nthis\' is a second line";
        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::SingleQuotes)];

        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // An arbitrary character ('|') can serve as the quote delimiter.
    #[test]
    pub fn test_scan_quoted_other() -> Result<(), std::io::Error> {
        let data = "this is a basic |escaped\\ test\nthis| is a second line";

        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::Other(b'|'))];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // SingleOrDoubleQuotes: a run opened with " is only closed by ", so the
    // embedded ' inside "more' advanced" does not terminate it, and vice versa.
    #[test]
    pub fn test_scan_quoted_both() -> Result<(), std::io::Error> {
        let data = "this is a \"more\' advanced\" \'escaped\\ \"test\nthis\' is a second line";
        let delims =
            &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::SingleOrDoubleQuotes)];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 16, 21, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // Token classifier for the CSV-style multi-token test below.
    #[derive(Copy, Clone, Eq, PartialEq, Debug)]
    enum CSVTokens {
        Field,
        Newline,
    }
    // Exercises multiple simultaneous tokens (field comma vs "\r\n"/"\n"
    // newlines) together with escape and quote suppression on the comma.
    #[test]
    pub fn test_scan_csv() -> Result<(), std::io::Error> {
        let data = "name1,name2,name3,name4\r\nescaped\\,value1,\"quoted,value2\",\'quoted,value3\',\"long value\"\n\n";

        let delims = &[
            Token::new(b",", CSVTokens::Field)
                .with_escape_char(b'\\')
                .with_quote_char(QuotedChars::SingleOrDoubleQuotes),
            Token::new(b"\r\n", CSVTokens::Newline),
            Token::new(b"\n", CSVTokens::Newline),
        ];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        // (expected offset before the token, expected token kind) per step;
        // the final (0, Newline) is the empty line at the end of the data.
        let exp = &[
            (5, CSVTokens::Field),
            (5, CSVTokens::Field),
            (5, CSVTokens::Field),
            (5, CSVTokens::Newline),
            (15, CSVTokens::Field),
            (15, CSVTokens::Field),
            (15, CSVTokens::Field),
            (12, CSVTokens::Newline),
            (0, CSVTokens::Newline),
        ];

        let mut ctr = 0;
        for (exp_off, exp_ret) in exp {
            let to_consume = match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, *exp_off, "{ctr}{:?}", token.response);
                    assert_eq!(token.response, *exp_ret, "{ctr}");
                    token.search.len()
                }
                FoundToken::EndOfData { .. } => {
                    panic!("EOD Not expected {ctr}")
                }
                FoundToken::NotFound => {
                    panic!("None not expected {ctr}")
                }
            };
            // Consume both the preceding data and the token itself.
            let consumed = exp_off + to_consume;
            scanner.consume(consumed);
            ctr += 1;
        }

        Ok(())
    }

    // A multi-byte search token ("test") appearing several times in the data;
    // each hit reports the distance from the current cursor position.
    #[test]
    pub fn test_three_delim() -> Result<(), std::io::Error> {
        let data = "this is a test of the testing test";
        let mut scanner = Scanner::new(data.as_bytes(), &[Token::new("test", "test")]);
        for (exp_off, exp) in &[(10, "test"), (8, "test"), (4, "test")] {
            let to_consume = match scanner.scan_until_next()? {
                FoundToken::Found { offset, token } => {
                    assert_eq!(*exp_off, offset);
                    assert_eq!(*exp, token.response);
                    token.search.len()
                }
                FoundToken::EndOfData { remaining_length } => {
                    assert_eq!(remaining_length, 0);
                    remaining_length
                }
                FoundToken::NotFound => {
                    panic!("Not found");
                }
            };
            scanner.consume(exp_off + to_consume);
        }
        Ok(())
    }
}
639}