
// SPDX-License-Identifier: MIT
// Copyright 2023 IROX Contributors

//!
//! A utility to scan for tokens in a byte stream
//!

extern crate alloc;
use alloc::collections::VecDeque;
use std::io::{BufReader, Read};

use crate::read::Buffer;

///
/// What characters are considered "quotes"
#[derive(Debug, Copy, Clone, Default)]
pub enum QuotedChars {
    /// Matches " or '
    #[default]
    SingleOrDoubleQuotes,
    /// Matches only "
    DoubleQuotes,
    /// Matches only '
    SingleQuotes,
    /// Matches only the specified character
    Other(u8),
}

///
/// A token represents a search string in the input data stream.  If the
/// sequence of bytes `search` is found, then the response will be `response`.
///
/// Can optionally provide an `escape_char`, which will prevent `search` from
/// matching if it is immediately preceded by that character.
///
/// Can optionally provide a `quote_char` to indicate that `search` should not
/// match while wrapped in those characters.
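///
/// A minimal construction sketch (illustrative only; assumes this module is
/// reachable as `irox_tools::scanner` and uses an arbitrary `&str` response type):
/// ```
/// use irox_tools::scanner::{QuotedChars, Token};
///
/// // Match a comma, unless it is escaped with '\' or sits inside quotes.
/// let token = Token::new(",", "comma")
///     .with_escape_char(b'\\')
///     .with_quote_char(QuotedChars::SingleOrDoubleQuotes);
/// assert_eq!(token.get_search(), b",");
/// assert_eq!(*token.get_response(), "comma");
/// ```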
#[derive(Clone)]
pub struct Token<T: Clone> {
    search: Vec<u8>,
    response: T,
    escape_char: Option<u8>,
    quote_char: Option<QuotedChars>,
}

impl<T: Clone> Token<T> {
    pub fn new<S: AsRef<[u8]>>(search: S, response: T) -> Self {
        Token {
            search: search.as_ref().to_owned(),
            response,
            escape_char: None,
            quote_char: None,
        }
    }
    #[must_use]
    pub fn with_escape_char(self, escape: u8) -> Self {
        Token {
            search: self.search,
            response: self.response,
            quote_char: self.quote_char,
            escape_char: Some(escape),
        }
    }
    #[must_use]
    pub fn with_quote_char(self, quote_char: QuotedChars) -> Self {
        Token {
            search: self.search,
            response: self.response,
            quote_char: Some(quote_char),
            escape_char: self.escape_char,
        }
    }

    #[must_use]
    pub fn get_search(&self) -> &[u8] {
        self.search.as_ref()
    }

    #[must_use]
    pub fn get_response(&self) -> &T {
        &self.response
    }
}

///
/// Return type of a scan:
/// `Found` if a token was found, which token, and at what offset
/// `EndOfData` if the scanner hit the end of the buffer/EOF before finding a token
/// `NotFound` if there is no more data in the buffer
pub enum FoundToken<'a, T: Clone> {
    Found { offset: usize, token: &'a Token<T> },
    EndOfData { remaining_length: usize },
    NotFound,
}

///
/// Return type of a read:
/// `Found` if a token was found, which token, and the data preceding it
/// `EndOfData` if the scanner hit EOF and found no token
/// `NotFound` if there is no more data in the buffer
pub enum ReadToken<'a, T: Clone> {
    Found { data: Vec<u8>, token: &'a Token<T> },
    EndOfData { data: Vec<u8> },
    NotFound,
}

struct TokenWorkingMem<'a, T: Clone> {
    token: &'a Token<T>,
    ringbuf: VecDeque<u8>,
    found_escape: bool,
    last_found_quote_char: Option<u8>,
    offset: usize,
}
impl<'a, T: Clone> TokenWorkingMem<'a, T> {
    pub fn new(token: &'a Token<T>) -> Self {
        TokenWorkingMem {
            token,
            ringbuf: VecDeque::with_capacity(token.search.len()),
            found_escape: false,
            last_found_quote_char: None,
            offset: 0,
        }
    }

    /// Is the working buffer full (holding `search.len()` bytes)?
    /// Compares against the search length rather than the deque's allocated
    /// capacity, since `VecDeque::with_capacity` may over-allocate.
    pub fn is_full(&self) -> bool {
        self.ringbuf.len() >= self.token.search.len()
    }

    /// Process the new element
    pub fn push_back(&mut self, elem: u8) {
        if !self.is_full() {
            // it's not full yet, just push the new element in and skip out.
            self.ringbuf.push_back(elem);
            return;
        }
        // we are full, grab the front element, check it for escape and quotes
        // and then append the new byte.
        let ret = self.ringbuf.pop_front();

        if let Some(first) = ret {
            self.offset += 1;
            if let Some(esc) = self.token.escape_char {
                self.found_escape = first == esc;
            }

            if let Some(quoted) = self.token.quote_char {
                //
                // the following logic is a little bit complex, because
                // SingleOrDoubleQuotes can match either ' or ", but we don't
                // want to match the opposite character, i.e., a ' won't terminate
                // a " run.

                if let Some(last_char) = self.last_found_quote_char {
                    // if we're searching for the next quote character
                    if last_char == first {
                        // and we've found it - clear the flag
                        self.last_found_quote_char = None;
                    }
                } else if match quoted {
                    // if we're not searching, check against the possible quote
                    // chars to see if we've found a start
                    QuotedChars::SingleOrDoubleQuotes => first == b'\'' || first == b'\"',
                    QuotedChars::DoubleQuotes => first == b'\"',
                    QuotedChars::SingleQuotes => first == b'\'',
                    QuotedChars::Other(o) => first == o,
                } {
                    // we've found a start, flag it.
                    self.last_found_quote_char = Some(first);
                }
            }
        }

        self.ringbuf.push_back(elem);
    }

    /// Have we found the token?
    pub fn matches(&self) -> bool {
        if !self.is_full() {
            return false;
        }
        if self.found_escape {
            // escape character found, we'll never match the token.
            return false;
        }
        if self.last_found_quote_char.is_some() {
            // we're in a quoted sequence, we'll never match the token.
            return false;
        }
        self.ringbuf.iter().eq(&self.token.search)
    }
}

///
/// A Scanner is a forward-lookahead struct that scans through a stream of
/// data looking for the indicated tokens.
///
/// The amount of possible forward lookahead is specified by the internal buffer
/// size of the [`BufReader`].
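///
/// A minimal line-splitting sketch (illustrative only; assumes this module is
/// reachable as `irox_tools::scanner`):
/// ```
/// use irox_tools::scanner::{ReadToken, Scanner};
///
/// let data = "first line\nsecond line\n";
/// // `new_lf` builds a scanner whose only token is "\n".
/// let mut scanner = Scanner::new_lf(data.as_bytes());
/// let mut lines: Vec<Vec<u8>> = Vec::new();
/// loop {
///     match scanner.read_next().expect("read error") {
///         // `data` holds the bytes preceding the token; the token itself is consumed.
///         ReadToken::Found { data, .. } => lines.push(data),
///         ReadToken::EndOfData { data } => {
///             lines.push(data);
///             break;
///         }
///         ReadToken::NotFound => break,
///     }
/// }
/// assert_eq!(lines[0].as_slice(), b"first line");
/// assert_eq!(lines[1].as_slice(), b"second line");
/// ```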
pub struct Scanner<T, R>
where
    T: Read + Sized,
    R: Clone,
{
    reader: Buffer<BufReader<T>>,
    tokens: Vec<Token<R>>,
}

impl<T: Read + Sized, R: Clone> Scanner<T, R> {
    ///
    /// Creates a scanner with the default buffer capacity, 8KB
    pub fn new(input: T, delimiters: &[Token<R>]) -> Self {
        Scanner {
            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
            tokens: Vec::from(delimiters),
        }
    }

    ///
    /// Creates a scanner with the specified buffer capacity
    pub fn with_max_lookahead(input: T, max_buffer: usize, delimiters: &[Token<R>]) -> Self {
        Scanner {
            reader: Buffer::new(BufReader::with_capacity(max_buffer, input)),
            tokens: Vec::from(delimiters),
        }
    }

    ///
    /// Scans through the buffer, looking for any of the configured tokens.  Returns
    /// the number of bytes in the stream needed to position the cursor JUST BEFORE
    /// the token, i.e., after calling `consume(offset)`, the next read will start at
    /// the token itself.
    ///
    /// Returns `Ok(FoundToken::Found { offset, token })` if a token was found in the stream.
    /// Returns `Ok(FoundToken::EndOfData { remaining_length })` if it hit the end of the buffer/EOF without finding a token.
    /// Returns `Err(e)` if there's an error reading from the underlying stream.
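    ///
    /// A minimal scan/consume sketch (illustrative only; assumes this module is
    /// reachable as `irox_tools::scanner`):
    /// ```
    /// use irox_tools::scanner::{FoundToken, Scanner, Token};
    ///
    /// let mut scanner = Scanner::new("key=value".as_bytes(), &[Token::new("=", ())]);
    /// let offset = match scanner.scan_until_next().expect("read error") {
    ///     FoundToken::Found { offset, .. } => offset,
    ///     _ => panic!("expected to find '='"),
    /// };
    /// // "key" is 3 bytes long, so the token starts at offset 3.
    /// assert_eq!(offset, 3);
    /// // skip past the key and the one-byte token itself
    /// scanner.consume(offset + 1);
    /// ```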
    pub fn scan_until_next(&mut self) -> Result<FoundToken<R>, std::io::Error> {
        let mut workingmem: Vec<TokenWorkingMem<R>> =
            self.tokens.iter().map(TokenWorkingMem::new).collect();
        let mut num_read = 0;
        for byte in &mut self.reader {
            for mem in &mut workingmem {
                mem.push_back(byte);

                if mem.matches() {
                    return Ok(FoundToken::Found {
                        offset: mem.offset,
                        token: mem.token,
                    });
                }
            }

            num_read += 1;
        }
        Ok(FoundToken::EndOfData {
            remaining_length: num_read,
        })
    }

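    ///
    /// Scans forward for the next token.  If a token is found, returns
    /// `ReadToken::Found` containing the data preceding the token; the scanned
    /// bytes (including the token itself) are removed from the internal buffer.
    /// Returns `ReadToken::EndOfData` with the remaining data if EOF is hit
    /// without finding a token, or `ReadToken::NotFound` if there is no more
    /// data to read.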
    pub fn read_next(&mut self) -> Result<ReadToken<R>, std::io::Error> {
        let mut workingmem: Vec<TokenWorkingMem<R>> =
            self.tokens.iter().map(TokenWorkingMem::new).collect();
        for byte in &mut self.reader {
            for mem in &mut workingmem {
                mem.push_back(byte);

                if mem.matches() {
                    let buf = self.reader.consume_read_buffer();
                    let mut data: Vec<u8> = buf.into();
                    data.truncate(mem.offset);
                    return Ok(ReadToken::Found {
                        data,
                        token: mem.token,
                    });
                }
            }
        }
        let buf = self.reader.consume_read_buffer();
        if !buf.is_empty() {
            let data: Vec<u8> = buf.into();
            return Ok(ReadToken::EndOfData { data });
        }
        Ok(ReadToken::NotFound)
    }

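    ///
    /// Drops the first `len` bytes from the front of the internal lookahead buffer.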
    pub fn consume(&mut self, len: usize) {
        self.reader.drain(..len);
    }

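    ///
    /// Consumes this scanner, returning the underlying [`Buffer`].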
    pub fn take_back(self) -> Buffer<BufReader<T>> {
        self.reader
    }
}
impl<T: Read + Sized> Scanner<T, LineEnding> {
    /// Creates a scanner that splits on `\n` (line feed) line endings
    pub fn new_lf(input: T) -> Self {
        Scanner {
            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
            tokens: vec![Token::new("\n", LineEnding::LineFeed)],
        }
    }
    /// Creates a scanner that splits on `\r\n` (carriage return + line feed) line endings
    pub fn new_crlf(input: T) -> Self {
        Scanner {
            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
            tokens: vec![Token::new("\r\n", LineEnding::CarriageReturnLineFeed)],
        }
    }
    /// Creates a scanner that splits on `\r` (carriage return) line endings
    pub fn new_cr(input: T) -> Self {
        Scanner {
            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
            tokens: vec![Token::new("\r", LineEnding::CarriageReturn)],
        }
    }
}

/// The line-ending variants recognized by the convenience constructors
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum LineEnding {
    LineFeed,
    CarriageReturnLineFeed,
    CarriageReturn,
}

#[cfg(test)]
mod tests {
    use crate::scanner::*;

    #[derive(Copy, Clone, Eq, PartialEq, Debug)]
    enum Tokens {
        Space,
        Other,
    }

    #[test]
    pub fn test_scan_until() -> Result<(), std::io::Error> {
        let data = "this is a basic test\nthis is a second line";

        let delims = &[Token::new(b" ", Tokens::Space)];

        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 9, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { remaining_length } => {
                    assert_eq!(remaining_length, exp);
                }
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    #[test]
    pub fn test_scan_escaped() -> Result<(), std::io::Error> {
        let data = "this is a basic \"escaped\\ test\nthis\" is a second line";

        let delims = &[Token::new(b" ", Tokens::Space).with_escape_char(b'\\')];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    #[test]
    pub fn test_scan_quoted_double() -> Result<(), std::io::Error> {
        let data = "this is a basic \"escaped\\ test\nthis\" is a second line";
        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::DoubleQuotes)];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }
    #[test]
    pub fn test_scan_quoted_single() -> Result<(), std::io::Error> {
        let data = "this is a basic \'escaped\\ test\nthis\' is a second line";
        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::SingleQuotes)];

        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    #[test]
    pub fn test_scan_quoted_other() -> Result<(), std::io::Error> {
        let data = "this is a basic |escaped\\ test\nthis| is a second line";

        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::Other(b'|'))];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    #[test]
    pub fn test_scan_quoted_both() -> Result<(), std::io::Error> {
        let data = "this is a \"more\' advanced\" \'escaped\\ \"test\nthis\' is a second line";
        let delims =
            &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::SingleOrDoubleQuotes)];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 16, 21, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    #[derive(Copy, Clone, Eq, PartialEq, Debug)]
    enum CSVTokens {
        Field,
        Newline,
    }
    #[test]
    pub fn test_scan_csv() -> Result<(), std::io::Error> {
        let data = "name1,name2,name3,name4\r\nescaped\\,value1,\"quoted,value2\",\'quoted,value3\',\"long value\"\n\n";

        let delims = &[
            Token::new(b",", CSVTokens::Field)
                .with_escape_char(b'\\')
                .with_quote_char(QuotedChars::SingleOrDoubleQuotes),
            Token::new(b"\r\n", CSVTokens::Newline),
            Token::new(b"\n", CSVTokens::Newline),
        ];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        let exp = &[
            (5, CSVTokens::Field),
            (5, CSVTokens::Field),
            (5, CSVTokens::Field),
            (5, CSVTokens::Newline),
            (15, CSVTokens::Field),
            (15, CSVTokens::Field),
            (15, CSVTokens::Field),
            (12, CSVTokens::Newline),
            (0, CSVTokens::Newline),
        ];

        let mut ctr = 0;
        for (exp_off, exp_ret) in exp {
            let to_consume = match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, *exp_off, "{ctr}{:?}", token.response);
                    assert_eq!(token.response, *exp_ret, "{ctr}");
                    token.search.len()
                }
                FoundToken::EndOfData { .. } => {
                    panic!("EOD Not expected {ctr}")
                }
                FoundToken::NotFound => {
                    panic!("None not expected {ctr}")
                }
            };
            let consumed = exp_off + to_consume;
            scanner.consume(consumed);
            ctr += 1;
        }

        Ok(())
    }

    #[test]
    pub fn test_three_delim() -> Result<(), std::io::Error> {
        let data = "this is a test of the testing test";
        let mut scanner = Scanner::new(data.as_bytes(), &[Token::new("test", "test")]);
        for (exp_off, exp) in &[(10, "test"), (8, "test"), (4, "test")] {
            let to_consume = match scanner.scan_until_next()? {
                FoundToken::Found { offset, token } => {
                    assert_eq!(*exp_off, offset);
                    assert_eq!(*exp, token.response);
                    token.search.len()
                }
                FoundToken::EndOfData { remaining_length } => {
                    assert_eq!(remaining_length, 0);
                    remaining_length
                }
                FoundToken::NotFound => {
                    panic!("Not found");
                }
            };
            scanner.consume(exp_off + to_consume);
        }
        Ok(())
    }
}