1extern crate alloc;
10
11use crate::hex;
12use crate::read::Buffer;
13use alloc::borrow::Cow;
14use alloc::collections::VecDeque;
15use std::io::{BufReader, Read};
16
/// Which byte(s) open and close a quoted region while matching a [`Token`].
///
/// A delimiter match found inside an open quoted region is ignored; only the
/// same byte that opened the region closes it.
#[derive(Debug, Copy, Clone, Default)]
pub enum QuotedChars {
    /// Either `'` or `"` may open a region; only the matching byte closes it.
    #[default]
    SingleOrDoubleQuotes,
    /// Only `"` delimits quoted regions.
    DoubleQuotes,
    /// Only `'` delimits quoted regions.
    SingleQuotes,
    /// An arbitrary caller-chosen single-byte delimiter.
    Other(u8),
}
31
/// A delimiter pattern paired with the value reported when it matches.
#[derive(Clone)]
pub struct Token<T: Clone> {
    // Byte sequence searched for in the input.
    search: Vec<u8>,
    // Value handed back to the caller on a match.
    response: T,
    // When set, a match immediately preceded by this byte is ignored.
    escape_char: Option<u8>,
    // When set, matches inside a quoted region are ignored.
    quote_char: Option<QuotedChars>,
}
48
49impl<T: Clone> Token<T> {
50 pub fn new<S: AsRef<[u8]>>(search: S, response: T) -> Self {
51 Token {
52 search: search.as_ref().to_owned(),
53 response,
54 escape_char: None,
55 quote_char: None,
56 }
57 }
58 #[must_use]
59 pub fn with_escape_char(self, escape: u8) -> Self {
60 Token {
61 search: self.search,
62 response: self.response,
63 quote_char: self.quote_char,
64 escape_char: Some(escape),
65 }
66 }
67 #[must_use]
68 pub fn with_quote_char(self, quote_char: QuotedChars) -> Self {
69 Token {
70 search: self.search,
71 response: self.response,
72 quote_char: Some(quote_char),
73 escape_char: self.escape_char,
74 }
75 }
76
77 #[must_use]
78 pub fn get_search(&self) -> &[u8] {
79 self.search.as_ref()
80 }
81
82 #[must_use]
83 pub fn get_response(&self) -> &T {
84 &self.response
85 }
86}
87
/// Result of [`Scanner::scan_until_next`].
pub enum FoundToken<'a, T: Clone> {
    /// A delimiter matched `offset` bytes into the unconsumed data.
    Found { offset: usize, token: &'a Token<T> },
    /// Input was exhausted after reading `remaining_length` bytes.
    EndOfData { remaining_length: usize },
    /// No delimiter matched and no data remained.
    NotFound,
}
98
/// Result of [`Scanner::read_next`]: like [`FoundToken`], but carrying the
/// bytes that preceded the delimiter.
pub enum ReadToken<'a, T: Clone> {
    /// A delimiter matched; `data` holds the bytes before it.
    Found { data: Vec<u8>, token: &'a Token<T> },
    /// Input was exhausted; `data` holds the trailing bytes.
    EndOfData { data: Vec<u8> },
    /// No delimiter matched and no data remained.
    NotFound,
}
109impl<T: Clone> ReadToken<'_, T> {
110 pub fn get_data(self) -> Option<Vec<u8>> {
111 match self {
112 ReadToken::Found { data, .. } | ReadToken::EndOfData { data, .. } => Some(data),
113 ReadToken::NotFound => None,
114 }
115 }
116 pub fn as_str(&self) -> Cow<'_, str> {
117 match self {
118 ReadToken::Found { data, .. } | ReadToken::EndOfData { data, .. } => {
119 String::from_utf8_lossy(data.as_slice())
120 }
121 ReadToken::NotFound => Default::default(),
122 }
123 }
124}
125
/// Per-token sliding-window matching state used while scanning.
struct TokenWorkingMem<'a, T: Clone> {
    // The delimiter being searched for.
    token: &'a Token<T>,
    // Sliding window holding the most recent input bytes (pattern-sized).
    ringbuf: VecDeque<u8>,
    // True when the byte just before the window was the escape character.
    found_escape: bool,
    // The quote byte that opened a still-unclosed quoted region, if any.
    last_found_quote_char: Option<u8>,
    // Number of bytes that have scrolled past the front of the window.
    offset: usize,
}
133impl<'a, T: Clone> TokenWorkingMem<'a, T> {
134 pub fn new(token: &'a Token<T>) -> Self {
135 TokenWorkingMem {
136 token,
137 ringbuf: VecDeque::with_capacity(token.search.len()),
138 found_escape: false,
139 last_found_quote_char: None,
140 offset: 0,
141 }
142 }
143 pub fn reset(&mut self) {
144 self.ringbuf.clear();
145 self.found_escape = false;
146 self.last_found_quote_char = None;
147 self.offset = 0;
148 }
149 pub fn is_full(&self) -> bool {
151 self.ringbuf.capacity() - self.ringbuf.len() == 0
152 }
153
154 pub fn push_back(&mut self, elem: u8) {
156 if !self.is_full() {
157 self.ringbuf.push_back(elem);
159 return;
160 }
161 let ret = self.ringbuf.pop_front();
164
165 if let Some(first) = ret {
166 self.offset += 1;
167 if let Some(esc) = self.token.escape_char {
168 self.found_escape = first == esc;
169 }
170
171 if let Some(quoted) = self.token.quote_char {
172 if let Some(last_char) = self.last_found_quote_char {
179 if last_char == first {
181 self.last_found_quote_char = None;
183 }
184 } else if match quoted {
185 QuotedChars::SingleOrDoubleQuotes => first == b'\'' || first == b'\"',
188 QuotedChars::DoubleQuotes => first == b'\"',
189 QuotedChars::SingleQuotes => first == b'\'',
190 QuotedChars::Other(o) => first == o,
191 } {
192 self.last_found_quote_char = Some(first);
194 }
195 }
196 }
197
198 self.ringbuf.push_back(elem);
199 }
200
201 pub fn matches(&self) -> bool {
203 if !self.is_full() {
204 return false;
205 }
206 if self.found_escape {
207 return false;
209 }
210 if self.last_found_quote_char.is_some() {
211 return false;
213 }
214 self.ringbuf.iter().eq(&self.token.search)
215 }
216}
217
/// Splits a byte stream read from `T` into segments delimited by a set of
/// [`Token`]s, honoring each token's escape and quote rules.
pub struct Scanner<T, R>
where
    T: Read + Sized,
    R: Clone,
{
    // Buffered source; scanned bytes appear to be retained until explicitly
    // consumed — NOTE(review): depends on crate::read::Buffer semantics.
    reader: Buffer<BufReader<T>>,
    // Delimiters searched for in parallel on every byte.
    tokens: Vec<Token<R>>,
    // When true, zero-length segments between delimiters are skipped.
    skip_empty_data: bool,
}
233
impl<T: Read + Sized, R: Clone> Scanner<T, R> {
    /// Creates a scanner over `input` searching for the given delimiter
    /// tokens, with an 8 KiB buffered reader.
    pub fn new(input: T, delimiters: &[Token<R>]) -> Self {
        Scanner {
            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
            tokens: Vec::from(delimiters),
            skip_empty_data: false,
        }
    }

    /// Like [`Scanner::new`], but with a caller-chosen buffer capacity.
    pub fn with_max_lookahead(input: T, max_buffer: usize, delimiters: &[Token<R>]) -> Self {
        Scanner {
            reader: Buffer::new(BufReader::with_capacity(max_buffer, input)),
            tokens: Vec::from(delimiters),
            skip_empty_data: false,
        }
    }

    /// Advances until any delimiter matches.
    ///
    /// Returns `Found` with the matching token and the number of bytes that
    /// preceded it, or `EndOfData` with the number of bytes read when input
    /// runs out. Scanned bytes are not removed; call [`Scanner::consume`]
    /// afterwards. NOTE(review): no error path is visible in this body — the
    /// `io::Error` in the signature seems reserved for the reader; confirm.
    pub fn scan_until_next(&mut self) -> Result<FoundToken<'_, R>, std::io::Error> {
        // One sliding-window matcher per delimiter, advanced in lockstep.
        let mut workingmem: Vec<TokenWorkingMem<R>> =
            self.tokens.iter().map(TokenWorkingMem::new).collect();
        let mut num_read = 0;
        while let Some(char) = self.reader.next() {
            let mut reset = false;
            for mem in &mut workingmem {
                mem.push_back(char);

                if mem.matches() {
                    // Zero-length segment: discard it and keep scanning when
                    // skip_empty_data is enabled.
                    if self.skip_empty_data && (self.reader.is_empty() || mem.offset == 0) {
                        reset = true;
                        self.reader.consume_read_buffer();
                        break;
                    }
                    return Ok(FoundToken::Found {
                        offset: mem.offset,
                        token: mem.token,
                    });
                }
            }
            if reset {
                // Restart every matcher after dropping an empty segment.
                workingmem.iter_mut().for_each(TokenWorkingMem::reset);
            }
            num_read += 1;
        }
        Ok(FoundToken::EndOfData {
            remaining_length: num_read,
        })
    }

    /// Like [`Scanner::scan_until_next`], but consumes the scanned bytes and
    /// returns the data preceding the delimiter; the delimiter itself is
    /// discarded. Trailing bytes at end of input come back as `EndOfData`,
    /// and `NotFound` means no data remained at all.
    pub fn read_next(&mut self) -> Result<ReadToken<'_, R>, std::io::Error> {
        let mut workingmem: Vec<TokenWorkingMem<R>> =
            self.tokens.iter().map(TokenWorkingMem::new).collect();

        while let Some(char) = self.reader.next() {
            let mut reset = false;
            for mem in &mut workingmem {
                mem.push_back(char);
                if mem.matches() {
                    // Zero-length segment: discard and continue scanning.
                    if self.skip_empty_data && (self.reader.is_empty() || mem.offset == 0) {
                        reset = true;
                        self.reader.consume_read_buffer();
                        break;
                    }
                    let buf = self.reader.consume_read_buffer();
                    let mut data: Vec<u8> = buf.into();
                    // Keep only the bytes before the delimiter; the window
                    // bytes holding the delimiter itself are dropped.
                    data.truncate(mem.offset);
                    return Ok(ReadToken::Found {
                        data,
                        token: mem.token,
                    });
                }
            }
            if reset {
                workingmem.iter_mut().for_each(TokenWorkingMem::reset);
            }
        }
        let buf = self.reader.consume_read_buffer();
        if !buf.is_empty() {
            let data: Vec<u8> = buf.into();
            return Ok(ReadToken::EndOfData { data });
        }
        Ok(ReadToken::NotFound)
    }

    /// Discards `len` bytes from the front of the buffered data.
    pub fn consume(&mut self, len: usize) {
        self.reader.drain(..len);
    }

    /// Consumes the scanner, returning the underlying buffered reader.
    pub fn take_back(self) -> Buffer<BufReader<T>> {
        self.reader
    }

    /// After this call, zero-length segments between delimiters are skipped
    /// instead of being reported.
    pub fn skip_empty_data(&mut self) {
        self.skip_empty_data = true;
    }
}
342impl<T: Read + Sized> Scanner<T, LineEnding> {
343 pub fn new_lf(input: T) -> Self {
344 Scanner {
345 reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
346 tokens: vec![Token::new("\n", LineEnding::LineFeed)],
347 skip_empty_data: false,
348 }
349 }
350 pub fn new_crlf(input: T) -> Self {
351 Scanner {
352 reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
353 tokens: vec![Token::new("\r\n", LineEnding::CarriageReturnLineFeed)],
354 skip_empty_data: false,
355 }
356 }
357 pub fn new_cr(input: T) -> Self {
358 Scanner {
359 reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
360 tokens: vec![Token::new("\r", LineEnding::CarriageReturn)],
361 skip_empty_data: false,
362 }
363 }
364}
/// Whitespace byte matched by [`Scanner::new_whitespace`].
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum WhitespaceCharacter {
    /// 0x09 horizontal tab.
    Tab,
    /// 0x0A line feed.
    LineFeed,
    /// 0x0B vertical tab.
    VerticalTab,
    /// 0x0C form feed.
    FormFeed,
    /// 0x0D carriage return.
    CarriageReturn,
    /// 0x20 space.
    Space,
    /// 0x85 — NOTE(review): single byte, NEL in Latin-1 but not valid UTF-8
    /// on its own; confirm the intended encoding.
    NextLine,
    /// 0xA0 — same single-byte caveat as `NextLine`.
    NBSP,
}
376impl<T: Read + Sized> Scanner<T, WhitespaceCharacter> {
377 pub fn new_whitespace(input: T) -> Self {
378 Scanner {
379 reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
380 skip_empty_data: true,
381 tokens: vec![
382 Token::new(hex!("09"), WhitespaceCharacter::Tab),
383 Token::new(hex!("0A"), WhitespaceCharacter::LineFeed),
384 Token::new(hex!("0B"), WhitespaceCharacter::VerticalTab),
385 Token::new(hex!("0C"), WhitespaceCharacter::FormFeed),
386 Token::new(hex!("0D"), WhitespaceCharacter::CarriageReturn),
387 Token::new(hex!("20"), WhitespaceCharacter::Space),
388 Token::new(hex!("85"), WhitespaceCharacter::NextLine),
389 Token::new(hex!("A0"), WhitespaceCharacter::NBSP),
390 ],
391 }
392 }
393}
394
/// Line-ending style reported by the line-oriented scanner constructors.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum LineEnding {
    /// `\n` (Unix).
    LineFeed,
    /// `\r\n` (Windows).
    CarriageReturnLineFeed,
    /// `\r` (classic Mac).
    CarriageReturn,
}
401
#[cfg(test)]
mod tests {
    use crate::scanner::*;

    // Marker values used to check that the right token is reported.
    #[derive(Copy, Clone, Eq, PartialEq, Debug)]
    enum Tokens {
        Space,
        Other,
    }

    // Splits on spaces; each `exp` is the gap length before the next space.
    // Note `consume(exp)` does not consume the delimiter itself, so the
    // expected offsets account for the leftover space byte.
    #[test]
    pub fn test_scan_until() -> Result<(), std::io::Error> {
        let data = "this is a basic test\nthis is a second line";

        let delims = &[Token::new(b" ", Tokens::Space)];

        let mut scanner = Scanner::new(data.as_bytes(), delims);

        // Final 4 is the trailing "line", reported via EndOfData.
        for exp in [4, 2, 1, 5, 9, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { remaining_length } => {
                    assert_eq!(remaining_length, exp);
                }
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // The escaped space after the backslash must not split, so one segment
    // spans 20 bytes.
    #[test]
    pub fn test_scan_escaped() -> Result<(), std::io::Error> {
        let data = "this is a basic \"escaped\\ test\nthis\" is a second line";

        let delims = &[Token::new(b" ", Tokens::Space).with_escape_char(b'\\')];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // Spaces inside the double-quoted span must not split.
    #[test]
    pub fn test_scan_quoted_double() -> Result<(), std::io::Error> {
        let data = "this is a basic \"escaped\\ test\nthis\" is a second line";
        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::DoubleQuotes)];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }
    // Same as above but with single quotes as the delimiter pair.
    #[test]
    pub fn test_scan_quoted_single() -> Result<(), std::io::Error> {
        let data = "this is a basic \'escaped\\ test\nthis\' is a second line";
        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::SingleQuotes)];

        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // Custom quote byte (`|`) behaves like the built-in quote characters.
    #[test]
    pub fn test_scan_quoted_other() -> Result<(), std::io::Error> {
        let data = "this is a basic |escaped\\ test\nthis| is a second line";

        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::Other(b'|'))];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // Mixed quotes: a stray `'` inside a `"…"` region must not close it,
    // and vice versa — only the byte that opened a region closes it.
    #[test]
    pub fn test_scan_quoted_both() -> Result<(), std::io::Error> {
        let data = "this is a \"more\' advanced\" \'escaped\\ \"test\nthis\' is a second line";
        let delims =
            &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::SingleOrDoubleQuotes)];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 16, 21, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // Token kinds for the CSV-style multi-delimiter test below.
    #[derive(Copy, Clone, Eq, PartialEq, Debug)]
    enum CSVTokens {
        Field,
        Newline,
    }
    // Multiple delimiters at once: comma fields (escape + quote aware) plus
    // both CRLF and LF newlines. Each tuple is (field length, token kind);
    // here the delimiter's own length is consumed along with the field.
    #[test]
    pub fn test_scan_csv() -> Result<(), std::io::Error> {
        let data = "name1,name2,name3,name4\r\nescaped\\,value1,\"quoted,value2\",\'quoted,value3\',\"long value\"\n\n";

        let delims = &[
            Token::new(b",", CSVTokens::Field)
                .with_escape_char(b'\\')
                .with_quote_char(QuotedChars::SingleOrDoubleQuotes),
            Token::new(b"\r\n", CSVTokens::Newline),
            Token::new(b"\n", CSVTokens::Newline),
        ];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        let exp = &[
            (5, CSVTokens::Field),
            (5, CSVTokens::Field),
            (5, CSVTokens::Field),
            (5, CSVTokens::Newline),
            (15, CSVTokens::Field),
            (15, CSVTokens::Field),
            (15, CSVTokens::Field),
            (12, CSVTokens::Newline),
            (0, CSVTokens::Newline),
        ];

        let mut ctr = 0;
        for (exp_off, exp_ret) in exp {
            let to_consume = match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, *exp_off, "{ctr}{:?}", token.response);
                    assert_eq!(token.response, *exp_ret, "{ctr}");
                    token.search.len()
                }
                FoundToken::EndOfData { .. } => {
                    panic!("EOD Not expected {ctr}")
                }
                FoundToken::NotFound => {
                    panic!("None not expected {ctr}")
                }
            };
            // Consume the field plus the delimiter that terminated it.
            let consumed = exp_off + to_consume;
            scanner.consume(consumed);
            ctr += 1;
        }

        Ok(())
    }

    // Multi-byte delimiter ("test") occurring several times, including at
    // the very end of input.
    #[test]
    pub fn test_three_delim() -> Result<(), std::io::Error> {
        let data = "this is a test of the testing test";
        let mut scanner = Scanner::new(data.as_bytes(), &[Token::new("test", "test")]);
        for (exp_off, exp) in &[(10, "test"), (8, "test"), (4, "test")] {
            let to_consume = match scanner.scan_until_next()? {
                FoundToken::Found { offset, token } => {
                    assert_eq!(*exp_off, offset);
                    assert_eq!(*exp, token.response);
                    token.search.len()
                }
                FoundToken::EndOfData { remaining_length } => {
                    assert_eq!(remaining_length, 0);
                    remaining_length
                }
                FoundToken::NotFound => {
                    panic!("Not found");
                }
            };
            scanner.consume(exp_off + to_consume);
        }
        Ok(())
    }
}