1extern crate alloc;
9use alloc::collections::VecDeque;
10use std::io::{BufReader, Read};
11
12use crate::read::Buffer;
13
/// Selects which byte(s) open and close a quoted region while scanning;
/// a delimiter found inside such a region is not reported as a match.
#[derive(Debug, Copy, Clone, Default)]
pub enum QuotedChars {
    /// Either `'` or `"` opens a region, closed only by the same byte.
    #[default]
    SingleOrDoubleQuotes,
    /// Only `"` delimits quoted regions.
    DoubleQuotes,
    /// Only `'` delimits quoted regions.
    SingleQuotes,
    /// A caller-chosen byte delimits quoted regions.
    Other(u8),
}

/// A delimiter pattern together with the value handed back when it is
/// found, plus optional escape/quote handling.
#[derive(Clone)]
pub struct Token<T: Clone> {
    // Byte sequence this token matches.
    search: Vec<u8>,
    // Value reported to the caller on a match.
    response: T,
    // If set, a match directly preceded by this byte is suppressed.
    escape_char: Option<u8>,
    // If set, matches inside a quoted region are suppressed.
    quote_char: Option<QuotedChars>,
}

impl<T: Clone> Token<T> {
    /// Builds a token matching the byte sequence `search` that yields
    /// `response`, with no escape or quote handling.
    pub fn new<S: AsRef<[u8]>>(search: S, response: T) -> Self {
        Token {
            search: search.as_ref().to_owned(),
            response,
            escape_char: None,
            quote_char: None,
        }
    }

    /// Returns this token with `escape` configured as its escape byte.
    #[must_use]
    pub fn with_escape_char(self, escape: u8) -> Self {
        Token {
            escape_char: Some(escape),
            ..self
        }
    }

    /// Returns this token with quote handling enabled.
    #[must_use]
    pub fn with_quote_char(self, quote_char: QuotedChars) -> Self {
        Token {
            quote_char: Some(quote_char),
            ..self
        }
    }

    /// The byte sequence this token matches.
    #[must_use]
    pub fn get_search(&self) -> &[u8] {
        &self.search
    }

    /// The value reported when this token matches.
    #[must_use]
    pub fn get_response(&self) -> &T {
        &self.response
    }
}
84
/// Result of [`Scanner::scan_until_next`].
pub enum FoundToken<'a, T: Clone> {
    /// A token matched. `offset` is the number of bytes read before the
    /// match within this scan.
    Found { offset: usize, token: &'a Token<T> },
    /// Input ran out. `remaining_length` is the number of bytes read
    /// during this scan without a match.
    EndOfData { remaining_length: usize },
    /// No match. NOTE(review): not produced by `scan_until_next` in this
    /// file — presumably kept for symmetry with [`ReadToken::NotFound`].
    NotFound,
}
95
/// Result of [`Scanner::read_next`].
pub enum ReadToken<'a, T: Clone> {
    /// A token matched; `data` holds the buffered bytes preceding it.
    Found { data: Vec<u8>, token: &'a Token<T> },
    /// Input ran out with unread bytes still buffered.
    EndOfData { data: Vec<u8> },
    /// Input ran out and no buffered bytes remained.
    NotFound,
}
106
// Per-token scanning state: a sliding window over the most recent input
// bytes plus escape/quote tracking for one `Token`.
struct TokenWorkingMem<'a, T: Clone> {
    // The token being searched for.
    token: &'a Token<T>,
    // Sliding window of recent bytes, sized to `token.search`.
    ringbuf: VecDeque<u8>,
    // True when the byte just before the window was the escape byte.
    found_escape: bool,
    // Holds the opening quote byte while inside a quoted region.
    last_found_quote_char: Option<u8>,
    // Number of bytes that have slid out of the window during this scan.
    offset: usize,
}
114impl<'a, T: Clone> TokenWorkingMem<'a, T> {
115 pub fn new(token: &'a Token<T>) -> Self {
116 TokenWorkingMem {
117 token,
118 ringbuf: VecDeque::with_capacity(token.search.len()),
119 found_escape: false,
120 last_found_quote_char: None,
121 offset: 0,
122 }
123 }
124
125 pub fn is_full(&self) -> bool {
127 self.ringbuf.capacity() - self.ringbuf.len() == 0
128 }
129
130 pub fn push_back(&mut self, elem: u8) {
132 if !self.is_full() {
133 self.ringbuf.push_back(elem);
135 return;
136 }
137 let ret = self.ringbuf.pop_front();
140
141 if let Some(first) = ret {
142 self.offset += 1;
143 if let Some(esc) = self.token.escape_char {
144 self.found_escape = first == esc;
145 }
146
147 if let Some(quoted) = self.token.quote_char {
148 if let Some(last_char) = self.last_found_quote_char {
155 if last_char == first {
157 self.last_found_quote_char = None;
159 }
160 } else if match quoted {
161 QuotedChars::SingleOrDoubleQuotes => first == b'\'' || first == b'\"',
164 QuotedChars::DoubleQuotes => first == b'\"',
165 QuotedChars::SingleQuotes => first == b'\'',
166 QuotedChars::Other(o) => first == o,
167 } {
168 self.last_found_quote_char = Some(first);
170 }
171 }
172 }
173
174 self.ringbuf.push_back(elem);
175 }
176
177 pub fn matches(&self) -> bool {
179 if !self.is_full() {
180 return false;
181 }
182 if self.found_escape {
183 return false;
185 }
186 if self.last_found_quote_char.is_some() {
187 return false;
189 }
190 self.ringbuf.iter().eq(&self.token.search)
191 }
192}
193
/// Streaming tokenizer: reads bytes from a buffered source and scans for
/// a set of delimiter [`Token`]s.
pub struct Scanner<T, R>
where
    T: Read + Sized,
    R: Clone,
{
    // Buffered input. NOTE(review): `Buffer` appears to retain read-but-
    // unconsumed bytes that callers drop via `consume` — confirm against
    // `crate::read::Buffer`.
    reader: Buffer<BufReader<T>>,
    // Delimiters searched for on every scan.
    tokens: Vec<Token<R>>,
}
208
impl<T: Read + Sized, R: Clone> Scanner<T, R> {
    /// Creates a scanner over `input` with a default 8 KiB read buffer,
    /// searching for the given delimiter tokens.
    pub fn new(input: T, delimiters: &[Token<R>]) -> Self {
        Scanner {
            reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
            tokens: Vec::from(delimiters),
        }
    }

    /// Like [`Scanner::new`], but with a caller-chosen read-buffer size.
    pub fn with_max_lookahead(input: T, max_buffer: usize, delimiters: &[Token<R>]) -> Self {
        Scanner {
            reader: Buffer::new(BufReader::with_capacity(max_buffer, input)),
            tokens: Vec::from(delimiters),
        }
    }

    /// Reads bytes until any delimiter token matches.
    ///
    /// On a match, returns [`FoundToken::Found`] with `offset` set to the
    /// number of bytes read before the match during this call. When the
    /// input is exhausted first, returns [`FoundToken::EndOfData`] with
    /// the number of bytes read without a match.
    ///
    /// NOTE(review): the `Buffer` iterator's position appears to persist
    /// across calls, while `consume` drains already-scanned bytes (the
    /// tests rely on this) — confirm against `crate::read::Buffer`.
    pub fn scan_until_next(&mut self) -> Result<FoundToken<R>, std::io::Error> {
        // Fresh per-token sliding-window state for this scan.
        let mut workingmem: Vec<TokenWorkingMem<R>> =
            self.tokens.iter().map(TokenWorkingMem::new).collect();
        let mut num_read = 0;
        for char in &mut self.reader {
            for mem in &mut workingmem {
                mem.push_back(char);

                if mem.matches() {
                    return Ok(FoundToken::Found {
                        offset: mem.offset,
                        token: mem.token,
                    });
                }
            }

            num_read += 1;
        }
        Ok(FoundToken::EndOfData {
            remaining_length: num_read,
        })
    }

    /// Scans like [`Scanner::scan_until_next`], but also hands back the
    /// buffered bytes preceding the match, truncated at the start of the
    /// matched delimiter. When the input runs out, returns
    /// [`ReadToken::EndOfData`] with any remaining buffered bytes, or
    /// [`ReadToken::NotFound`] if nothing remained.
    pub fn read_next(&mut self) -> Result<ReadToken<R>, std::io::Error> {
        let mut workingmem: Vec<TokenWorkingMem<R>> =
            self.tokens.iter().map(TokenWorkingMem::new).collect();
        for char in &mut self.reader {
            for mem in &mut workingmem {
                mem.push_back(char);

                if mem.matches() {
                    let buf = self.reader.consume_read_buffer();
                    let mut data: Vec<u8> = buf.into();
                    // Keep only the bytes that slid past the match window,
                    // i.e. drop the matched delimiter itself.
                    data.truncate(mem.offset);
                    return Ok(ReadToken::Found {
                        data,
                        token: mem.token,
                    });
                }
            }
        }
        let buf = self.reader.consume_read_buffer();
        if !buf.is_empty() {
            let data: Vec<u8> = buf.into();
            return Ok(ReadToken::EndOfData { data });
        }
        Ok(ReadToken::NotFound)
    }

    /// Drops the first `len` retained bytes from the read buffer.
    pub fn consume(&mut self, len: usize) {
        self.reader.drain(..len);
    }

    /// Consumes the scanner and returns the underlying buffer.
    pub fn take_back(self) -> Buffer<BufReader<T>> {
        self.reader
    }
}
294impl<T: Read + Sized> Scanner<T, LineEnding> {
295 pub fn new_lf(input: T) -> Self {
296 Scanner {
297 reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
298 tokens: vec![Token::new("\n", LineEnding::LineFeed)],
299 }
300 }
301 pub fn new_crlf(input: T) -> Self {
302 Scanner {
303 reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
304 tokens: vec![Token::new("\r\n", LineEnding::CarriageReturnLineFeed)],
305 }
306 }
307 pub fn new_cr(input: T) -> Self {
308 Scanner {
309 reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
310 tokens: vec![Token::new("\r", LineEnding::CarriageReturn)],
311 }
312 }
313}
314
/// The line-ending convention a line-splitting [`Scanner`] matches.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum LineEnding {
    /// `\n`
    LineFeed,
    /// `\r\n`
    CarriageReturnLineFeed,
    /// `\r`
    CarriageReturn,
}
321
#[cfg(test)]
mod tests {
    use crate::scanner::*;

    // Response marker carried by the delimiter tokens in these tests.
    #[derive(Copy, Clone, Eq, PartialEq, Debug)]
    enum Tokens {
        Space,
        Other,
    }

    // Splits on spaces; the expected offsets are the word lengths. The
    // last word is reported via `EndOfData`.
    #[test]
    pub fn test_scan_until() -> Result<(), std::io::Error> {
        let data = "this is a basic test\nthis is a second line";

        let delims = &[Token::new(b" ", Tokens::Space)];

        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 9, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { remaining_length } => {
                    assert_eq!(remaining_length, exp);
                }
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // The backslash escapes the space after "escaped", so the whole
    // "escaped\ test\nthis" run counts as one 20-byte span.
    #[test]
    pub fn test_scan_escaped() -> Result<(), std::io::Error> {
        let data = "this is a basic \"escaped\\ test\nthis\" is a second line";

        let delims = &[Token::new(b" ", Tokens::Space).with_escape_char(b'\\')];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // Spaces inside the double-quoted region are not reported as matches.
    #[test]
    pub fn test_scan_quoted_double() -> Result<(), std::io::Error> {
        let data = "this is a basic \"escaped\\ test\nthis\" is a second line";
        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::DoubleQuotes)];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }
    // Same shape as the double-quote test, but with single quotes.
    #[test]
    pub fn test_scan_quoted_single() -> Result<(), std::io::Error> {
        let data = "this is a basic \'escaped\\ test\nthis\' is a second line";
        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::SingleQuotes)];

        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // A caller-chosen quote byte (`|`) delimits the quoted region.
    #[test]
    pub fn test_scan_quoted_other() -> Result<(), std::io::Error> {
        let data = "this is a basic |escaped\\ test\nthis| is a second line";

        let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::Other(b'|'))];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // Mixed quoting: a `'` inside a `"`-region (and vice versa) does not
    // close the region — only the byte that opened it does.
    #[test]
    pub fn test_scan_quoted_both() -> Result<(), std::io::Error> {
        let data = "this is a \"more\' advanced\" \'escaped\\ \"test\nthis\' is a second line";
        let delims =
            &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::SingleOrDoubleQuotes)];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        for exp in [4, 2, 1, 16, 21, 2, 1, 6, 4] {
            match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, exp);
                    assert_eq!(token.response, Tokens::Space);
                    assert_ne!(token.response, Tokens::Other);
                }
                FoundToken::EndOfData { .. } => {}
                FoundToken::NotFound => {
                    panic!("None not expected")
                }
            }
            scanner.consume(exp);
        }

        Ok(())
    }

    // Response marker for the CSV-style multi-token test below.
    #[derive(Copy, Clone, Eq, PartialEq, Debug)]
    enum CSVTokens {
        Field,
        Newline,
    }
    // Multiple simultaneous tokens (field comma with escape+quote rules,
    // plus two newline styles); each iteration consumes the preceding
    // data and the matched delimiter itself.
    #[test]
    pub fn test_scan_csv() -> Result<(), std::io::Error> {
        let data = "name1,name2,name3,name4\r\nescaped\\,value1,\"quoted,value2\",\'quoted,value3\',\"long value\"\n\n";

        let delims = &[
            Token::new(b",", CSVTokens::Field)
                .with_escape_char(b'\\')
                .with_quote_char(QuotedChars::SingleOrDoubleQuotes),
            Token::new(b"\r\n", CSVTokens::Newline),
            Token::new(b"\n", CSVTokens::Newline),
        ];
        let mut scanner = Scanner::new(data.as_bytes(), delims);

        let exp = &[
            (5, CSVTokens::Field),
            (5, CSVTokens::Field),
            (5, CSVTokens::Field),
            (5, CSVTokens::Newline),
            (15, CSVTokens::Field),
            (15, CSVTokens::Field),
            (15, CSVTokens::Field),
            (12, CSVTokens::Newline),
            (0, CSVTokens::Newline),
        ];

        let mut ctr = 0;
        for (exp_off, exp_ret) in exp {
            let to_consume = match scanner.scan_until_next()? {
                FoundToken::Found { token, offset } => {
                    assert_eq!(offset, *exp_off, "{ctr}{:?}", token.response);
                    assert_eq!(token.response, *exp_ret, "{ctr}");
                    token.search.len()
                }
                FoundToken::EndOfData { .. } => {
                    panic!("EOD Not expected {ctr}")
                }
                FoundToken::NotFound => {
                    panic!("None not expected {ctr}")
                }
            };
            let consumed = exp_off + to_consume;
            scanner.consume(consumed);
            ctr += 1;
        }

        Ok(())
    }

    // Multi-byte search pattern ("test") matched several times in a row.
    #[test]
    pub fn test_three_delim() -> Result<(), std::io::Error> {
        let data = "this is a test of the testing test";
        let mut scanner = Scanner::new(data.as_bytes(), &[Token::new("test", "test")]);
        for (exp_off, exp) in &[(10, "test"), (8, "test"), (4, "test")] {
            let to_consume = match scanner.scan_until_next()? {
                FoundToken::Found { offset, token } => {
                    assert_eq!(*exp_off, offset);
                    assert_eq!(*exp, token.response);
                    token.search.len()
                }
                FoundToken::EndOfData { remaining_length } => {
                    assert_eq!(remaining_length, 0);
                    remaining_length
                }
                FoundToken::NotFound => {
                    panic!("Not found");
                }
            };
            scanner.consume(exp_off + to_consume);
        }
        Ok(())
    }
}