extern crate alloc;
use alloc::collections::VecDeque;
use std::io::{BufReader, Read};
use crate::read::Buffer;
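/// Selects which byte(s) are treated as quote delimiters. While the scanner is
/// between an opening and a closing quote byte, delimiter matches are suppressed.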
#[derive(Debug, Copy, Clone, Default)]
pub enum QuotedChars {
#[default]
SingleOrDoubleQuotes,
DoubleQuotes,
SingleQuotes,
Other(u8),
}
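/// A delimiter to search for: a byte sequence plus the `response` value handed back
/// whenever that sequence is found. A token can optionally ignore matches that are
/// escaped or quoted.
///
/// A construction sketch mirroring the CSV test below (`Field` stands in for a
/// caller-defined response value):
///
/// ```ignore
/// let field = Token::new(b",", Field)
///     .with_escape_char(b'\\')
///     .with_quote_char(QuotedChars::SingleOrDoubleQuotes);
/// ```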
#[derive(Clone)]
pub struct Token<T: Clone> {
search: Vec<u8>,
response: T,
escape_char: Option<u8>,
quote_char: Option<QuotedChars>,
}
impl<T: Clone> Token<T> {
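/// Create a token that reports `response` whenever the byte sequence `search` is found.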
pub fn new<S: AsRef<[u8]>>(search: S, response: T) -> Self {
Token {
search: search.as_ref().to_owned(),
response,
escape_char: None,
quote_char: None,
}
}
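/// Ignore any occurrence of the search sequence that is immediately preceded by the
/// `escape` byte.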
#[must_use]
pub fn with_escape_char(self, escape: u8) -> Self {
Token {
escape_char: Some(escape),
..self
}
}
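/// Ignore any occurrence of the search sequence that falls between an opening and a
/// closing quote byte, as selected by `quote_char`.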
#[must_use]
pub fn with_quote_char(self, quote_char: QuotedChars) -> Self {
Token {
quote_char: Some(quote_char),
..self
}
}
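/// The byte sequence this token searches for.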
#[must_use]
pub fn get_search(&self) -> &[u8] {
self.search.as_ref()
}
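/// The response value handed back when this token matches.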
#[must_use]
pub fn get_response(&self) -> &T {
&self.response
}
}
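/// Outcome of [`Scanner::scan_until_next`]: a delimiter match together with the
/// number of bytes scanned before it, end of input with the number of bytes read
/// without a match, or no result at all.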
pub enum FoundToken<'a, T: Clone> {
Found { offset: usize, token: &'a Token<T> },
EndOfData { remaining_length: usize },
NotFound,
}
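/// Outcome of [`Scanner::read_next`]: the bytes scanned before a matching delimiter,
/// the remaining buffered bytes at end of input, or nothing if the input was already
/// exhausted.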
pub enum ReadToken<'a, T: Clone> {
Found { data: Vec<u8>, token: &'a Token<T> },
EndOfData { data: Vec<u8> },
NotFound,
}
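/// Per-token match state: a sliding window over the most recent `search.len()` bytes,
/// plus escape and quote bookkeeping for the bytes that have already slid out of it.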
struct TokenWorkingMem<'a, T: Clone> {
token: &'a Token<T>,
ringbuf: VecDeque<u8>,
found_escape: bool,
last_found_quote_char: Option<u8>,
offset: usize,
}
impl<'a, T: Clone> TokenWorkingMem<'a, T> {
pub fn new(token: &'a Token<T>) -> Self {
TokenWorkingMem {
token,
ringbuf: VecDeque::with_capacity(token.search.len()),
found_escape: false,
last_found_quote_char: None,
offset: 0,
}
}
pub fn is_full(&self) -> bool {
// Compare against the token length, not `capacity()`: `VecDeque` may allocate more
// than requested, so a capacity-based check could keep the window from ever filling.
self.ringbuf.len() >= self.token.search.len()
}
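/// Slide `elem` into the window. When a byte falls out of the front, the escape and
/// quote state consulted by `matches()` is updated from that byte.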
pub fn push_back(&mut self, elem: u8) {
if !self.is_full() {
self.ringbuf.push_back(elem);
return;
}
let ret = self.ringbuf.pop_front();
if let Some(first) = ret {
self.offset += 1;
if let Some(esc) = self.token.escape_char {
self.found_escape = first == esc;
}
if let Some(quoted) = self.token.quote_char {
if let Some(last_char) = self.last_found_quote_char {
if last_char == first {
self.last_found_quote_char = None;
}
} else if match quoted {
QuotedChars::SingleOrDoubleQuotes => first == b'\'' || first == b'\"',
QuotedChars::DoubleQuotes => first == b'\"',
QuotedChars::SingleQuotes => first == b'\'',
QuotedChars::Other(o) => first == o,
} {
self.last_found_quote_char = Some(first);
}
}
}
self.ringbuf.push_back(elem);
}
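/// True when the window holds exactly the search sequence and the match is neither
/// escaped nor inside quotes.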
pub fn matches(&self) -> bool {
if !self.is_full() {
return false;
}
if self.found_escape {
return false;
}
if self.last_found_quote_char.is_some() {
return false;
}
self.ringbuf.iter().eq(&self.token.search)
}
}
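/// Splits a byte stream from any [`Read`] source on one or more delimiter [`Token`]s.
///
/// A usage sketch based on the tests below (`Field` stands in for a caller-defined
/// response value):
///
/// ```ignore
/// let delims = &[Token::new(b",", Field)];
/// let mut scanner = Scanner::new("a,b,c".as_bytes(), delims);
/// loop {
///     let to_consume = match scanner.scan_until_next() {
///         Ok(FoundToken::Found { offset, token }) => offset + token.get_search().len(),
///         _ => break,
///     };
///     // Skip past the scanned bytes and the matched delimiter itself.
///     scanner.consume(to_consume);
/// }
/// ```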
pub struct Scanner<T, R>
where
T: Read + Sized,
R: Clone,
{
reader: Buffer<BufReader<T>>,
tokens: Vec<Token<R>>,
}
impl<T: Read + Sized, R: Clone> Scanner<T, R> {
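/// Create a scanner over `input` that splits on `delimiters`, buffering up to 8 KiB.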
pub fn new(input: T, delimiters: &[Token<R>]) -> Self {
Scanner {
reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
tokens: Vec::from(delimiters),
}
}
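/// Like [`Scanner::new`], but with a caller-chosen read-buffer capacity (`max_buffer`, in bytes).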
pub fn with_max_lookahead(input: T, max_buffer: usize, delimiters: &[Token<R>]) -> Self {
Scanner {
reader: Buffer::new(BufReader::with_capacity(max_buffer, input)),
tokens: Vec::from(delimiters),
}
}
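/// Read bytes from the input until one of the configured tokens matches, or the input
/// is exhausted. Scanned bytes are kept in the internal buffer; call
/// [`Scanner::consume`] to drop them once they have been handled.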
pub fn scan_until_next(&mut self) -> Result<FoundToken<'_, R>, std::io::Error> {
let mut workingmem: Vec<TokenWorkingMem<R>> =
self.tokens.iter().map(TokenWorkingMem::new).collect();
let mut num_read = 0;
for byte in &mut self.reader {
for mem in &mut workingmem {
mem.push_back(byte);
if mem.matches() {
return Ok(FoundToken::Found {
offset: mem.offset,
token: mem.token,
});
}
}
num_read += 1;
}
Ok(FoundToken::EndOfData {
remaining_length: num_read,
})
}
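/// Like [`Scanner::scan_until_next`], but also hands back the bytes that were scanned
/// before the matching token, or the remaining buffered bytes once the input is
/// exhausted.
///
/// A usage sketch (`Line` is a placeholder response value, not part of this module):
///
/// ```ignore
/// let delims = &[Token::new(b"\n", Line)];
/// let mut scanner = Scanner::new("one\ntwo".as_bytes(), delims);
/// if let Ok(ReadToken::Found { data, .. }) = scanner.read_next() {
///     // `data` holds the bytes scanned before the matched `\n`.
/// }
/// ```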
pub fn read_next(&mut self) -> Result<ReadToken<'_, R>, std::io::Error> {
let mut workingmem: Vec<TokenWorkingMem<R>> =
self.tokens.iter().map(TokenWorkingMem::new).collect();
for byte in &mut self.reader {
for mem in &mut workingmem {
mem.push_back(byte);
if mem.matches() {
let buf = self.reader.consume_read_buffer();
let mut data: Vec<u8> = buf.into();
data.truncate(mem.offset);
return Ok(ReadToken::Found {
data,
token: mem.token,
});
}
}
}
let buf = self.reader.consume_read_buffer();
if !buf.is_empty() {
let data: Vec<u8> = buf.into();
return Ok(ReadToken::EndOfData { data });
}
Ok(ReadToken::NotFound)
}
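/// Drop `len` bytes from the front of the internal buffer.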
pub fn consume(&mut self, len: usize) {
self.reader.drain(..len);
}
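/// Consume the scanner and hand back the underlying [`Buffer`].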
pub fn take_back(self) -> Buffer<BufReader<T>> {
self.reader
}
}
impl<T: Read + Sized> Scanner<T, LineEnding> {
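/// Scanner preconfigured to split on `\n` ([`LineEnding::LineFeed`]).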
pub fn new_lf(input: T) -> Self {
Scanner {
reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
tokens: vec![Token::new("\n", LineEnding::LineFeed)],
}
}
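/// Scanner preconfigured to split on `\r\n` ([`LineEnding::CarriageReturnLineFeed`]).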
pub fn new_crlf(input: T) -> Self {
Scanner {
reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
tokens: vec![Token::new("\r\n", LineEnding::CarriageReturnLineFeed)],
}
}
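/// Scanner preconfigured to split on `\r` ([`LineEnding::CarriageReturn`]).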
pub fn new_cr(input: T) -> Self {
Scanner {
reader: Buffer::new(BufReader::with_capacity(8 * 1024, input)),
tokens: vec![Token::new("\r", LineEnding::CarriageReturn)],
}
}
}
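/// The line terminator matched by the [`Scanner::new_lf`], [`Scanner::new_crlf`] and
/// [`Scanner::new_cr`] constructors.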
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum LineEnding {
LineFeed,
CarriageReturnLineFeed,
CarriageReturn,
}
#[cfg(test)]
mod tests {
use crate::scanner::*;
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum Tokens {
Space,
Other,
}
#[test]
pub fn test_scan_until() -> Result<(), std::io::Error> {
let data = "this is a basic test\nthis is a second line";
let delims = &[Token::new(b" ", Tokens::Space)];
let mut scanner = Scanner::new(data.as_bytes(), delims);
for exp in [4, 2, 1, 5, 9, 2, 1, 6, 4] {
match scanner.scan_until_next()? {
FoundToken::Found { token, offset } => {
assert_eq!(offset, exp);
assert_eq!(token.response, Tokens::Space);
assert_ne!(token.response, Tokens::Other);
}
FoundToken::EndOfData { remaining_length } => {
assert_eq!(remaining_length, exp);
}
FoundToken::NotFound => {
panic!("None not expected")
}
}
scanner.consume(exp);
}
Ok(())
}
#[test]
pub fn test_scan_escaped() -> Result<(), std::io::Error> {
let data = "this is a basic \"escaped\\ test\nthis\" is a second line";
let delims = &[Token::new(b" ", Tokens::Space).with_escape_char(b'\\')];
let mut scanner = Scanner::new(data.as_bytes(), delims);
for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
match scanner.scan_until_next()? {
FoundToken::Found { token, offset } => {
assert_eq!(offset, exp);
assert_eq!(token.response, Tokens::Space);
assert_ne!(token.response, Tokens::Other);
}
FoundToken::EndOfData { .. } => {}
FoundToken::NotFound => {
panic!("None not expected")
}
}
scanner.consume(exp);
}
Ok(())
}
#[test]
pub fn test_scan_quoted_double() -> Result<(), std::io::Error> {
let data = "this is a basic \"escaped\\ test\nthis\" is a second line";
let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::DoubleQuotes)];
let mut scanner = Scanner::new(data.as_bytes(), delims);
for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
match scanner.scan_until_next()? {
FoundToken::Found { token, offset } => {
assert_eq!(offset, exp);
assert_eq!(token.response, Tokens::Space);
assert_ne!(token.response, Tokens::Other);
}
FoundToken::EndOfData { .. } => {}
FoundToken::NotFound => {
panic!("None not expected")
}
}
scanner.consume(exp);
}
Ok(())
}
#[test]
pub fn test_scan_quoted_single() -> Result<(), std::io::Error> {
let data = "this is a basic \'escaped\\ test\nthis\' is a second line";
let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::SingleQuotes)];
let mut scanner = Scanner::new(data.as_bytes(), delims);
for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
match scanner.scan_until_next()? {
FoundToken::Found { token, offset } => {
assert_eq!(offset, exp);
assert_eq!(token.response, Tokens::Space);
assert_ne!(token.response, Tokens::Other);
}
FoundToken::EndOfData { .. } => {}
FoundToken::NotFound => {
panic!("None not expected")
}
}
scanner.consume(exp);
}
Ok(())
}
#[test]
pub fn test_scan_quoted_other() -> Result<(), std::io::Error> {
let data = "this is a basic |escaped\\ test\nthis| is a second line";
let delims = &[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::Other(b'|'))];
let mut scanner = Scanner::new(data.as_bytes(), delims);
for exp in [4, 2, 1, 5, 20, 2, 1, 6, 4] {
match scanner.scan_until_next()? {
FoundToken::Found { token, offset } => {
assert_eq!(offset, exp);
assert_eq!(token.response, Tokens::Space);
assert_ne!(token.response, Tokens::Other);
}
FoundToken::EndOfData { .. } => {}
FoundToken::NotFound => {
panic!("None not expected")
}
}
scanner.consume(exp);
}
Ok(())
}
#[test]
pub fn test_scan_quoted_both() -> Result<(), std::io::Error> {
let data = "this is a \"more\' advanced\" \'escaped\\ \"test\nthis\' is a second line";
let delims =
&[Token::new(b" ", Tokens::Space).with_quote_char(QuotedChars::SingleOrDoubleQuotes)];
let mut scanner = Scanner::new(data.as_bytes(), delims);
for exp in [4, 2, 1, 16, 21, 2, 1, 6, 4] {
match scanner.scan_until_next()? {
FoundToken::Found { token, offset } => {
assert_eq!(offset, exp);
assert_eq!(token.response, Tokens::Space);
assert_ne!(token.response, Tokens::Other);
}
FoundToken::EndOfData { .. } => {}
FoundToken::NotFound => {
panic!("None not expected")
}
}
scanner.consume(exp);
}
Ok(())
}
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum CSVTokens {
Field,
Newline,
}
#[test]
pub fn test_scan_csv() -> Result<(), std::io::Error> {
let data = "name1,name2,name3,name4\r\nescaped\\,value1,\"quoted,value2\",\'quoted,value3\',\"long value\"\n\n";
let delims = &[
Token::new(b",", CSVTokens::Field)
.with_escape_char(b'\\')
.with_quote_char(QuotedChars::SingleOrDoubleQuotes),
Token::new(b"\r\n", CSVTokens::Newline),
Token::new(b"\n", CSVTokens::Newline),
];
let mut scanner = Scanner::new(data.as_bytes(), delims);
let exp = &[
(5, CSVTokens::Field),
(5, CSVTokens::Field),
(5, CSVTokens::Field),
(5, CSVTokens::Newline),
(15, CSVTokens::Field),
(15, CSVTokens::Field),
(15, CSVTokens::Field),
(12, CSVTokens::Newline),
(0, CSVTokens::Newline),
];
let mut ctr = 0;
for (exp_off, exp_ret) in exp {
let to_consume = match scanner.scan_until_next()? {
FoundToken::Found { token, offset } => {
assert_eq!(offset, *exp_off, "{ctr}: {:?}", token.response);
assert_eq!(token.response, *exp_ret, "{ctr}");
token.search.len()
}
FoundToken::EndOfData { .. } => {
panic!("EOD Not expected {ctr}")
}
FoundToken::NotFound => {
panic!("None not expected {ctr}")
}
};
let consumed = exp_off + to_consume;
scanner.consume(consumed);
ctr += 1;
}
Ok(())
}
#[test]
pub fn test_three_delim() -> Result<(), std::io::Error> {
let data = "this is a test of the testing test";
let mut scanner = Scanner::new(data.as_bytes(), &[Token::new("test", "test")]);
for (exp_off, exp) in &[(10, "test"), (8, "test"), (4, "test")] {
let to_consume = match scanner.scan_until_next()? {
FoundToken::Found { offset, token } => {
assert_eq!(*exp_off, offset);
assert_eq!(*exp, token.response);
token.search.len()
}
FoundToken::EndOfData { remaining_length } => {
assert_eq!(remaining_length, 0);
remaining_length
}
FoundToken::NotFound => {
panic!("Not found");
}
};
scanner.consume(exp_off + to_consume);
}
Ok(())
}
}