irox_csv/
tokenizers.rs

1// SPDX-License-Identifier: MIT
2// Copyright 2023 IROX Contributors
3
4use std::io::Read;
5
6use irox_tools::scanner as sc;
7use irox_tools::scanner::{QuotedChars, ReadToken, Scanner};
8
9use crate::error::CSVError;
10use crate::Dialect;
11
///
/// Output from the Tokenizers as they detect individual tokens from the input stream.
// PartialEq/Eq derived so consumers can compare tokens directly (e.g. in tests);
// variants carry only `String`, so structural equality is well-defined.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A single field value, with the field/line separator stripped.
    Field(String),
    /// Marks the end of the current record/row.
    EndRow,
    /// A comment line's contents (dialect-dependent comment character).
    Comment(String),
}
20
///
/// Internal tag attached to each scanner delimiter, identifying which kind of
/// separator was matched. Mapped to the public [`Token`] in `next_tokens`.
// Debug added for consistency with `Token` and easier diagnostics; Copy is
// free on a fieldless enum.
#[derive(Debug, Clone, Copy)]
enum InnerToken {
    /// A field separator (e.g. comma) was matched.
    Field,
    /// A line separator was matched — ends the current row.
    Newline,
    /// A comment character was matched.
    Comment,
}
27
///
/// A Token Reader reads tokens from an input stream.
pub trait TokenReader {
    ///
    /// Attempts to scan the line and return the immediate next set of [`Token`]s it finds.
    /// Call this function repeatedly until it returns `Ok(None)` or errors.
    ///
    /// Returns `Ok(Some(tokens))` while input remains, `Ok(None)` once the
    /// stream is exhausted, or a [`CSVError`] if the underlying read fails.
    fn next_tokens(&mut self) -> Result<Option<Vec<Token>>, CSVError>;
}
36
///
/// A Token Writer writes the set of specified tokens to the output stream.
pub trait TokenWriter {
    ///
    /// Attempts to write these tokens to the output stream.
    ///
    /// Returns a [`CSVError`] if the underlying write fails.
    fn write_tokens(&mut self, tokens: &[Token]) -> Result<(), CSVError>;
}
44
///
/// Scans the provided input stream and outputs [`Token`]s as it detects them.
pub struct BasicTokenReader<T>
where
    T: Read + Sized,
{
    // Underlying delimiter scanner, configured from the CSV dialect with an
    // `InnerToken` tag per delimiter so matches can be classified.
    scanner: Scanner<T, InnerToken>,
}
53
54impl<T: Read + Sized> BasicTokenReader<T> {
55    ///
56    /// Creates a new Tokenizer using the default RFC4180 Dialect, consuming the
57    /// underlying reader.
58    pub fn new(reader: T) -> Self {
59        let dialect = Dialect::default();
60        Self::dialect(reader, dialect)
61    }
62
63    ///
64    /// Token reader using the specified dialect
65    pub fn dialect(reader: T, dialect: Dialect) -> Self {
66        let delims = &[
67            sc::Token::new(dialect.get_field_separators(), InnerToken::Field)
68                .with_quote_char(QuotedChars::DoubleQuotes),
69            sc::Token::new(dialect.get_line_separators(), InnerToken::Newline)
70                .with_quote_char(QuotedChars::DoubleQuotes),
71            sc::Token::new(dialect.get_comment_chars(), InnerToken::Comment),
72        ];
73        Self {
74            scanner: Scanner::new(reader, delims),
75        }
76    }
77}
78
79impl<T: Read + Sized> TokenReader for BasicTokenReader<T> {
80    ///
81    /// Attempts to scan the line and return the immediate next set of [`Token`]s it finds.
82    /// The real brunt of the processing work is done here.
83    fn next_tokens(&mut self) -> Result<Option<Vec<Token>>, CSVError> {
84        match self.scanner.read_next()? {
85            ReadToken::Found { data, token } => {
86                let name = String::from_utf8_lossy(&data).to_string();
87                match token.get_response() {
88                    InnerToken::Field => Ok(Some(vec![Token::Field(name)])),
89                    InnerToken::Newline => Ok(Some(vec![Token::Field(name), Token::EndRow])),
90                    InnerToken::Comment => Ok(Some(vec![Token::Comment(name)])),
91                }
92            }
93            ReadToken::EndOfData { data } => Ok(Some(vec![
94                Token::Field(String::from_utf8_lossy(&data).to_string()),
95                Token::EndRow,
96            ])),
97            ReadToken::NotFound => Ok(None),
98        }
99    }
100}