Skip to main content

sciforge_parser/csv/
parser.rs

1use super::error::{CsvError, CsvErrorKind};
2use super::lexer::Cursor;
3use super::value::CsvValue;
4
/// Resource limits enforced by [`CsvParser`] while it scans its input.
///
/// Exceeding any of these limits aborts parsing with the matching
/// `CsvErrorKind`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CsvLimits {
    /// Maximum number of rows accepted in one input.
    pub max_rows: usize,
    /// Maximum number of fields accepted in a single row.
    pub max_columns: usize,
    /// Maximum length of a single field in bytes. For quoted fields this is
    /// the raw byte count between the enclosing quotes.
    pub max_field_len: usize,
    /// Maximum total number of fields accepted across all rows.
    pub max_node_count: usize,
}
12
13pub const DEFAULT_CSV_LIMITS: CsvLimits = CsvLimits {
14    max_rows: 1_000_000,
15    max_columns: 16_384,
16    max_field_len: 64 * 1024,
17    max_node_count: 2_000_000,
18};
19
20pub struct CsvParser<'a> {
21    cursor: Cursor<'a>,
22    limits: CsvLimits,
23    rows_seen: usize,
24    nodes_seen: usize,
25}
26
27impl<'a> CsvParser<'a> {
28    pub const fn new(bytes: &'a [u8]) -> Self {
29        Self {
30            cursor: Cursor::new(bytes),
31            limits: DEFAULT_CSV_LIMITS,
32            rows_seen: 0,
33            nodes_seen: 0,
34        }
35    }
36
37    pub const fn with_limits(mut self, limits: CsvLimits) -> Self {
38        self.limits = limits;
39        self
40    }
41
42    pub fn parse(mut self) -> Result<CsvValue<'a>, CsvError> {
43        self.parse_all()?;
44        Ok(CsvValue::Table)
45    }
46
47    pub fn validate(mut self) -> Result<(), CsvError> {
48        self.parse_all()
49    }
50
51    fn parse_all(&mut self) -> Result<(), CsvError> {
52        if self.cursor.is_eof() {
53            return Ok(());
54        }
55
56        loop {
57            self.parse_row()?;
58            if self.cursor.is_eof() {
59                return Ok(());
60            }
61        }
62    }
63
64    fn parse_row(&mut self) -> Result<(), CsvError> {
65        self.rows_seen = self.rows_seen.saturating_add(1);
66        if self.rows_seen > self.limits.max_rows {
67            return Err(CsvError::new(
68                CsvErrorKind::MaxRowsExceeded,
69                self.cursor.position(),
70            ));
71        }
72
73        let mut cols = 0usize;
74
75        loop {
76            self.parse_field()?;
77            cols = cols.saturating_add(1);
78            if cols > self.limits.max_columns {
79                return Err(CsvError::new(
80                    CsvErrorKind::MaxColumnsExceeded,
81                    self.cursor.position(),
82                ));
83            }
84
85            match self.cursor.peek() {
86                Some(b',') => {
87                    self.cursor.advance(1);
88                    continue;
89                }
90                Some(b'\n') => {
91                    self.cursor.advance(1);
92                    break;
93                }
94                Some(b'\r') => {
95                    self.cursor.advance(1);
96                    if self.cursor.peek() == Some(b'\n') {
97                        self.cursor.advance(1);
98                    }
99                    break;
100                }
101                None => break,
102                _ => {
103                    return Err(CsvError::new(
104                        CsvErrorKind::TrailingCharactersAfterQuote,
105                        self.cursor.position(),
106                    ));
107                }
108            }
109        }
110
111        Ok(())
112    }
113
114    fn parse_field(&mut self) -> Result<(), CsvError> {
115        self.nodes_seen = self.nodes_seen.saturating_add(1);
116        if self.nodes_seen > self.limits.max_node_count {
117            return Err(CsvError::new(
118                CsvErrorKind::MaxNodeCountExceeded,
119                self.cursor.position(),
120            ));
121        }
122
123        match self.cursor.peek() {
124            Some(b'"') => self.parse_quoted_field(),
125            _ => self.parse_unquoted_field(),
126        }
127    }
128
129    fn parse_unquoted_field(&mut self) -> Result<(), CsvError> {
130        let start = self.cursor.position();
131
132        while let Some(b) = self.cursor.peek() {
133            if b == b',' || b == b'\n' || b == b'\r' {
134                break;
135            }
136            if b == b'"' {
137                return Err(CsvError::new(
138                    CsvErrorKind::UnexpectedQuote,
139                    self.cursor.position(),
140                ));
141            }
142            self.cursor.advance(1);
143        }
144
145        let end = self.cursor.position();
146        let len = end.saturating_sub(start);
147        if len > self.limits.max_field_len {
148            return Err(CsvError::new(CsvErrorKind::MaxFieldLengthExceeded, start));
149        }
150
151        core::str::from_utf8(&self.cursor.bytes()[start..end])
152            .map_err(|_| CsvError::new(CsvErrorKind::InvalidUtf8, start))?;
153
154        Ok(())
155    }
156
157    fn parse_quoted_field(&mut self) -> Result<(), CsvError> {
158        let quote_start = self.cursor.position();
159        self.cursor.next();
160        let content_start = self.cursor.position();
161
162        loop {
163            let b = self.cursor.next().ok_or(CsvError::new(
164                CsvErrorKind::UnterminatedQuotedField,
165                quote_start,
166            ))?;
167
168            if b == b'"' {
169                if self.cursor.peek() == Some(b'"') {
170                    self.cursor.advance(1);
171                    continue;
172                }
173                break;
174            }
175        }
176
177        let content_end = self.cursor.position().saturating_sub(1);
178        let len = content_end.saturating_sub(content_start);
179        if len > self.limits.max_field_len {
180            return Err(CsvError::new(
181                CsvErrorKind::MaxFieldLengthExceeded,
182                content_start,
183            ));
184        }
185
186        core::str::from_utf8(&self.cursor.bytes()[content_start..content_end])
187            .map_err(|_| CsvError::new(CsvErrorKind::InvalidUtf8, content_start))?;
188
189        match self.cursor.peek() {
190            Some(b',') | Some(b'\n') | Some(b'\r') | None => Ok(()),
191            Some(_) => Err(CsvError::new(
192                CsvErrorKind::TrailingCharactersAfterQuote,
193                self.cursor.position(),
194            )),
195        }
196    }
197}
198
199pub fn parse_csv(bytes: &[u8]) -> Result<CsvValue<'_>, CsvError> {
200    CsvParser::new(bytes).parse()
201}
202
203pub fn parse_csv_with_limits(bytes: &[u8], limits: CsvLimits) -> Result<CsvValue<'_>, CsvError> {
204    CsvParser::new(bytes).with_limits(limits).parse()
205}
206
207pub fn validate_csv(bytes: &[u8]) -> Result<(), CsvError> {
208    CsvParser::new(bytes).validate()
209}