// sciforge_parser/csv/parser.rs
use super::error::{CsvError, CsvErrorKind};
use super::lexer::Cursor;
use super::value::CsvValue;
4
/// Resource limits enforced while scanning CSV input.
///
/// Each limit is inclusive: a count or length equal to the limit is accepted,
/// and only a value strictly greater than it is rejected.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CsvLimits {
    /// Maximum number of rows accepted in one input.
    pub max_rows: usize,
    /// Maximum number of fields accepted in a single row.
    pub max_columns: usize,
    /// Maximum length of a single field, in raw bytes.
    pub max_field_len: usize,
    /// Maximum total number of fields accepted across the whole input.
    pub max_node_count: usize,
}
12
13pub const DEFAULT_CSV_LIMITS: CsvLimits = CsvLimits {
14 max_rows: 1_000_000,
15 max_columns: 16_384,
16 max_field_len: 64 * 1024,
17 max_node_count: 2_000_000,
18};
19
20pub struct CsvParser<'a> {
21 cursor: Cursor<'a>,
22 limits: CsvLimits,
23 rows_seen: usize,
24 nodes_seen: usize,
25}
26
27impl<'a> CsvParser<'a> {
28 pub const fn new(bytes: &'a [u8]) -> Self {
29 Self {
30 cursor: Cursor::new(bytes),
31 limits: DEFAULT_CSV_LIMITS,
32 rows_seen: 0,
33 nodes_seen: 0,
34 }
35 }
36
37 pub const fn with_limits(mut self, limits: CsvLimits) -> Self {
38 self.limits = limits;
39 self
40 }
41
42 pub fn parse(mut self) -> Result<CsvValue<'a>, CsvError> {
43 self.parse_all()?;
44 Ok(CsvValue::Table)
45 }
46
47 pub fn validate(mut self) -> Result<(), CsvError> {
48 self.parse_all()
49 }
50
51 fn parse_all(&mut self) -> Result<(), CsvError> {
52 if self.cursor.is_eof() {
53 return Ok(());
54 }
55
56 loop {
57 self.parse_row()?;
58 if self.cursor.is_eof() {
59 return Ok(());
60 }
61 }
62 }
63
64 fn parse_row(&mut self) -> Result<(), CsvError> {
65 self.rows_seen = self.rows_seen.saturating_add(1);
66 if self.rows_seen > self.limits.max_rows {
67 return Err(CsvError::new(
68 CsvErrorKind::MaxRowsExceeded,
69 self.cursor.position(),
70 ));
71 }
72
73 let mut cols = 0usize;
74
75 loop {
76 self.parse_field()?;
77 cols = cols.saturating_add(1);
78 if cols > self.limits.max_columns {
79 return Err(CsvError::new(
80 CsvErrorKind::MaxColumnsExceeded,
81 self.cursor.position(),
82 ));
83 }
84
85 match self.cursor.peek() {
86 Some(b',') => {
87 self.cursor.advance(1);
88 continue;
89 }
90 Some(b'\n') => {
91 self.cursor.advance(1);
92 break;
93 }
94 Some(b'\r') => {
95 self.cursor.advance(1);
96 if self.cursor.peek() == Some(b'\n') {
97 self.cursor.advance(1);
98 }
99 break;
100 }
101 None => break,
102 _ => {
103 return Err(CsvError::new(
104 CsvErrorKind::TrailingCharactersAfterQuote,
105 self.cursor.position(),
106 ));
107 }
108 }
109 }
110
111 Ok(())
112 }
113
114 fn parse_field(&mut self) -> Result<(), CsvError> {
115 self.nodes_seen = self.nodes_seen.saturating_add(1);
116 if self.nodes_seen > self.limits.max_node_count {
117 return Err(CsvError::new(
118 CsvErrorKind::MaxNodeCountExceeded,
119 self.cursor.position(),
120 ));
121 }
122
123 match self.cursor.peek() {
124 Some(b'"') => self.parse_quoted_field(),
125 _ => self.parse_unquoted_field(),
126 }
127 }
128
129 fn parse_unquoted_field(&mut self) -> Result<(), CsvError> {
130 let start = self.cursor.position();
131
132 while let Some(b) = self.cursor.peek() {
133 if b == b',' || b == b'\n' || b == b'\r' {
134 break;
135 }
136 if b == b'"' {
137 return Err(CsvError::new(
138 CsvErrorKind::UnexpectedQuote,
139 self.cursor.position(),
140 ));
141 }
142 self.cursor.advance(1);
143 }
144
145 let end = self.cursor.position();
146 let len = end.saturating_sub(start);
147 if len > self.limits.max_field_len {
148 return Err(CsvError::new(CsvErrorKind::MaxFieldLengthExceeded, start));
149 }
150
151 core::str::from_utf8(&self.cursor.bytes()[start..end])
152 .map_err(|_| CsvError::new(CsvErrorKind::InvalidUtf8, start))?;
153
154 Ok(())
155 }
156
157 fn parse_quoted_field(&mut self) -> Result<(), CsvError> {
158 let quote_start = self.cursor.position();
159 self.cursor.next();
160 let content_start = self.cursor.position();
161
162 loop {
163 let b = self.cursor.next().ok_or(CsvError::new(
164 CsvErrorKind::UnterminatedQuotedField,
165 quote_start,
166 ))?;
167
168 if b == b'"' {
169 if self.cursor.peek() == Some(b'"') {
170 self.cursor.advance(1);
171 continue;
172 }
173 break;
174 }
175 }
176
177 let content_end = self.cursor.position().saturating_sub(1);
178 let len = content_end.saturating_sub(content_start);
179 if len > self.limits.max_field_len {
180 return Err(CsvError::new(
181 CsvErrorKind::MaxFieldLengthExceeded,
182 content_start,
183 ));
184 }
185
186 core::str::from_utf8(&self.cursor.bytes()[content_start..content_end])
187 .map_err(|_| CsvError::new(CsvErrorKind::InvalidUtf8, content_start))?;
188
189 match self.cursor.peek() {
190 Some(b',') | Some(b'\n') | Some(b'\r') | None => Ok(()),
191 Some(_) => Err(CsvError::new(
192 CsvErrorKind::TrailingCharactersAfterQuote,
193 self.cursor.position(),
194 )),
195 }
196 }
197}
198
199pub fn parse_csv(bytes: &[u8]) -> Result<CsvValue<'_>, CsvError> {
200 CsvParser::new(bytes).parse()
201}
202
203pub fn parse_csv_with_limits(bytes: &[u8], limits: CsvLimits) -> Result<CsvValue<'_>, CsvError> {
204 CsvParser::new(bytes).with_limits(limits).parse()
205}
206
207pub fn validate_csv(bytes: &[u8]) -> Result<(), CsvError> {
208 CsvParser::new(bytes).validate()
209}