Skip to main content

facet_csv/
parser.rs

1//! CSV parser implementation using FormatParser trait.
2
3extern crate alloc;
4
5use alloc::borrow::Cow;
6use alloc::vec::Vec;
7
8use facet_format::{
9    ContainerKind, DeserializeErrorKind, FormatParser, ParseError, ParseEvent, ParseEventKind,
10    SavePoint, ScalarTypeHint, ScalarValue,
11};
12use facet_reflect::Span;
13
/// Position of the CSV parser within its event sequence.
#[derive(Clone, Debug)]
enum ParserState {
    /// No struct is currently open; this is also the state assumed when the
    /// state stack is empty.
    Ready,
    /// A struct is open and some of its positional fields are still pending.
    InStruct {
        /// Number of fields that have not been announced yet.
        remaining_fields: usize,
    },
}
22
/// One field of a CSV row together with where it sits in the input.
#[derive(Clone, Copy, Debug)]
struct FieldSpan<'de> {
    /// Field text with surrounding whitespace and enclosing quotes removed.
    value: &'de str,
    /// Byte offset at which `value` starts.
    offset: usize,
    /// Length of `value` in bytes.
    len: usize,
}
30
31/// CSV parser that emits FormatParser events.
32///
33/// CSV is parsed as a struct where each comma-separated field corresponds
34/// to a struct field in definition order. The format does not support
35/// nested structures or arrays.
36///
37/// Unlike fully self-describing formats (JSON), CSV is positional:
38/// - Fields are identified by column order, not names
39/// - The parser uses `hint_struct_fields` to know how many fields to expect
40/// - Each field emits an `OrderedField` event followed by a `Scalar` value
41pub struct CsvParser<'de> {
42    input: &'de str,
43    fields: Vec<FieldSpan<'de>>,
44    field_index: usize,
45    state_stack: Vec<ParserState>,
46    peeked: Option<ParseEvent<'de>>,
47    /// Pending struct field count from `hint_struct_fields`.
48    pending_struct_fields: Option<usize>,
49    /// Pending scalar type hint from `hint_scalar_type`.
50    pending_scalar_type: Option<ScalarTypeHint>,
51}
52
53impl<'de> CsvParser<'de> {
54    /// Create a new CSV parser for a single row.
55    pub fn new(input: &'de str) -> Self {
56        let trimmed = input.trim();
57        // Calculate the offset of the trimmed content within the original input
58        let trim_offset = input.len() - input.trim_start().len();
59        let fields = if trimmed.is_empty() {
60            Vec::new()
61        } else {
62            parse_csv_row_with_spans(trimmed, trim_offset)
63        };
64
65        Self {
66            input,
67            fields,
68            field_index: 0,
69            state_stack: Vec::new(),
70            peeked: None,
71            pending_struct_fields: None,
72            pending_scalar_type: None,
73        }
74    }
75
76    /// Get the current parser state.
77    fn current_state(&self) -> &ParserState {
78        self.state_stack.last().unwrap_or(&ParserState::Ready)
79    }
80
81    /// Get the span for the current field, or EOF span if past the end.
82    fn current_field_span(&self) -> Span {
83        if self.field_index > 0 && self.field_index <= self.fields.len() {
84            let field = &self.fields[self.field_index - 1];
85            Span::new(field.offset, field.len)
86        } else {
87            // EOF span
88            Span::new(self.input.len(), 0)
89        }
90    }
91
92    /// Generate the next event based on current state.
93    fn generate_next_event(&mut self) -> Result<ParseEvent<'de>, ParseError> {
94        // Check if we have a pending scalar type hint
95        if let Some(hint) = self.pending_scalar_type.take() {
96            if self.field_index > 0 && self.field_index <= self.fields.len() {
97                let field = &self.fields[self.field_index - 1];
98                return Ok(self.event(ParseEventKind::Scalar(parse_scalar_with_hint(
99                    field.value,
100                    hint,
101                ))));
102            } else {
103                return Err(ParseError::new(
104                    Span::new(self.input.len(), 0),
105                    DeserializeErrorKind::UnexpectedEof {
106                        expected: "field for scalar hint",
107                    },
108                ));
109            }
110        }
111
112        // Check if we have a pending struct hint
113        if let Some(num_fields) = self.pending_struct_fields.take() {
114            self.state_stack.push(ParserState::InStruct {
115                remaining_fields: num_fields,
116            });
117            return Ok(self.event(ParseEventKind::StructStart(ContainerKind::Object)));
118        }
119
120        // Process based on current state
121        match self.current_state().clone() {
122            ParserState::Ready => {
123                // Without a hint, we can't know how many fields to expect
124                // Return an error - the driver should call hint_struct_fields first
125                Err(ParseError::new(
126                    Span::new(0, self.input.len()),
127                    DeserializeErrorKind::InvalidValue {
128                        message: "CSV parser requires hint_struct_fields to know field count"
129                            .into(),
130                    },
131                ))
132            }
133            ParserState::InStruct { remaining_fields } => {
134                if remaining_fields == 0 {
135                    // Struct complete
136                    self.state_stack.pop();
137                    Ok(self.event(ParseEventKind::StructEnd))
138                } else {
139                    // More fields to go - emit OrderedField and decrement
140                    if let Some(ParserState::InStruct { remaining_fields }) =
141                        self.state_stack.last_mut()
142                    {
143                        *remaining_fields -= 1;
144                    }
145                    // Advance field index when emitting OrderedField
146                    self.field_index += 1;
147                    Ok(self.event(ParseEventKind::OrderedField))
148                }
149            }
150        }
151    }
152}
153
154/// Parse a CSV row into fields with spans, handling quoted fields.
155fn parse_csv_row_with_spans(input: &str, base_offset: usize) -> Vec<FieldSpan<'_>> {
156    let mut fields = Vec::new();
157    let mut in_quotes = false;
158    let mut field_start = 0;
159    let bytes = input.as_bytes();
160
161    for (i, &b) in bytes.iter().enumerate() {
162        match b {
163            b'"' => {
164                in_quotes = !in_quotes;
165            }
166            b',' if !in_quotes => {
167                let field = &input[field_start..i];
168                let (value, value_offset) = unquote_field_with_offset(field, field_start);
169                fields.push(FieldSpan {
170                    value,
171                    offset: base_offset + value_offset,
172                    len: value.len(),
173                });
174                field_start = i + 1;
175            }
176            _ => {}
177        }
178    }
179
180    // Add the last field
181    let field = &input[field_start..];
182    let (value, value_offset) = unquote_field_with_offset(field, field_start);
183    fields.push(FieldSpan {
184        value,
185        offset: base_offset + value_offset,
186        len: value.len(),
187    });
188
189    fields
190}
191
/// Trim a raw field and drop one pair of enclosing double quotes, if present.
///
/// Returns the cleaned value together with its byte offset, computed as
/// `field_start` plus however many leading bytes (whitespace and the opening
/// quote) were removed. A lone `"` is not treated as a quoted field.
fn unquote_field_with_offset(field: &str, field_start: usize) -> (&str, usize) {
    let without_leading = field.trim_start();
    let leading = field.len() - without_leading.len();
    let content = without_leading.trim_end();
    let content_start = field_start + leading;

    match content
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
    {
        // Skip the opening quote as well.
        Some(inner) => (inner, content_start + 1),
        None => (content, content_start),
    }
}
203
204/// Parse a scalar value with the given type hint.
205fn parse_scalar_with_hint(value: &str, hint: ScalarTypeHint) -> ScalarValue<'_> {
206    match hint {
207        ScalarTypeHint::Bool => {
208            let val = matches!(value, "true" | "TRUE" | "1" | "yes" | "YES");
209            ScalarValue::Bool(val)
210        }
211        ScalarTypeHint::U8
212        | ScalarTypeHint::U16
213        | ScalarTypeHint::U32
214        | ScalarTypeHint::U64
215        | ScalarTypeHint::Usize => {
216            if let Ok(n) = value.parse::<u64>() {
217                ScalarValue::U64(n)
218            } else {
219                // Fall back to string if parsing fails
220                ScalarValue::Str(Cow::Borrowed(value))
221            }
222        }
223        ScalarTypeHint::U128 => {
224            if let Ok(n) = value.parse::<u128>() {
225                ScalarValue::U128(n)
226            } else {
227                ScalarValue::Str(Cow::Borrowed(value))
228            }
229        }
230        ScalarTypeHint::I8
231        | ScalarTypeHint::I16
232        | ScalarTypeHint::I32
233        | ScalarTypeHint::I64
234        | ScalarTypeHint::Isize => {
235            if let Ok(n) = value.parse::<i64>() {
236                ScalarValue::I64(n)
237            } else {
238                ScalarValue::Str(Cow::Borrowed(value))
239            }
240        }
241        ScalarTypeHint::I128 => {
242            if let Ok(n) = value.parse::<i128>() {
243                ScalarValue::I128(n)
244            } else {
245                ScalarValue::Str(Cow::Borrowed(value))
246            }
247        }
248        ScalarTypeHint::F32 | ScalarTypeHint::F64 => {
249            if let Ok(n) = value.parse::<f64>() {
250                ScalarValue::F64(n)
251            } else {
252                ScalarValue::Str(Cow::Borrowed(value))
253            }
254        }
255        ScalarTypeHint::String | ScalarTypeHint::Char => ScalarValue::Str(Cow::Borrowed(value)),
256        ScalarTypeHint::Bytes => {
257            // Bytes in CSV are typically base64 or hex encoded
258            // For now, just return as string and let the deserializer handle it
259            ScalarValue::Str(Cow::Borrowed(value))
260        }
261    }
262}
263
264impl<'de> FormatParser<'de> for CsvParser<'de> {
265    fn next_event(&mut self) -> Result<Option<ParseEvent<'de>>, ParseError> {
266        // Return peeked event if available
267        if let Some(event) = self.peeked.take() {
268            return Ok(Some(event));
269        }
270        Ok(Some(self.generate_next_event()?))
271    }
272
273    fn peek_event(&mut self) -> Result<Option<ParseEvent<'de>>, ParseError> {
274        if self.peeked.is_none() {
275            self.peeked = Some(self.generate_next_event()?);
276        }
277        Ok(self.peeked.clone())
278    }
279
280    fn skip_value(&mut self) -> Result<(), ParseError> {
281        // Skip the current field by advancing index
282        if self.field_index < self.fields.len() {
283            self.field_index += 1;
284        }
285        Ok(())
286    }
287
288    fn save(&mut self) -> SavePoint {
289        // CSV is positional - save/restore not meaningful
290        unimplemented!("save/restore not supported for CSV (positional format)")
291    }
292
293    fn restore(&mut self, _save_point: SavePoint) {
294        unimplemented!("save/restore not supported for CSV (positional format)")
295    }
296
297    fn is_self_describing(&self) -> bool {
298        // CSV is NOT self-describing in the facet-format sense:
299        // - It doesn't have field names in the data
300        // - It relies on position/order for field identification
301        // This tells the deserializer to use hint_struct_fields/hint_scalar_type
302        false
303    }
304
305    fn hint_struct_fields(&mut self, num_fields: usize) {
306        self.pending_struct_fields = Some(num_fields);
307        // Clear any peeked OrderedField placeholder
308        if matches!(
309            self.peeked.as_ref().map(|e| &e.kind),
310            Some(ParseEventKind::OrderedField)
311        ) {
312            self.peeked = None;
313        }
314    }
315
316    fn hint_scalar_type(&mut self, hint: ScalarTypeHint) {
317        self.pending_scalar_type = Some(hint);
318        // Clear any peeked OrderedField placeholder
319        if matches!(
320            self.peeked.as_ref().map(|e| &e.kind),
321            Some(ParseEventKind::OrderedField)
322        ) {
323            self.peeked = None;
324        }
325    }
326
327    fn current_span(&self) -> Option<Span> {
328        Some(self.current_field_span())
329    }
330}
331
332impl<'de> CsvParser<'de> {
333    /// Create an event with the current span.
334    #[inline]
335    fn event(&self, kind: ParseEventKind<'de>) -> ParseEvent<'de> {
336        ParseEvent::new(kind, self.current_field_span())
337    }
338}