Skip to main content

facet_csv/
parser.rs

1//! CSV parser implementation using FormatParser trait.
2
3extern crate alloc;
4
5use alloc::borrow::Cow;
6use alloc::vec::Vec;
7
8use facet_format::{
9    ContainerKind, FieldEvidence, FormatParser, ParseEvent, ProbeStream, ScalarTypeHint,
10    ScalarValue,
11};
12
13use crate::error::{CsvError, CsvErrorKind};
14
/// Parser state for CSV.
///
/// States are kept on `CsvParser::state_stack`; an empty stack is treated
/// as [`ParserState::Ready`].
#[derive(Debug, Clone)]
enum ParserState {
    /// Ready to start parsing. No struct has been opened yet, so generating
    /// an event in this state without a pending hint is an error.
    Ready,
    /// Inside a struct, tracking remaining fields.
    ///
    /// `remaining_fields` starts at the count given to `hint_struct_fields`
    /// and is decremented once per `OrderedField` event; when it reaches
    /// zero the next generated event is `StructEnd`.
    InStruct { remaining_fields: usize },
}
23
/// CSV parser that emits FormatParser events.
///
/// CSV is parsed as a struct where each comma-separated field corresponds
/// to a struct field in definition order. The format does not support
/// nested structures or arrays.
///
/// Unlike fully self-describing formats (JSON), CSV is positional:
/// - Fields are identified by column order, not names
/// - The parser uses `hint_struct_fields` to know how many fields to expect
/// - Each field emits an `OrderedField` event followed by a `Scalar` value
pub struct CsvParser<'de> {
    /// Column values of the single input row, as split and unquoted by
    /// `parse_csv_row`. Empty when the trimmed input is empty.
    fields: Vec<&'de str>,
    /// Number of columns stepped past so far. It is incremented whenever an
    /// `OrderedField` event is generated, so the column belonging to the
    /// most recent `OrderedField` is `fields[field_index - 1]`.
    field_index: usize,
    /// Stack of open parser states; the top entry is the current state.
    state_stack: Vec<ParserState>,
    /// Event produced by `peek_event` that `next_event` has not yet returned.
    peeked: Option<ParseEvent<'de>>,
    /// Pending struct field count from `hint_struct_fields`.
    pending_struct_fields: Option<usize>,
    /// Pending scalar type hint from `hint_scalar_type`.
    pending_scalar_type: Option<ScalarTypeHint>,
}
44
45impl<'de> CsvParser<'de> {
46    /// Create a new CSV parser for a single row.
47    pub fn new(input: &'de str) -> Self {
48        let input = input.trim();
49        let fields: Vec<&str> = if input.is_empty() {
50            Vec::new()
51        } else {
52            parse_csv_row(input)
53        };
54
55        Self {
56            fields,
57            field_index: 0,
58            state_stack: Vec::new(),
59            peeked: None,
60            pending_struct_fields: None,
61            pending_scalar_type: None,
62        }
63    }
64
65    /// Get the current parser state.
66    fn current_state(&self) -> &ParserState {
67        self.state_stack.last().unwrap_or(&ParserState::Ready)
68    }
69
70    /// Generate the next event based on current state.
71    fn generate_next_event(&mut self) -> Result<ParseEvent<'de>, CsvError> {
72        // Check if we have a pending scalar type hint
73        if let Some(hint) = self.pending_scalar_type.take() {
74            if self.field_index > 0 && self.field_index <= self.fields.len() {
75                let field_value = self.fields[self.field_index - 1];
76                return Ok(ParseEvent::Scalar(parse_scalar_with_hint(
77                    field_value,
78                    hint,
79                )));
80            } else {
81                return Err(CsvError::new(CsvErrorKind::UnexpectedEof {
82                    expected: "field for scalar hint",
83                }));
84            }
85        }
86
87        // Check if we have a pending struct hint
88        if let Some(num_fields) = self.pending_struct_fields.take() {
89            self.state_stack.push(ParserState::InStruct {
90                remaining_fields: num_fields,
91            });
92            return Ok(ParseEvent::StructStart(ContainerKind::Object));
93        }
94
95        // Process based on current state
96        match self.current_state().clone() {
97            ParserState::Ready => {
98                // Without a hint, we can't know how many fields to expect
99                // Return an error - the driver should call hint_struct_fields first
100                Err(CsvError::new(CsvErrorKind::UnsupportedType {
101                    type_name: "CSV parser requires hint_struct_fields to know field count",
102                }))
103            }
104            ParserState::InStruct { remaining_fields } => {
105                if remaining_fields == 0 {
106                    // Struct complete
107                    self.state_stack.pop();
108                    Ok(ParseEvent::StructEnd)
109                } else {
110                    // More fields to go - emit OrderedField and decrement
111                    if let Some(ParserState::InStruct { remaining_fields }) =
112                        self.state_stack.last_mut()
113                    {
114                        *remaining_fields -= 1;
115                    }
116                    // Advance field index when emitting OrderedField
117                    self.field_index += 1;
118                    Ok(ParseEvent::OrderedField)
119                }
120            }
121        }
122    }
123}
124
125/// Parse a CSV row into fields, handling quoted fields.
126fn parse_csv_row(input: &str) -> Vec<&str> {
127    let mut fields = Vec::new();
128    let mut in_quotes = false;
129    let mut field_start = 0;
130    let bytes = input.as_bytes();
131
132    for (i, &b) in bytes.iter().enumerate() {
133        match b {
134            b'"' => {
135                in_quotes = !in_quotes;
136            }
137            b',' if !in_quotes => {
138                let field = &input[field_start..i];
139                fields.push(unquote_field(field));
140                field_start = i + 1;
141            }
142            _ => {}
143        }
144    }
145
146    // Add the last field
147    let field = &input[field_start..];
148    fields.push(unquote_field(field));
149
150    fields
151}
152
/// Trim a field and drop one pair of enclosing double quotes, if present.
///
/// The field is returned trimmed but otherwise untouched unless it both
/// starts and ends with `"` (two distinct characters). Doubled quotes
/// inside the field are not unescaped.
fn unquote_field(field: &str) -> &str {
    let bare = field.trim();
    // A lone `"` loses its prefix and then has no suffix left to strip, so
    // it falls through to the trimmed text unchanged.
    bare.strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .unwrap_or(bare)
}
162
163/// Parse a scalar value with the given type hint.
164fn parse_scalar_with_hint(value: &str, hint: ScalarTypeHint) -> ScalarValue<'_> {
165    match hint {
166        ScalarTypeHint::Bool => {
167            let val = matches!(value, "true" | "TRUE" | "1" | "yes" | "YES");
168            ScalarValue::Bool(val)
169        }
170        ScalarTypeHint::U8
171        | ScalarTypeHint::U16
172        | ScalarTypeHint::U32
173        | ScalarTypeHint::U64
174        | ScalarTypeHint::Usize => {
175            if let Ok(n) = value.parse::<u64>() {
176                ScalarValue::U64(n)
177            } else {
178                // Fall back to string if parsing fails
179                ScalarValue::Str(Cow::Borrowed(value))
180            }
181        }
182        ScalarTypeHint::U128 => {
183            if let Ok(n) = value.parse::<u128>() {
184                ScalarValue::U128(n)
185            } else {
186                ScalarValue::Str(Cow::Borrowed(value))
187            }
188        }
189        ScalarTypeHint::I8
190        | ScalarTypeHint::I16
191        | ScalarTypeHint::I32
192        | ScalarTypeHint::I64
193        | ScalarTypeHint::Isize => {
194            if let Ok(n) = value.parse::<i64>() {
195                ScalarValue::I64(n)
196            } else {
197                ScalarValue::Str(Cow::Borrowed(value))
198            }
199        }
200        ScalarTypeHint::I128 => {
201            if let Ok(n) = value.parse::<i128>() {
202                ScalarValue::I128(n)
203            } else {
204                ScalarValue::Str(Cow::Borrowed(value))
205            }
206        }
207        ScalarTypeHint::F32 | ScalarTypeHint::F64 => {
208            if let Ok(n) = value.parse::<f64>() {
209                ScalarValue::F64(n)
210            } else {
211                ScalarValue::Str(Cow::Borrowed(value))
212            }
213        }
214        ScalarTypeHint::String | ScalarTypeHint::Char => ScalarValue::Str(Cow::Borrowed(value)),
215        ScalarTypeHint::Bytes => {
216            // Bytes in CSV are typically base64 or hex encoded
217            // For now, just return as string and let the deserializer handle it
218            ScalarValue::Str(Cow::Borrowed(value))
219        }
220    }
221}
222
223impl<'de> FormatParser<'de> for CsvParser<'de> {
224    type Error = CsvError;
225    type Probe<'a>
226        = CsvProbe
227    where
228        Self: 'a;
229
230    fn next_event(&mut self) -> Result<Option<ParseEvent<'de>>, Self::Error> {
231        // Return peeked event if available
232        if let Some(event) = self.peeked.take() {
233            return Ok(Some(event));
234        }
235        Ok(Some(self.generate_next_event()?))
236    }
237
238    fn peek_event(&mut self) -> Result<Option<ParseEvent<'de>>, Self::Error> {
239        if self.peeked.is_none() {
240            self.peeked = Some(self.generate_next_event()?);
241        }
242        Ok(self.peeked.clone())
243    }
244
245    fn skip_value(&mut self) -> Result<(), Self::Error> {
246        // Skip the current field by advancing index
247        if self.field_index < self.fields.len() {
248            self.field_index += 1;
249        }
250        Ok(())
251    }
252
253    fn begin_probe(&mut self) -> Result<Self::Probe<'_>, Self::Error> {
254        // CSV doesn't support probing for field names (it's positional)
255        Ok(CsvProbe)
256    }
257
258    fn is_self_describing(&self) -> bool {
259        // CSV is NOT self-describing in the facet-format sense:
260        // - It doesn't have field names in the data
261        // - It relies on position/order for field identification
262        // This tells the deserializer to use hint_struct_fields/hint_scalar_type
263        false
264    }
265
266    fn hint_struct_fields(&mut self, num_fields: usize) {
267        self.pending_struct_fields = Some(num_fields);
268        // Clear any peeked OrderedField placeholder
269        if matches!(self.peeked, Some(ParseEvent::OrderedField)) {
270            self.peeked = None;
271        }
272    }
273
274    fn hint_scalar_type(&mut self, hint: ScalarTypeHint) {
275        self.pending_scalar_type = Some(hint);
276        // Clear any peeked OrderedField placeholder
277        if matches!(self.peeked, Some(ParseEvent::OrderedField)) {
278            self.peeked = None;
279        }
280    }
281}
282
/// Empty probe for CSV - no field evidence since CSV is positional.
///
/// Derives `Debug` so the public type can be logged and embedded in other
/// `Debug` types.
#[derive(Debug)]
pub struct CsvProbe;
285
impl<'de> ProbeStream<'de> for CsvProbe {
    type Error = CsvError;

    /// Always returns `Ok(None)`: the probe never yields any field
    /// evidence and never fails.
    fn next(&mut self) -> Result<Option<FieldEvidence<'de>>, Self::Error> {
        // CSV doesn't have named fields, so no evidence to provide
        Ok(None)
    }
}