Skip to main content

u_insight/
csv_parser.rs

1//! CSV parser with automatic type inference.
2//!
3//! Parses CSV files into a [`DataFrame`]
4//! with column types automatically inferred from content. The inference
5//! priority is: Numeric → Boolean → Categorical → Text.
6//!
7//! # Features
8//!
9//! - RFC 4180 compliant (quoted fields, escaped quotes, commas in fields)
10//! - Automatic type inference per column
//! - Default null markers: empty, `NA`, `N/A`, `na`, `n/a`, `null`, `NULL`, `None`, `none`, `.`, `NaN`, `nan`, `NAN`, `#N/A`, `#NA`
12//! - Low-cardinality strings are dictionary-encoded as Categorical
13//! - Configurable delimiter and null markers
14//!
15//! # Example
16//!
17//! ```
18//! use u_insight::csv_parser::CsvParser;
19//! use u_insight::dataframe::DataType;
20//!
21//! let csv = "name,value,active\nAlice,1.5,true\nBob,2.3,false\n";
22//! let df = CsvParser::new().parse_str(csv).unwrap();
23//! assert_eq!(df.row_count(), 2);
24//! assert_eq!(df.column_count(), 3);
25//! assert_eq!(df.column(0).unwrap().data_type(), DataType::Text);
26//! assert_eq!(df.column(1).unwrap().data_type(), DataType::Numeric);
27//! assert_eq!(df.column(2).unwrap().data_type(), DataType::Boolean);
28//! ```
29
30use crate::dataframe::{Column, DataFrame, DataType, ValidityBitmap};
31use crate::error::InsightError;
32use std::collections::HashMap;
33
/// Cell contents treated as missing when the parser uses its default configuration.
///
/// Matching is exact and case-sensitive against the whitespace-trimmed cell, so
/// the common spellings are listed individually.
const DEFAULT_NULL_MARKERS: &[&str] = &[
    // Empty cell
    "",
    // "Not available" spellings
    "NA",
    "N/A",
    "na",
    "n/a",
    // Programming-language style nulls
    "null",
    "NULL",
    "None",
    "none",
    // Single-dot placeholder
    ".",
    // Not-a-number spellings
    "NaN",
    "nan",
    "NAN",
    // Spreadsheet error values
    "#N/A",
    "#NA",
];

/// Exclusive upper bound on the unique-to-non-null value ratio below which a
/// string column is classified as Categorical rather than Text (0.5 = 50%).
const CATEGORICAL_THRESHOLD: f64 = 0.5;

/// Largest number of distinct values a column may have and still be
/// classified as Categorical (inclusive).
const MAX_CATEGORICAL_UNIQUE: usize = 1000;
46
/// Builder-style CSV reader: holds the parsing options and exposes the
/// `parse_*` entry points.
///
/// ```
/// use u_insight::csv_parser::CsvParser;
///
/// let csv = "a,b\n1,2\n3,4\n";
/// let df = CsvParser::new().parse_str(csv).unwrap();
/// assert_eq!(df.row_count(), 2);
/// ```
#[derive(Clone, Debug)]
pub struct CsvParser {
    /// Field separator byte; it is converted to a `char` when scanning input.
    delimiter: u8,
    /// When `true`, the first record supplies the column names.
    has_header: bool,
    /// Strings that mark a cell as null, compared against the trimmed cell text.
    null_markers: Vec<String>,
}
62
63impl CsvParser {
64    /// Creates a parser with default settings (comma delimiter, header row, standard null markers).
65    pub fn new() -> Self {
66        Self {
67            delimiter: b',',
68            has_header: true,
69            null_markers: DEFAULT_NULL_MARKERS
70                .iter()
71                .map(|s| (*s).to_string())
72                .collect(),
73        }
74    }
75
76    /// Sets the field delimiter (default: comma).
77    pub fn delimiter(mut self, delim: u8) -> Self {
78        self.delimiter = delim;
79        self
80    }
81
82    /// Sets whether the first row is a header (default: true).
83    pub fn has_header(mut self, header: bool) -> Self {
84        self.has_header = header;
85        self
86    }
87
88    /// Sets custom null markers (replaces defaults).
89    pub fn null_markers(mut self, markers: Vec<String>) -> Self {
90        self.null_markers = markers;
91        self
92    }
93
94    /// Parses a CSV string into a DataFrame.
95    pub fn parse_str(&self, input: &str) -> Result<DataFrame, InsightError> {
96        // Strip BOM if present
97        let input = input.strip_prefix('\u{feff}').unwrap_or(input);
98
99        // Parse into raw rows
100        let raw_rows = self.parse_raw(input)?;
101        if raw_rows.is_empty() {
102            return Ok(DataFrame::new());
103        }
104
105        // Extract header
106        let (headers, data_rows) = if self.has_header {
107            if raw_rows.is_empty() {
108                return Ok(DataFrame::new());
109            }
110            let headers: Vec<String> = raw_rows[0].clone();
111            (headers, &raw_rows[1..])
112        } else {
113            let n_cols = raw_rows[0].len();
114            let headers: Vec<String> = (0..n_cols).map(|i| format!("col_{i}")).collect();
115            (headers, &raw_rows[..])
116        };
117
118        if data_rows.is_empty() {
119            return Ok(DataFrame::new());
120        }
121
122        let n_cols = headers.len();
123        let n_rows = data_rows.len();
124
125        // Transpose to column-major raw strings
126        let mut raw_columns: Vec<Vec<String>> = vec![Vec::with_capacity(n_rows); n_cols];
127        for (line_idx, row) in data_rows.iter().enumerate() {
128            if row.len() != n_cols {
129                return Err(InsightError::CsvParse {
130                    line: if self.has_header {
131                        line_idx + 2
132                    } else {
133                        line_idx + 1
134                    },
135                    message: format!("expected {n_cols} fields, got {}", row.len()),
136                });
137            }
138            for (col_idx, field) in row.iter().enumerate() {
139                raw_columns[col_idx].push(field.clone());
140            }
141        }
142
143        // Infer types and build columns
144        let mut df = DataFrame::new();
145        for (col_idx, raw_col) in raw_columns.iter().enumerate() {
146            let col = self.build_column(raw_col);
147            df.add_column(headers[col_idx].clone(), col)
148                .expect("all columns same length");
149        }
150
151        Ok(df)
152    }
153
154    /// Parses a CSV file from disk into a DataFrame.
155    pub fn parse_file(&self, path: &str) -> Result<DataFrame, InsightError> {
156        let content = std::fs::read_to_string(path)?;
157        self.parse_str(&content)
158    }
159
160    // ── Internal parsing ─────────────────────────────────────────
161
162    /// Parses raw CSV text into rows of string fields.
163    fn parse_raw(&self, input: &str) -> Result<Vec<Vec<String>>, InsightError> {
164        let delim = self.delimiter as char;
165        let mut rows: Vec<Vec<String>> = Vec::new();
166        let mut current_row: Vec<String> = Vec::new();
167        let mut current_field = String::new();
168        let mut in_quotes = false;
169        let mut chars = input.chars().peekable();
170        let mut _line_num: usize = 1;
171
172        while let Some(c) = chars.next() {
173            if in_quotes {
174                if c == '"' {
175                    if chars.peek() == Some(&'"') {
176                        // Escaped quote ""
177                        chars.next();
178                        current_field.push('"');
179                    } else {
180                        // End of quoted field
181                        in_quotes = false;
182                    }
183                } else {
184                    if c == '\n' {
185                        _line_num += 1;
186                    }
187                    current_field.push(c);
188                }
189            } else if c == '"' && current_field.is_empty() {
190                in_quotes = true;
191            } else if c == delim {
192                current_row.push(std::mem::take(&mut current_field));
193            } else if c == '\n' {
194                // Handle \r\n: strip trailing \r from field
195                let field = if current_field.ends_with('\r') {
196                    current_field.truncate(current_field.len() - 1);
197                    std::mem::take(&mut current_field)
198                } else {
199                    std::mem::take(&mut current_field)
200                };
201                current_row.push(field);
202                if !current_row.iter().all(|f| f.is_empty()) || !rows.is_empty() {
203                    rows.push(std::mem::take(&mut current_row));
204                } else {
205                    current_row.clear();
206                }
207                _line_num += 1;
208            } else if c == '\r' {
209                // Standalone \r (old Mac style) - treat as newline
210                if chars.peek() != Some(&'\n') {
211                    current_row.push(std::mem::take(&mut current_field));
212                    if !current_row.iter().all(|f| f.is_empty()) || !rows.is_empty() {
213                        rows.push(std::mem::take(&mut current_row));
214                    } else {
215                        current_row.clear();
216                    }
217                    _line_num += 1;
218                }
219                // If \r\n, the \r is just ignored; \n handles the newline
220            } else {
221                current_field.push(c);
222            }
223        }
224
225        // Handle last field/row (no trailing newline)
226        if !current_field.is_empty() || !current_row.is_empty() {
227            current_row.push(current_field);
228            rows.push(current_row);
229        }
230
231        // Remove trailing empty rows
232        while rows.last().is_some_and(|r| r.iter().all(|f| f.is_empty())) {
233            rows.pop();
234        }
235
236        Ok(rows)
237    }
238
239    /// Checks if a trimmed value is a null marker.
240    fn is_null(&self, value: &str) -> bool {
241        let trimmed = value.trim();
242        self.null_markers.iter().any(|m| m == trimmed)
243    }
244
245    /// Infers the column type and builds a typed Column.
246    fn build_column(&self, raw_values: &[String]) -> Column {
247        let n = raw_values.len();
248        let trimmed: Vec<&str> = raw_values.iter().map(|s| s.trim()).collect();
249        let null_flags: Vec<bool> = trimmed.iter().map(|s| self.is_null(s)).collect();
250
251        // Count non-null values
252        let non_null_count = null_flags.iter().filter(|&&is_null| !is_null).count();
253        if non_null_count == 0 {
254            // All null: default to numeric
255            return Column::numeric(vec![0.0; n], ValidityBitmap::all_invalid(n));
256        }
257
258        // Try numeric
259        let inferred = self.try_infer_type(&trimmed, &null_flags);
260
261        match inferred {
262            DataType::Numeric => self.build_numeric_column(&trimmed, &null_flags),
263            DataType::Boolean => self.build_boolean_column(&trimmed, &null_flags),
264            DataType::Categorical => self.build_categorical_column(&trimmed, &null_flags),
265            DataType::Text => self.build_text_column(&trimmed, &null_flags),
266        }
267    }
268
269    /// Determines the most specific type that fits all non-null values.
270    fn try_infer_type(&self, values: &[&str], null_flags: &[bool]) -> DataType {
271        let non_null: Vec<&str> = values
272            .iter()
273            .zip(null_flags.iter())
274            .filter(|(_, &is_null)| !is_null)
275            .map(|(&v, _)| v)
276            .collect();
277
278        // Try numeric
279        if non_null.iter().all(|s| s.parse::<f64>().is_ok()) {
280            return DataType::Numeric;
281        }
282
283        // Try boolean
284        if non_null.iter().all(|s| is_boolean_str(s)) {
285            return DataType::Boolean;
286        }
287
288        // Categorical vs Text: based on cardinality
289        let mut unique = std::collections::HashSet::new();
290        for &v in &non_null {
291            unique.insert(v);
292        }
293        let ratio = unique.len() as f64 / non_null.len() as f64;
294        if ratio < CATEGORICAL_THRESHOLD && unique.len() <= MAX_CATEGORICAL_UNIQUE {
295            DataType::Categorical
296        } else {
297            DataType::Text
298        }
299    }
300
301    fn build_numeric_column(&self, values: &[&str], null_flags: &[bool]) -> Column {
302        let n = values.len();
303        let mut nums = Vec::with_capacity(n);
304        let mut validity = ValidityBitmap::empty();
305
306        for (i, &val) in values.iter().enumerate() {
307            if null_flags[i] {
308                nums.push(0.0);
309                validity.push(false);
310            } else {
311                nums.push(val.parse::<f64>().unwrap_or(0.0));
312                validity.push(true);
313            }
314        }
315
316        Column::numeric(nums, validity)
317    }
318
319    fn build_boolean_column(&self, values: &[&str], null_flags: &[bool]) -> Column {
320        let n = values.len();
321        let mut bools = Vec::with_capacity(n);
322        let mut validity = ValidityBitmap::empty();
323
324        for (i, &val) in values.iter().enumerate() {
325            if null_flags[i] {
326                bools.push(false);
327                validity.push(false);
328            } else {
329                bools.push(parse_boolean_str(val));
330                validity.push(true);
331            }
332        }
333
334        Column::boolean(bools, validity)
335    }
336
337    fn build_categorical_column(&self, values: &[&str], null_flags: &[bool]) -> Column {
338        let n = values.len();
339        let mut dict_map: HashMap<String, u32> = HashMap::new();
340        let mut dictionary: Vec<String> = Vec::new();
341        let mut indices = Vec::with_capacity(n);
342        let mut validity = ValidityBitmap::empty();
343
344        for (i, &val) in values.iter().enumerate() {
345            if null_flags[i] {
346                indices.push(0);
347                validity.push(false);
348            } else {
349                let idx = if let Some(&existing) = dict_map.get(val) {
350                    existing
351                } else {
352                    let idx = dictionary.len() as u32;
353                    dictionary.push(val.to_string());
354                    dict_map.insert(val.to_string(), idx);
355                    idx
356                };
357                indices.push(idx);
358                validity.push(true);
359            }
360        }
361
362        Column::categorical(dictionary, indices, validity)
363    }
364
365    fn build_text_column(&self, values: &[&str], null_flags: &[bool]) -> Column {
366        let n = values.len();
367        let mut texts = Vec::with_capacity(n);
368        let mut validity = ValidityBitmap::empty();
369
370        for (i, &val) in values.iter().enumerate() {
371            if null_flags[i] {
372                texts.push(String::new());
373                validity.push(false);
374            } else {
375                texts.push(val.to_string());
376                validity.push(true);
377            }
378        }
379
380        Column::text(texts, validity)
381    }
382}
383
384impl Default for CsvParser {
385    fn default() -> Self {
386        Self::new()
387    }
388}
389
390// ── Helper functions ──────────────────────────────────────────────────
391
/// Returns `true` if `s` is one of the recognised boolean words, ignoring
/// ASCII case: `true`, `false`, `yes`, `no`, `t`, `f`, `y`, `n`.
///
/// The comparison is done in place, without allocating a lowercased copy.
/// No trimming is performed, so surrounding whitespace makes the check fail.
fn is_boolean_str(s: &str) -> bool {
    const BOOLEAN_WORDS: [&str; 8] = ["true", "false", "yes", "no", "t", "f", "y", "n"];
    BOOLEAN_WORDS.iter().any(|word| s.eq_ignore_ascii_case(word))
}
399
/// Decodes a boolean word: `true`, `yes`, `t` and `y` (ignoring ASCII case)
/// give `true`; every other input, including unrecognised text, gives `false`.
///
/// The comparison is done in place, without allocating a lowercased copy.
fn parse_boolean_str(s: &str) -> bool {
    const TRUTHY_WORDS: [&str; 4] = ["true", "yes", "t", "y"];
    TRUTHY_WORDS.iter().any(|word| s.eq_ignore_ascii_case(word))
}
404
405// ── Tests ─────────────────────────────────────────────────────────────
406
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the default-configured parser over `csv`, panicking if it fails.
    fn parse(csv: &str) -> DataFrame {
        CsvParser::new().parse_str(csv).unwrap()
    }

    // ── Basic CSV parsing ────────────────────────────────────────

    #[test]
    fn parse_simple_csv() {
        let frame = parse("a,b,c\n1,2,3\n4,5,6\n");
        assert_eq!(frame.row_count(), 2);
        assert_eq!(frame.column_count(), 3);
        assert_eq!(frame.column_names(), &["a", "b", "c"]);
    }

    #[test]
    fn parse_numeric_columns() {
        let frame = parse("x,y\n1.5,2.7\n3.1,-4.2\n0,100\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Numeric);
        assert_eq!(col.as_numeric().unwrap(), &[1.5, 3.1, 0.0]);
    }

    #[test]
    fn parse_boolean_column() {
        let frame = parse("flag\ntrue\nfalse\nyes\nno\n");
        let col = frame.column_by_name("flag").unwrap();
        assert_eq!(col.data_type(), DataType::Boolean);
        assert_eq!(col.as_boolean().unwrap(), &[true, false, true, false]);
    }

    #[test]
    fn parse_categorical_column() {
        // Three distinct values over seven rows: 3/7 ≈ 0.43, below the 0.5 cut-off.
        let frame = parse("status\nA\nB\nC\nA\nB\nA\nC\n");
        let col = frame.column_by_name("status").unwrap();
        assert_eq!(col.data_type(), DataType::Categorical);
        assert_eq!(col.category_at(0), Some("A"));
        assert_eq!(col.category_at(2), Some("C"));
        assert_eq!(col.category_at(5), Some("A"));
    }

    #[test]
    fn parse_text_column() {
        // Every value is distinct, so the column stays free text.
        let frame = parse("name\nAlice\nBob\nCharlie\nDave\nEve\n");
        let col = frame.column_by_name("name").unwrap();
        assert_eq!(col.data_type(), DataType::Text);
        assert_eq!(col.text_at(0), Some("Alice"));
    }

    #[test]
    fn parse_mixed_types() {
        // `category` has two distinct values over five rows: 0.4, so categorical.
        let frame = parse(
            "id,value,active,category\n1,10.5,true,A\n2,20.3,false,B\n3,30.1,true,A\n4,40.0,false,B\n5,50.5,true,A\n",
        );
        let expectations = [
            ("id", DataType::Numeric),
            ("value", DataType::Numeric),
            ("active", DataType::Boolean),
            ("category", DataType::Categorical),
        ];
        for (name, expected) in expectations {
            assert_eq!(frame.column_by_name(name).unwrap().data_type(), expected);
        }
    }

    // ── Null handling ────────────────────────────────────────────

    #[test]
    fn parse_null_markers() {
        let frame = parse("x\n1.0\nNA\n3.0\n\n5.0\nnull\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Numeric);
        // Nulls come from `NA`, the blank line and `null`.
        assert_eq!(col.null_count(), 3);
        assert!(col.is_valid(0));
        assert!(!col.is_valid(1));
        assert!(col.is_valid(2));
        assert!(!col.is_valid(3));
        assert!(col.is_valid(4));
        assert!(!col.is_valid(5));
    }

    #[test]
    fn all_null_column() {
        let frame = parse("x\nNA\n\nnull\n");
        let col = frame.column_by_name("x").unwrap();
        // A column with no non-null value falls back to numeric.
        assert_eq!(col.data_type(), DataType::Numeric);
        assert_eq!(col.null_count(), 3);
    }

    #[test]
    fn nan_marker_as_null() {
        let frame = parse("x\n1.0\nNaN\n3.0\n");
        let col = frame.column_by_name("x").unwrap();
        // `NaN` is a null marker, not a numeric value.
        assert_eq!(col.null_count(), 1);
        assert!(!col.is_valid(1));
    }

    // ── Quoted fields ────────────────────────────────────────────

    #[test]
    fn parse_quoted_fields() {
        let frame = parse("name,desc\nAlice,\"hello, world\"\nBob,\"she said \"\"hi\"\"\"\n");
        let col = frame.column_by_name("desc").unwrap();
        assert_eq!(col.text_at(0), Some("hello, world"));
        assert_eq!(col.text_at(1), Some("she said \"hi\""));
    }

    #[test]
    fn parse_quoted_newlines() {
        let frame = parse("name,note\nAlice,\"line1\nline2\"\nBob,simple\n");
        assert_eq!(frame.row_count(), 2);
        let col = frame.column_by_name("note").unwrap();
        assert_eq!(col.text_at(0), Some("line1\nline2"));
        assert_eq!(col.text_at(1), Some("simple"));
    }

    // ── Edge cases ───────────────────────────────────────────────

    #[test]
    fn parse_crlf_line_endings() {
        let frame = parse("a,b\r\n1,2\r\n3,4\r\n");
        assert_eq!(frame.row_count(), 2);
        let col = frame.column_by_name("a").unwrap();
        assert_eq!(col.as_numeric().unwrap(), &[1.0, 3.0]);
    }

    #[test]
    fn parse_no_trailing_newline() {
        let frame = parse("x\n1\n2\n3");
        assert_eq!(frame.row_count(), 3);
    }

    #[test]
    fn parse_bom() {
        let frame = parse("\u{feff}x,y\n1,2\n");
        assert_eq!(frame.column_names(), &["x", "y"]);
    }

    #[test]
    fn parse_empty_csv() {
        let frame = parse("");
        assert_eq!(frame.row_count(), 0);
        assert_eq!(frame.column_count(), 0);
    }

    #[test]
    fn parse_header_only() {
        let frame = parse("a,b,c\n");
        assert_eq!(frame.row_count(), 0);
        assert_eq!(frame.column_count(), 0);
    }

    #[test]
    fn parse_column_count_mismatch_error() {
        let outcome = CsvParser::new().parse_str("a,b\n1,2\n3\n");
        assert!(outcome.is_err());
    }

    #[test]
    fn parse_without_header() {
        let frame = CsvParser::new()
            .has_header(false)
            .parse_str("1,2\n3,4\n")
            .unwrap();
        assert_eq!(frame.row_count(), 2);
        assert_eq!(frame.column_names(), &["col_0", "col_1"]);
    }

    #[test]
    fn parse_tab_delimiter() {
        let frame = CsvParser::new()
            .delimiter(b'\t')
            .parse_str("a\tb\n1\t2\n3\t4\n")
            .unwrap();
        assert_eq!(frame.row_count(), 2);
        assert_eq!(frame.column_names(), &["a", "b"]);
    }

    #[test]
    fn parse_semicolon_delimiter() {
        let frame = CsvParser::new()
            .delimiter(b';')
            .parse_str("a;b\n1;2\n3;4\n")
            .unwrap();
        assert_eq!(frame.row_count(), 2);
    }

    // ── Type inference edge cases ────────────────────────────────

    #[test]
    fn numeric_with_leading_spaces() {
        let frame = parse("x\n  1.5  \n  2.3  \n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Numeric);
        assert_eq!(col.as_numeric().unwrap(), &[1.5, 2.3]);
    }

    #[test]
    fn single_non_numeric_demotes_to_text() {
        let frame = parse("x\n1\n2\nthree\n4\n");
        let col = frame.column_by_name("x").unwrap();
        // "three" does not parse as f64, so the column cannot be numeric.
        assert_ne!(col.data_type(), DataType::Numeric);
    }

    #[test]
    fn categorical_vs_text_threshold() {
        // Two distinct values over four rows is exactly 0.5. The comparison is
        // strict (`< 0.5`), so the column is Text rather than Categorical.
        let frame = parse("x\nA\nB\nA\nB\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Text);
    }

    #[test]
    fn categorical_below_threshold() {
        // Two distinct values over five rows: 0.4, so categorical.
        let frame = parse("x\nA\nB\nA\nB\nA\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Categorical);
    }

    #[test]
    fn boolean_mixed_formats() {
        let frame = parse("x\ntrue\nFalse\nYes\nno\nT\nf\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Boolean);
        assert_eq!(
            col.as_boolean().unwrap(),
            &[true, false, true, false, true, false]
        );
    }

    #[test]
    fn boolean_with_nulls() {
        let frame = parse("x\ntrue\nNA\nfalse\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Boolean);
        assert_eq!(col.null_count(), 1);
        assert!(!col.is_valid(1));
    }

    #[test]
    fn negative_and_scientific_notation() {
        let frame = parse("x\n-1.5\n2.3e10\n-4.5E-3\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Numeric);
        let nums = col.as_numeric().unwrap();
        assert_eq!(nums[0], -1.5);
        assert!((nums[1] - 2.3e10).abs() < 1.0);
        assert!((nums[2] - (-4.5e-3)).abs() < 1e-10);
    }

    // ── Custom null markers ──────────────────────────────────────

    #[test]
    fn custom_null_markers() {
        let frame = CsvParser::new()
            .null_markers(vec!["-999".to_string()])
            .parse_str("x\n1.0\n-999\n3.0\n")
            .unwrap();
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.null_count(), 1);
        assert!(!col.is_valid(1));
    }
}