1use crate::dataframe::{Column, DataFrame, DataType, ValidityBitmap};
31use crate::error::InsightError;
32use std::collections::HashMap;
33
/// Field values that a parser created with `CsvParser::new` treats as missing data.
///
/// Matching is exact and case-sensitive, which is why several casings of the
/// same marker are listed. The empty string is included, so empty fields are
/// null by default.
const DEFAULT_NULL_MARKERS: &[&str] = &[
    "", "NA", "N/A", "na", "n/a", "null", "NULL", "None", "none", ".", "NaN", "nan", "NAN", "#N/A",
    "#NA",
];

/// Type inference treats a non-numeric, non-boolean column as categorical only
/// when its ratio of distinct values to non-null values is strictly below this.
const CATEGORICAL_THRESHOLD: f64 = 0.5;

/// Largest number of distinct values a column may have and still be inferred
/// as categorical.
const MAX_CATEGORICAL_UNIQUE: usize = 1000;
46
/// Configurable CSV reader that turns delimited text into a [`DataFrame`].
///
/// Configuration uses consuming builder methods, e.g.
/// `CsvParser::new().delimiter(b';').has_header(false)`.
#[derive(Debug, Clone)]
pub struct CsvParser {
    /// Byte separating the fields of a record (`b','` by default).
    delimiter: u8,
    /// Whether the first record supplies the column names (`true` by default).
    has_header: bool,
    /// Exact (case-sensitive) field values, compared after trimming the field,
    /// that are read as null.
    null_markers: Vec<String>,
}
62
63impl CsvParser {
64 pub fn new() -> Self {
66 Self {
67 delimiter: b',',
68 has_header: true,
69 null_markers: DEFAULT_NULL_MARKERS
70 .iter()
71 .map(|s| (*s).to_string())
72 .collect(),
73 }
74 }
75
76 pub fn delimiter(mut self, delim: u8) -> Self {
78 self.delimiter = delim;
79 self
80 }
81
82 pub fn has_header(mut self, header: bool) -> Self {
84 self.has_header = header;
85 self
86 }
87
88 pub fn null_markers(mut self, markers: Vec<String>) -> Self {
90 self.null_markers = markers;
91 self
92 }
93
94 pub fn parse_str(&self, input: &str) -> Result<DataFrame, InsightError> {
96 let input = input.strip_prefix('\u{feff}').unwrap_or(input);
98
99 let raw_rows = self.parse_raw(input)?;
101 if raw_rows.is_empty() {
102 return Ok(DataFrame::new());
103 }
104
105 let (headers, data_rows) = if self.has_header {
107 if raw_rows.is_empty() {
108 return Ok(DataFrame::new());
109 }
110 let headers: Vec<String> = raw_rows[0].clone();
111 (headers, &raw_rows[1..])
112 } else {
113 let n_cols = raw_rows[0].len();
114 let headers: Vec<String> = (0..n_cols).map(|i| format!("col_{i}")).collect();
115 (headers, &raw_rows[..])
116 };
117
118 if data_rows.is_empty() {
119 return Ok(DataFrame::new());
120 }
121
122 let n_cols = headers.len();
123 let n_rows = data_rows.len();
124
125 let mut raw_columns: Vec<Vec<String>> = vec![Vec::with_capacity(n_rows); n_cols];
127 for (line_idx, row) in data_rows.iter().enumerate() {
128 if row.len() != n_cols {
129 return Err(InsightError::CsvParse {
130 line: if self.has_header {
131 line_idx + 2
132 } else {
133 line_idx + 1
134 },
135 message: format!("expected {n_cols} fields, got {}", row.len()),
136 });
137 }
138 for (col_idx, field) in row.iter().enumerate() {
139 raw_columns[col_idx].push(field.clone());
140 }
141 }
142
143 let mut df = DataFrame::new();
145 for (col_idx, raw_col) in raw_columns.iter().enumerate() {
146 let col = self.build_column(raw_col);
147 df.add_column(headers[col_idx].clone(), col)
148 .expect("all columns same length");
149 }
150
151 Ok(df)
152 }
153
154 pub fn parse_file(&self, path: &str) -> Result<DataFrame, InsightError> {
156 let content = std::fs::read_to_string(path)?;
157 self.parse_str(&content)
158 }
159
160 fn parse_raw(&self, input: &str) -> Result<Vec<Vec<String>>, InsightError> {
164 let delim = self.delimiter as char;
165 let mut rows: Vec<Vec<String>> = Vec::new();
166 let mut current_row: Vec<String> = Vec::new();
167 let mut current_field = String::new();
168 let mut in_quotes = false;
169 let mut chars = input.chars().peekable();
170 let mut _line_num: usize = 1;
171
172 while let Some(c) = chars.next() {
173 if in_quotes {
174 if c == '"' {
175 if chars.peek() == Some(&'"') {
176 chars.next();
178 current_field.push('"');
179 } else {
180 in_quotes = false;
182 }
183 } else {
184 if c == '\n' {
185 _line_num += 1;
186 }
187 current_field.push(c);
188 }
189 } else if c == '"' && current_field.is_empty() {
190 in_quotes = true;
191 } else if c == delim {
192 current_row.push(std::mem::take(&mut current_field));
193 } else if c == '\n' {
194 let field = if current_field.ends_with('\r') {
196 current_field.truncate(current_field.len() - 1);
197 std::mem::take(&mut current_field)
198 } else {
199 std::mem::take(&mut current_field)
200 };
201 current_row.push(field);
202 if !current_row.iter().all(|f| f.is_empty()) || !rows.is_empty() {
203 rows.push(std::mem::take(&mut current_row));
204 } else {
205 current_row.clear();
206 }
207 _line_num += 1;
208 } else if c == '\r' {
209 if chars.peek() != Some(&'\n') {
211 current_row.push(std::mem::take(&mut current_field));
212 if !current_row.iter().all(|f| f.is_empty()) || !rows.is_empty() {
213 rows.push(std::mem::take(&mut current_row));
214 } else {
215 current_row.clear();
216 }
217 _line_num += 1;
218 }
219 } else {
221 current_field.push(c);
222 }
223 }
224
225 if !current_field.is_empty() || !current_row.is_empty() {
227 current_row.push(current_field);
228 rows.push(current_row);
229 }
230
231 while rows.last().is_some_and(|r| r.iter().all(|f| f.is_empty())) {
233 rows.pop();
234 }
235
236 Ok(rows)
237 }
238
239 fn is_null(&self, value: &str) -> bool {
241 let trimmed = value.trim();
242 self.null_markers.iter().any(|m| m == trimmed)
243 }
244
245 fn build_column(&self, raw_values: &[String]) -> Column {
247 let n = raw_values.len();
248 let trimmed: Vec<&str> = raw_values.iter().map(|s| s.trim()).collect();
249 let null_flags: Vec<bool> = trimmed.iter().map(|s| self.is_null(s)).collect();
250
251 let non_null_count = null_flags.iter().filter(|&&is_null| !is_null).count();
253 if non_null_count == 0 {
254 return Column::numeric(vec![0.0; n], ValidityBitmap::all_invalid(n));
256 }
257
258 let inferred = self.try_infer_type(&trimmed, &null_flags);
260
261 match inferred {
262 DataType::Numeric => self.build_numeric_column(&trimmed, &null_flags),
263 DataType::Boolean => self.build_boolean_column(&trimmed, &null_flags),
264 DataType::Categorical => self.build_categorical_column(&trimmed, &null_flags),
265 DataType::Text => self.build_text_column(&trimmed, &null_flags),
266 }
267 }
268
269 fn try_infer_type(&self, values: &[&str], null_flags: &[bool]) -> DataType {
271 let non_null: Vec<&str> = values
272 .iter()
273 .zip(null_flags.iter())
274 .filter(|(_, &is_null)| !is_null)
275 .map(|(&v, _)| v)
276 .collect();
277
278 if non_null.iter().all(|s| s.parse::<f64>().is_ok()) {
280 return DataType::Numeric;
281 }
282
283 if non_null.iter().all(|s| is_boolean_str(s)) {
285 return DataType::Boolean;
286 }
287
288 let mut unique = std::collections::HashSet::new();
290 for &v in &non_null {
291 unique.insert(v);
292 }
293 let ratio = unique.len() as f64 / non_null.len() as f64;
294 if ratio < CATEGORICAL_THRESHOLD && unique.len() <= MAX_CATEGORICAL_UNIQUE {
295 DataType::Categorical
296 } else {
297 DataType::Text
298 }
299 }
300
301 fn build_numeric_column(&self, values: &[&str], null_flags: &[bool]) -> Column {
302 let n = values.len();
303 let mut nums = Vec::with_capacity(n);
304 let mut validity = ValidityBitmap::empty();
305
306 for (i, &val) in values.iter().enumerate() {
307 if null_flags[i] {
308 nums.push(0.0);
309 validity.push(false);
310 } else {
311 nums.push(val.parse::<f64>().unwrap_or(0.0));
312 validity.push(true);
313 }
314 }
315
316 Column::numeric(nums, validity)
317 }
318
319 fn build_boolean_column(&self, values: &[&str], null_flags: &[bool]) -> Column {
320 let n = values.len();
321 let mut bools = Vec::with_capacity(n);
322 let mut validity = ValidityBitmap::empty();
323
324 for (i, &val) in values.iter().enumerate() {
325 if null_flags[i] {
326 bools.push(false);
327 validity.push(false);
328 } else {
329 bools.push(parse_boolean_str(val));
330 validity.push(true);
331 }
332 }
333
334 Column::boolean(bools, validity)
335 }
336
337 fn build_categorical_column(&self, values: &[&str], null_flags: &[bool]) -> Column {
338 let n = values.len();
339 let mut dict_map: HashMap<String, u32> = HashMap::new();
340 let mut dictionary: Vec<String> = Vec::new();
341 let mut indices = Vec::with_capacity(n);
342 let mut validity = ValidityBitmap::empty();
343
344 for (i, &val) in values.iter().enumerate() {
345 if null_flags[i] {
346 indices.push(0);
347 validity.push(false);
348 } else {
349 let idx = if let Some(&existing) = dict_map.get(val) {
350 existing
351 } else {
352 let idx = dictionary.len() as u32;
353 dictionary.push(val.to_string());
354 dict_map.insert(val.to_string(), idx);
355 idx
356 };
357 indices.push(idx);
358 validity.push(true);
359 }
360 }
361
362 Column::categorical(dictionary, indices, validity)
363 }
364
365 fn build_text_column(&self, values: &[&str], null_flags: &[bool]) -> Column {
366 let n = values.len();
367 let mut texts = Vec::with_capacity(n);
368 let mut validity = ValidityBitmap::empty();
369
370 for (i, &val) in values.iter().enumerate() {
371 if null_flags[i] {
372 texts.push(String::new());
373 validity.push(false);
374 } else {
375 texts.push(val.to_string());
376 validity.push(true);
377 }
378 }
379
380 Column::text(texts, validity)
381 }
382}
383
384impl Default for CsvParser {
385 fn default() -> Self {
386 Self::new()
387 }
388}
389
/// Returns `true` if `s` is one of the recognised boolean spellings
/// (`true`/`false`, `yes`/`no`, `t`/`f`, `y`/`n`), compared case-insensitively.
///
/// The input is not trimmed; callers pass values that are already trimmed.
/// The comparison is done in place, so no temporary lowercase `String` is
/// allocated per value.
fn is_boolean_str(s: &str) -> bool {
    const BOOLEAN_WORDS: [&str; 8] = ["true", "false", "yes", "no", "t", "f", "y", "n"];
    BOOLEAN_WORDS
        .iter()
        .any(|word| s.eq_ignore_ascii_case(word))
}
399
/// Converts a boolean spelling to its value: `true`, `yes`, `t` and `y`
/// (case-insensitive) give `true`; every other input gives `false`.
///
/// No validation happens here: unrecognised input is indistinguishable from
/// a "false" word, so callers are expected to check with `is_boolean_str`
/// first. The comparison is done in place without allocating.
fn parse_boolean_str(s: &str) -> bool {
    const TRUE_WORDS: [&str; 4] = ["true", "yes", "t", "y"];
    TRUE_WORDS.iter().any(|word| s.eq_ignore_ascii_case(word))
}
404
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a default-configured parser over `input`, panicking on a parse error.
    fn parse_default(input: &str) -> DataFrame {
        CsvParser::new().parse_str(input).unwrap()
    }

    // --- basic shape -------------------------------------------------------

    #[test]
    fn parse_simple_csv() {
        let frame = parse_default("a,b,c\n1,2,3\n4,5,6\n");
        assert_eq!(frame.row_count(), 2);
        assert_eq!(frame.column_count(), 3);
        assert_eq!(frame.column_names(), &["a", "b", "c"]);
    }

    #[test]
    fn parse_numeric_columns() {
        let frame = parse_default("x,y\n1.5,2.7\n3.1,-4.2\n0,100\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Numeric);
        assert_eq!(col.as_numeric().unwrap(), &[1.5, 3.1, 0.0]);
    }

    #[test]
    fn parse_boolean_column() {
        let frame = parse_default("flag\ntrue\nfalse\nyes\nno\n");
        let col = frame.column_by_name("flag").unwrap();
        assert_eq!(col.data_type(), DataType::Boolean);
        assert_eq!(col.as_boolean().unwrap(), &[true, false, true, false]);
    }

    #[test]
    fn parse_categorical_column() {
        // Three distinct values over seven rows keeps the ratio below 0.5.
        let frame = parse_default("status\nA\nB\nC\nA\nB\nA\nC\n");
        let col = frame.column_by_name("status").unwrap();
        assert_eq!(col.data_type(), DataType::Categorical);
        assert_eq!(col.category_at(0), Some("A"));
        assert_eq!(col.category_at(2), Some("C"));
        assert_eq!(col.category_at(5), Some("A"));
    }

    #[test]
    fn parse_text_column() {
        // Every value is distinct, so the column cannot be categorical.
        let frame = parse_default("name\nAlice\nBob\nCharlie\nDave\nEve\n");
        let col = frame.column_by_name("name").unwrap();
        assert_eq!(col.data_type(), DataType::Text);
        assert_eq!(col.text_at(0), Some("Alice"));
    }

    #[test]
    fn parse_mixed_types() {
        let frame = parse_default(
            "id,value,active,category\n1,10.5,true,A\n2,20.3,false,B\n3,30.1,true,A\n4,40.0,false,B\n5,50.5,true,A\n",
        );
        let expectations = [
            ("id", DataType::Numeric),
            ("value", DataType::Numeric),
            ("active", DataType::Boolean),
            ("category", DataType::Categorical),
        ];
        for (name, expected) in expectations {
            assert_eq!(frame.column_by_name(name).unwrap().data_type(), expected);
        }
    }

    // --- null handling -----------------------------------------------------

    #[test]
    fn parse_null_markers() {
        let frame = parse_default("x\n1.0\nNA\n3.0\n\n5.0\nnull\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Numeric);
        assert_eq!(col.null_count(), 3);
        assert!(col.is_valid(0));
        assert!(!col.is_valid(1));
        assert!(col.is_valid(2));
        assert!(!col.is_valid(3));
        assert!(col.is_valid(4));
        assert!(!col.is_valid(5));
    }

    #[test]
    fn all_null_column() {
        let frame = parse_default("x\nNA\n\nnull\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Numeric);
        assert_eq!(col.null_count(), 3);
    }

    #[test]
    fn nan_marker_as_null() {
        let frame = parse_default("x\n1.0\nNaN\n3.0\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.null_count(), 1);
        assert!(!col.is_valid(1));
    }

    // --- quoting -----------------------------------------------------------

    #[test]
    fn parse_quoted_fields() {
        let frame =
            parse_default("name,desc\nAlice,\"hello, world\"\nBob,\"she said \"\"hi\"\"\"\n");
        let col = frame.column_by_name("desc").unwrap();
        assert_eq!(col.text_at(0), Some("hello, world"));
        assert_eq!(col.text_at(1), Some("she said \"hi\""));
    }

    #[test]
    fn parse_quoted_newlines() {
        let frame = parse_default("name,note\nAlice,\"line1\nline2\"\nBob,simple\n");
        assert_eq!(frame.row_count(), 2);
        let col = frame.column_by_name("note").unwrap();
        assert_eq!(col.text_at(0), Some("line1\nline2"));
        assert_eq!(col.text_at(1), Some("simple"));
    }

    // --- line endings and edge cases ---------------------------------------

    #[test]
    fn parse_crlf_line_endings() {
        let frame = parse_default("a,b\r\n1,2\r\n3,4\r\n");
        assert_eq!(frame.row_count(), 2);
        let col = frame.column_by_name("a").unwrap();
        assert_eq!(col.as_numeric().unwrap(), &[1.0, 3.0]);
    }

    #[test]
    fn parse_no_trailing_newline() {
        let frame = parse_default("x\n1\n2\n3");
        assert_eq!(frame.row_count(), 3);
    }

    #[test]
    fn parse_bom() {
        let frame = parse_default("\u{feff}x,y\n1,2\n");
        assert_eq!(frame.column_names(), &["x", "y"]);
    }

    #[test]
    fn parse_empty_csv() {
        let frame = parse_default("");
        assert_eq!(frame.row_count(), 0);
        assert_eq!(frame.column_count(), 0);
    }

    #[test]
    fn parse_header_only() {
        let frame = parse_default("a,b,c\n");
        assert_eq!(frame.row_count(), 0);
        assert_eq!(frame.column_count(), 0);
    }

    #[test]
    fn parse_column_count_mismatch_error() {
        let outcome = CsvParser::new().parse_str("a,b\n1,2\n3\n");
        assert!(outcome.is_err());
    }

    #[test]
    fn parse_without_header() {
        let parser = CsvParser::new().has_header(false);
        let frame = parser.parse_str("1,2\n3,4\n").unwrap();
        assert_eq!(frame.row_count(), 2);
        assert_eq!(frame.column_names(), &["col_0", "col_1"]);
    }

    #[test]
    fn parse_tab_delimiter() {
        let parser = CsvParser::new().delimiter(b'\t');
        let frame = parser.parse_str("a\tb\n1\t2\n3\t4\n").unwrap();
        assert_eq!(frame.row_count(), 2);
        assert_eq!(frame.column_names(), &["a", "b"]);
    }

    #[test]
    fn parse_semicolon_delimiter() {
        let parser = CsvParser::new().delimiter(b';');
        let frame = parser.parse_str("a;b\n1;2\n3;4\n").unwrap();
        assert_eq!(frame.row_count(), 2);
    }

    // --- type inference ----------------------------------------------------

    #[test]
    fn numeric_with_leading_spaces() {
        let frame = parse_default("x\n 1.5 \n 2.3 \n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Numeric);
        assert_eq!(col.as_numeric().unwrap(), &[1.5, 2.3]);
    }

    #[test]
    fn single_non_numeric_demotes_to_text() {
        let frame = parse_default("x\n1\n2\nthree\n4\n");
        let col = frame.column_by_name("x").unwrap();
        assert_ne!(col.data_type(), DataType::Numeric);
    }

    #[test]
    fn categorical_vs_text_threshold() {
        // Two distinct values over four rows: ratio is exactly 0.5, which is
        // not strictly below the threshold.
        let frame = parse_default("x\nA\nB\nA\nB\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Text);
    }

    #[test]
    fn categorical_below_threshold() {
        // Two distinct values over five rows: ratio 0.4.
        let frame = parse_default("x\nA\nB\nA\nB\nA\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Categorical);
    }

    #[test]
    fn boolean_mixed_formats() {
        let frame = parse_default("x\ntrue\nFalse\nYes\nno\nT\nf\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Boolean);
        assert_eq!(
            col.as_boolean().unwrap(),
            &[true, false, true, false, true, false]
        );
    }

    #[test]
    fn boolean_with_nulls() {
        let frame = parse_default("x\ntrue\nNA\nfalse\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Boolean);
        assert_eq!(col.null_count(), 1);
        assert!(!col.is_valid(1));
    }

    #[test]
    fn negative_and_scientific_notation() {
        let frame = parse_default("x\n-1.5\n2.3e10\n-4.5E-3\n");
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.data_type(), DataType::Numeric);
        let nums = col.as_numeric().unwrap();
        assert_eq!(nums[0], -1.5);
        assert!((nums[1] - 2.3e10).abs() < 1.0);
        assert!((nums[2] - (-4.5e-3)).abs() < 1e-10);
    }

    // --- configuration -----------------------------------------------------

    #[test]
    fn custom_null_markers() {
        let parser = CsvParser::new().null_markers(vec!["-999".to_string()]);
        let frame = parser.parse_str("x\n1.0\n-999\n3.0\n").unwrap();
        let col = frame.column_by_name("x").unwrap();
        assert_eq!(col.null_count(), 1);
        assert!(!col.is_valid(1));
    }
}