hedl_csv/
from_csv.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Convert CSV files to HEDL documents.
19
20use crate::error::{CsvError, Result};
21use hedl_core::{Document, Item, MatrixList, Node, Value};
22use hedl_core::lex::parse_expression_token;
23use hedl_core::lex::parse_tensor;
24use std::io::Read;
25
/// Default maximum number of rows to prevent memory exhaustion.
///
/// This limit prevents denial-of-service attacks from maliciously large CSV
/// files. The default is 1 million rows, which allows processing reasonably
/// large datasets while preventing unbounded memory allocation.
///
/// # Security Considerations
///
/// - **Memory exhaustion**: Without a limit, attackers could provide CSV files
///   with billions of rows, causing the application to allocate excessive
///   memory and crash.
/// - **Configurable**: The limit can be adjusted via `FromCsvConfig::max_rows`
///   based on deployment context and available resources.
/// - **Trade-off**: Higher limits allow larger datasets but increase DoS risk.
///
/// # Examples
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// // Use default 1M row limit
/// let config = FromCsvConfig::default();
/// assert_eq!(config.max_rows, 1_000_000);
///
/// // Increase limit for large dataset processing
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // 10 million rows
///     ..Default::default()
/// };
/// ```
pub const DEFAULT_MAX_ROWS: usize = 1_000_000;
55
/// Configuration for CSV parsing.
///
/// Controls all aspects of CSV parsing behavior: delimiter, headers,
/// whitespace handling, security limits, schema inference, and custom list
/// naming.
///
/// # Examples
///
/// ## Default Configuration
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig::default();
/// assert_eq!(config.delimiter, b',');
/// assert!(config.has_headers);
/// assert!(config.trim);
/// assert_eq!(config.max_rows, 1_000_000);
/// assert_eq!(config.list_key, None);
/// ```
///
/// ## Tab-Delimited without Headers
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     delimiter: b'\t',
///     has_headers: false,
///     ..Default::default()
/// };
/// ```
///
/// ## Custom Row Limit for Large Datasets
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // Allow up to 10M rows
///     ..Default::default()
/// };
/// ```
///
/// ## Enable Schema Inference
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     infer_schema: true,
///     sample_rows: 200, // Sample first 200 rows
///     ..Default::default()
/// };
/// ```
///
/// ## Custom List Key for Irregular Plurals
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// // For "Person" type, use "people" instead of default "persons"
/// let config = FromCsvConfig {
///     list_key: Some("people".to_string()),
///     ..Default::default()
/// };
/// ```
#[derive(Debug, Clone)]
pub struct FromCsvConfig {
    /// Field delimiter character (default: `,`).
    ///
    /// Common alternatives:
    /// - `b'\t'` - Tab-separated values (TSV)
    /// - `b';'` - Semicolon-separated (common in European locales)
    /// - `b'|'` - Pipe-separated
    pub delimiter: u8,

    /// Whether the first row contains column headers (default: `true`).
    ///
    /// When `true`, the first row is interpreted as column names and not
    /// included in the data. When `false`, all rows are treated as data.
    pub has_headers: bool,

    /// Whether to trim leading/trailing whitespace from fields (default: `true`).
    ///
    /// When `true`, fields like `"  value  "` become `"value"`. This is
    /// generally recommended to handle inconsistently formatted CSV files.
    pub trim: bool,

    /// Maximum number of rows to parse (default: 1,000,000).
    ///
    /// This security limit prevents memory exhaustion from maliciously large
    /// CSV files. Processing stops with an error if more rows are encountered.
    ///
    /// # Security Impact
    ///
    /// - **DoS Protection**: Prevents attackers from causing memory exhaustion
    /// - **Memory Bound**: Limits worst-case memory usage to approximately
    ///   `max_rows × avg_row_size × columns`
    /// - **Recommended Values**:
    ///   - Small deployments: 100,000 - 1,000,000 rows
    ///   - Large deployments: 1,000,000 - 10,000,000 rows
    ///   - Batch processing: Adjust based on available RAM
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing very large datasets on a high-memory server
    /// let config = FromCsvConfig {
    ///     max_rows: 50_000_000,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_rows: usize,

    /// Whether to automatically infer column types from data (default: `false`).
    ///
    /// When `true`, the parser samples the first `sample_rows` to determine
    /// the most specific type for each column. When `false`, uses standard
    /// per-value type inference.
    ///
    /// # Type Inference Hierarchy (most to least specific)
    ///
    /// 1. **Null**: All values are empty/null
    /// 2. **Bool**: All values are "true" or "false"
    /// 3. **Int**: All values parse as integers
    /// 4. **Float**: All values parse as floats
    /// 5. **String**: Fallback for all other cases
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// let config = FromCsvConfig {
    ///     infer_schema: true,
    ///     sample_rows: 100,
    ///     ..Default::default()
    /// };
    /// ```
    pub infer_schema: bool,

    /// Number of rows to sample for schema inference (default: 100).
    ///
    /// Only used when `infer_schema` is `true`. Larger sample sizes provide
    /// more accurate type detection but slower initial processing.
    ///
    /// # Trade-offs
    ///
    /// - **Small (10-50)**: Fast inference, may miss edge cases
    /// - **Medium (100-500)**: Balanced accuracy and performance
    /// - **Large (1000+)**: High accuracy, slower for large datasets
    pub sample_rows: usize,

    /// Custom key name for the matrix list in the document (default: `None`).
    ///
    /// When `None`, the list key is automatically generated by adding 's' to
    /// the lowercased type name (e.g., "Person" → "persons"). When `Some`,
    /// uses the specified custom key instead.
    ///
    /// # Use Cases
    ///
    /// - **Irregular Plurals**: "Person" → "people" instead of "persons"
    /// - **Collective Nouns**: "Data" → "dataset" instead of "datas"
    /// - **Custom Naming**: Any non-standard naming convention
    /// - **Case-Sensitive Keys**: Preserve specific casing requirements
    ///
    /// # Examples
    ///
    /// ## Irregular Plural
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,name\n1,Alice\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("people".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Person", &["name"], config).unwrap();
    /// assert!(doc.get("people").is_some()); // Uses custom plural
    /// assert!(doc.get("persons").is_none()); // Default plural not used
    /// ```
    ///
    /// ## Collective Noun
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,value\n1,42\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("dataset".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Data", &["value"], config).unwrap();
    /// assert!(doc.get("dataset").is_some());
    /// ```
    ///
    /// ## Case-Sensitive Key
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,value\n1,test\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("MyCustomList".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Item", &["value"], config).unwrap();
    /// assert!(doc.get("MyCustomList").is_some());
    /// ```
    pub list_key: Option<String>,
}
270
271impl Default for FromCsvConfig {
272    fn default() -> Self {
273        Self {
274            delimiter: b',',
275            has_headers: true,
276            trim: true,
277            max_rows: DEFAULT_MAX_ROWS,
278            infer_schema: false,
279            sample_rows: 100,
280            list_key: None,
281        }
282    }
283}
284
285/// Parse CSV string into a HEDL document with default configuration.
286///
287/// This is the primary entry point for CSV parsing. It uses sensible defaults:
288/// - Comma delimiter
289/// - Headers expected in first row
290/// - Whitespace trimming enabled
291/// - 1 million row limit for security
292///
293/// # Arguments
294///
295/// * `csv` - The CSV string to parse
296/// * `type_name` - The HEDL type name for rows (e.g., "Person")
297/// * `schema` - Column names excluding the 'id' column (which is always first)
298///
299/// # Returns
300///
301/// A `Document` containing a single matrix list with the parsed data, or an error
302/// if parsing fails.
303///
304/// # Errors
305///
306/// Returns `HedlError` in the following cases:
307///
308/// - `Syntax`: Malformed CSV records or invalid UTF-8
309/// - `Schema`: Missing ID column or field count mismatch
310/// - `Semantic`: Empty ID field
311/// - `Security`: Row count exceeds maximum (default 1M rows)
312///
313/// # Type Inference
314///
315/// Values are automatically inferred from CSV text:
316///
317/// - Empty string or `~` → `Value::Null`
318/// - `true`/`false` → `Value::Bool`
319/// - Integer pattern → `Value::Int` (e.g., "42", "-123")
320/// - Float pattern → `Value::Float` (e.g., "3.14", "1.5e10")
321/// - Special floats: `NaN`, `Infinity`, `-Infinity`
322/// - `@id` or `@Type:id` → `Value::Reference`
323/// - `$(expr)` → `Value::Expression`
324/// - `[1,2,3]` → `Value::Tensor`
325/// - Otherwise → `Value::String`
326///
327/// # Examples
328///
329/// ## Basic Usage
330///
331/// ```
332/// use hedl_csv::from_csv;
333/// use hedl_core::Value;
334///
335/// let csv_data = "id,name,age\n1,Alice,30\n2,Bob,25";
336/// let doc = from_csv(csv_data, "Person", &["name", "age"]).unwrap();
337///
338/// // Access the parsed data
339/// let list = doc.get("persons").unwrap().as_list().unwrap();
340/// assert_eq!(list.rows.len(), 2);
341/// assert_eq!(list.rows[0].id, "1");
342/// ```
343///
344/// ## Mixed Type Inference
345///
346/// ```
347/// use hedl_csv::from_csv;
348/// use hedl_core::Value;
349///
350/// let csv_data = "id,value\n1,42\n2,3.14\n3,true\n4,hello";
351/// let doc = from_csv(csv_data, "Item", &["value"]).unwrap();
352///
353/// let list = doc.get("items").unwrap().as_list().unwrap();
354/// assert!(matches!(list.rows[0].fields[1], Value::Int(42)));
355/// assert!(matches!(list.rows[1].fields[1], Value::Float(f) if (f - 3.14).abs() < 0.001));
356/// assert!(matches!(list.rows[2].fields[1], Value::Bool(true)));
357/// assert!(matches!(list.rows[3].fields[1], Value::String(_)));
358/// ```
359///
360/// ## References
361///
362/// ```
363/// use hedl_csv::from_csv;
364///
365/// let csv_data = "id,owner\n1,@user1\n2,@User:alice";
366/// let doc = from_csv(csv_data, "Item", &["owner"]).unwrap();
367///
368/// let list = doc.get("items").unwrap().as_list().unwrap();
369/// let ref1 = list.rows[0].fields[1].as_reference().unwrap();
370/// assert_eq!(ref1.id, "user1");
371/// assert_eq!(ref1.type_name, None); // Local reference
372///
373/// let ref2 = list.rows[1].fields[1].as_reference().unwrap();
374/// assert_eq!(ref2.id, "alice");
375/// assert_eq!(ref2.type_name, Some("User".to_string())); // Qualified reference
376/// ```
377///
378/// # Performance
379///
380/// - **Streaming**: Processes CSV row-by-row to minimize memory usage
381/// - **Memory bound**: O(rows × columns) space complexity
382/// - **Time complexity**: O(rows × columns) with efficient parsing
383///
384/// For very large files, consider using `from_csv_reader` for file I/O or
385/// increasing `max_rows` via `from_csv_with_config`.
386///
387/// # See Also
388///
389/// - `from_csv_with_config` - For custom delimiters, row limits, etc.
390/// - `from_csv_reader` - For parsing from files or network streams
391pub fn from_csv(csv: &str, type_name: &str, schema: &[&str]) -> Result<Document> {
392    from_csv_with_config(csv, type_name, schema, FromCsvConfig::default())
393}
394
395/// Parse CSV string into a HEDL document with custom configuration.
396///
397/// This function provides full control over CSV parsing behavior through `FromCsvConfig`.
398///
399/// # Arguments
400///
401/// * `csv` - The CSV string to parse
402/// * `type_name` - The HEDL type name for rows
403/// * `schema` - Column names excluding the 'id' column
404/// * `config` - Configuration controlling delimiter, headers, trimming, and row limits
405///
406/// # Examples
407///
408/// ## Tab-Separated Values (TSV)
409///
410/// ```
411/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
412///
413/// let tsv_data = "id\tname\tage\n1\tAlice\t30";
414/// let config = FromCsvConfig {
415///     delimiter: b'\t',
416///     ..Default::default()
417/// };
418/// let doc = from_csv_with_config(tsv_data, "Person", &["name", "age"], config).unwrap();
419/// ```
420///
421/// ## Custom Row Limit
422///
423/// ```
424/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
425///
426/// let config = FromCsvConfig {
427///     max_rows: 10_000_000, // Allow up to 10M rows
428///     ..Default::default()
429/// };
430/// let csv_data = "id,value\n1,test";
431/// let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
432/// ```
433///
434/// ## Disable Whitespace Trimming
435///
436/// ```
437/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
438/// use hedl_core::Value;
439///
440/// let csv_data = "id,name\n1,  Alice  ";
441/// let config = FromCsvConfig {
442///     trim: false,
443///     ..Default::default()
444/// };
445/// let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
446///
447/// let list = doc.get("persons").unwrap().as_list().unwrap();
448/// assert_eq!(list.rows[0].fields[1], Value::String("  Alice  ".to_string()));
449/// ```
450///
451/// # See Also
452///
453/// - `from_csv` - Convenience function with default configuration
454/// - `from_csv_reader_with_config` - For streaming from files/network
455pub fn from_csv_with_config(
456    csv: &str,
457    type_name: &str,
458    schema: &[&str],
459    config: FromCsvConfig,
460) -> Result<Document> {
461    from_csv_reader_with_config(csv.as_bytes(), type_name, schema, config)
462}
463
464/// Parse CSV from a reader into a HEDL document with default configuration.
465///
466/// This function is useful for processing CSV files or network streams without
467/// loading the entire content into memory first.
468///
469/// # Arguments
470///
471/// * `reader` - Any type implementing `Read` (e.g., `File`, `TcpStream`, `&[u8]`)
472/// * `type_name` - The HEDL type name for rows
473/// * `schema` - Column names excluding the 'id' column
474///
475/// # Examples
476///
477/// ## Reading from a File
478///
479/// ```no_run
480/// use hedl_csv::from_csv_reader;
481/// use std::fs::File;
482///
483/// let file = File::open("data.csv").unwrap();
484/// let doc = from_csv_reader(file, "Person", &["name", "age"]).unwrap();
485/// ```
486///
487/// ## Reading from a Byte Slice
488///
489/// ```
490/// use hedl_csv::from_csv_reader;
491///
492/// let csv_bytes = b"id,name\n1,Alice";
493/// let doc = from_csv_reader(&csv_bytes[..], "Person", &["name"]).unwrap();
494/// ```
495///
496/// ## Reading from Standard Input
497///
498/// ```no_run
499/// use hedl_csv::from_csv_reader;
500/// use std::io;
501///
502/// let stdin = io::stdin();
503/// let doc = from_csv_reader(stdin.lock(), "Record", &["field1", "field2"]).unwrap();
504/// ```
505///
506/// # Performance
507///
508/// This function uses streaming I/O to minimize memory usage. The CSV data is
509/// processed row-by-row without buffering the entire file.
510///
511/// # See Also
512///
513/// - `from_csv_reader_with_config` - For custom delimiters and limits
514/// - `from_csv` - For parsing CSV strings
515pub fn from_csv_reader<R: Read>(
516    reader: R,
517    type_name: &str,
518    schema: &[&str],
519) -> Result<Document> {
520    from_csv_reader_with_config(reader, type_name, schema, FromCsvConfig::default())
521}
522
/// Inferred column type from sampling CSV data.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ColumnType {
    /// All sampled values are null/empty
    Null,
    /// All sampled values are "true" or "false"
    Bool,
    /// All sampled values parse as integers
    Int,
    /// All sampled values parse as floats (but not all as integers)
    Float,
    /// Default fallback for mixed or string data
    String,
}

/// Infer the type of a single column from sampled values.
///
/// Null markers (empty, `~`, `"null"`) are skipped and never narrow the
/// result. For the remaining values the most specific type that fits every
/// value wins, in this order: `Bool`, `Int`, `Float`, `String`. A column of
/// only null markers infers as `Null`.
///
/// # Arguments
///
/// * `values` - Iterator over string values from a column
///
/// # Examples
///
/// ```text
/// let values = vec!["1", "2", "3"];
/// let col_type = infer_column_type(values.iter().map(|s| s.as_str()));
/// assert_eq!(col_type, ColumnType::Int);
/// ```
fn infer_column_type<'a, I>(values: I) -> ColumnType
where
    I: Iterator<Item = &'a str>,
{
    let mut saw_value = false;
    let mut could_be_bool = true;
    let mut could_be_int = true;
    let mut could_be_float = true;

    for raw in values {
        let v = raw.trim();

        // Null markers carry no type information; skip them entirely.
        // NOTE(review): "null" is treated as a null marker here but not by
        // the per-value parsers, which only recognize "" and "~".
        if v.is_empty() || v == "~" || v == "null" {
            continue;
        }

        saw_value = true;
        could_be_bool &= matches!(v, "true" | "false");
        could_be_int &= v.parse::<i64>().is_ok();
        could_be_float &= v.parse::<f64>().is_ok();

        // Once every candidate type is ruled out, the answer is String.
        if !(could_be_bool || could_be_int || could_be_float) {
            return ColumnType::String;
        }
    }

    // Resolve from most specific to least specific.
    match (saw_value, could_be_bool, could_be_int, could_be_float) {
        (false, ..) => ColumnType::Null,
        (_, true, _, _) => ColumnType::Bool,
        (_, _, true, _) => ColumnType::Int,
        (_, _, _, true) => ColumnType::Float,
        _ => ColumnType::String,
    }
}
614
615/// Infer types for all columns by sampling CSV records.
616///
617/// # Arguments
618///
619/// * `records` - Slice of CSV records (each record is a Vec<String>)
620/// * `sample_size` - Maximum number of records to sample
621///
622/// # Returns
623///
624/// A vector of `ColumnType` for each column in the CSV.
625///
626/// # Examples
627///
628/// ```text
629/// let records = vec![
630///     vec!["1".to_string(), "Alice".to_string(), "30".to_string()],
631///     vec!["2".to_string(), "Bob".to_string(), "25".to_string()],
632/// ];
633/// let types = infer_column_types(&records, 100);
634/// assert_eq!(types, vec![ColumnType::Int, ColumnType::String, ColumnType::Int]);
635/// ```
636fn infer_column_types(records: &[Vec<String>], sample_size: usize) -> Vec<ColumnType> {
637    if records.is_empty() {
638        return Vec::new();
639    }
640
641    let num_columns = records[0].len();
642    let sample_count = sample_size.min(records.len());
643
644    (0..num_columns)
645        .map(|col_idx| {
646            let column_values = records
647                .iter()
648                .take(sample_count)
649                .filter_map(|row| row.get(col_idx).map(|s| s.as_str()));
650
651            infer_column_type(column_values)
652        })
653        .collect()
654}
655
656/// Parse a CSV value using a specific inferred type.
657///
658/// This function forces type conversion based on the inferred schema,
659/// falling back to string on conversion failure.
660///
661/// # Arguments
662///
663/// * `field` - The string value to parse
664/// * `col_type` - The inferred column type
665///
666/// # Returns
667///
668/// A HEDL `Value` of the specified type, or `Value::String` if conversion fails.
669fn parse_csv_value_with_type(field: &str, col_type: ColumnType) -> Result<Value> {
670    let trimmed = field.trim();
671
672    // Always handle null values regardless of inferred type
673    if trimmed.is_empty() || trimmed == "~" {
674        return Ok(Value::Null);
675    }
676
677    match col_type {
678        ColumnType::Null => Ok(Value::Null),
679        ColumnType::Bool => {
680            if trimmed == "true" {
681                Ok(Value::Bool(true))
682            } else if trimmed == "false" {
683                Ok(Value::Bool(false))
684            } else {
685                // Fallback to string if not a valid bool
686                Ok(Value::String(field.to_string()))
687            }
688        }
689        ColumnType::Int => {
690            if let Ok(n) = trimmed.parse::<i64>() {
691                Ok(Value::Int(n))
692            } else {
693                // Fallback to string if not a valid int
694                Ok(Value::String(field.to_string()))
695            }
696        }
697        ColumnType::Float => {
698            if let Ok(f) = trimmed.parse::<f64>() {
699                Ok(Value::Float(f))
700            } else {
701                // Fallback to string if not a valid float
702                Ok(Value::String(field.to_string()))
703            }
704        }
705        ColumnType::String => {
706            // Use the original parse_csv_value for full type detection
707            // (handles references, expressions, tensors, etc.)
708            parse_csv_value(field)
709        }
710    }
711}
712
713/// Parse CSV from a reader into a HEDL document with custom configuration.
714///
715/// This is the most flexible CSV parsing function, supporting both custom I/O sources
716/// and custom parsing configuration.
717///
718/// # Arguments
719///
720/// * `reader` - Any type implementing `Read`
721/// * `type_name` - The HEDL type name for rows
722/// * `schema` - Column names excluding the 'id' column
723/// * `config` - Configuration controlling all parsing behavior
724///
725/// # Examples
726///
727/// ## Large File with Custom Limit
728///
729/// ```no_run
730/// use hedl_csv::{from_csv_reader_with_config, FromCsvConfig};
731/// use std::fs::File;
732///
733/// let file = File::open("large_dataset.csv").unwrap();
734/// let config = FromCsvConfig {
735///     max_rows: 50_000_000, // 50M rows for high-memory server
736///     ..Default::default()
737/// };
738/// let doc = from_csv_reader_with_config(file, "Record", &["value"], config).unwrap();
739/// ```
740///
741/// ## TSV from Network Stream
742///
743/// ```no_run
744/// use hedl_csv::{from_csv_reader_with_config, FromCsvConfig};
745/// use std::net::TcpStream;
746///
747/// let stream = TcpStream::connect("example.com:8080").unwrap();
748/// let config = FromCsvConfig {
749///     delimiter: b'\t',
750///     ..Default::default()
751/// };
752/// let doc = from_csv_reader_with_config(stream, "Data", &["col1", "col2"], config).unwrap();
753/// ```
754///
755/// # Implementation Details
756///
757/// The function performs the following steps:
758///
759/// 1. Creates a CSV reader with the specified configuration
760/// 2. Initializes a new HEDL document with version (1, 0)
761/// 3. Constructs the full schema (ID column + provided columns)
762/// 4. Registers the struct type in the document
763/// 5. Iterates through CSV records:
764///    - Checks row count against `max_rows` security limit
765///    - Parses each field using type inference
766///    - Validates field count matches schema
767///    - Creates `Node` instances and adds to matrix list
768/// 6. Inserts the completed matrix list into the document
769///
770/// # See Also
771///
772/// - `from_csv_with_config` - For parsing CSV strings
773/// - `FromCsvConfig` - Configuration options documentation
774pub fn from_csv_reader_with_config<R: Read>(
775    reader: R,
776    type_name: &str,
777    schema: &[&str],
778    config: FromCsvConfig,
779) -> Result<Document> {
780    let mut csv_reader = csv::ReaderBuilder::new()
781        .delimiter(config.delimiter)
782        .has_headers(config.has_headers)
783        .trim(if config.trim {
784            csv::Trim::All
785        } else {
786            csv::Trim::None
787        })
788        .from_reader(reader);
789
790    let mut doc = Document::new((1, 0));
791
792    // Create schema with 'id' column
793    let mut full_schema = vec!["id".to_string()];
794    full_schema.extend(schema.iter().map(|s| s.to_string()));
795
796    // Register the struct type
797    doc.structs
798        .insert(type_name.to_string(), full_schema.clone());
799
800    // Create matrix list
801    let mut matrix_list = MatrixList::new(type_name, full_schema.clone());
802
803    // If schema inference is enabled, collect records first
804    let _inferred_types = if config.infer_schema {
805        // Collect records for sampling
806        let mut all_records = Vec::new();
807        for (record_idx, result) in csv_reader.records().enumerate() {
808            // Security: Limit row count to prevent memory exhaustion
809            if record_idx >= config.max_rows {
810                return Err(CsvError::SecurityLimit {
811                limit: config.max_rows,
812                actual: record_idx + 1,
813            });
814            }
815
816            let record = result.map_err(|e| {
817                CsvError::ParseError {
818                line: record_idx + 1,
819                message: e.to_string(),
820            }
821            })?;
822
823            if record.is_empty() {
824                continue;
825            }
826
827            // Convert StringRecord to Vec<String>
828            let row: Vec<String> = record.iter().map(|s| s.to_string()).collect();
829            all_records.push(row);
830        }
831
832        // Infer column types from sampled records
833        let types = infer_column_types(&all_records, config.sample_rows);
834
835        // Process all records with inferred types
836        for (record_idx, row) in all_records.iter().enumerate() {
837            // First column is the ID
838            let id = row.first().ok_or_else(|| {
839                CsvError::MissingColumn("id".to_string())
840            })?;
841
842            if id.is_empty() {
843                return Err(CsvError::EmptyId {
844                row: record_idx + 1,
845            });
846            }
847
848            // Parse ALL fields (including ID) with inferred types
849            let mut fields = Vec::new();
850            for (field_idx, field) in row.iter().enumerate() {
851                let col_type = types.get(field_idx).copied().unwrap_or(ColumnType::String);
852                let value = parse_csv_value_with_type(field, col_type).map_err(|e| {
853                    e.with_context(format!(
854                        "in column '{}' at line {}",
855                        full_schema.get(field_idx).unwrap_or(&"unknown".to_string()),
856                        record_idx + 1
857                    ))
858                })?;
859                fields.push(value);
860            }
861
862            // Check field count matches full schema (including ID)
863            if fields.len() != full_schema.len() {
864                return Err(CsvError::WidthMismatch {
865                expected: full_schema.len(),
866                actual: fields.len(),
867                row: record_idx + 1,
868            });
869            }
870
871            let node = Node::new(type_name, id, fields);
872            matrix_list.add_row(node);
873        }
874
875        types
876    } else {
877        // Standard parsing without schema inference
878        for (record_idx, result) in csv_reader.records().enumerate() {
879            // Security: Limit row count to prevent memory exhaustion
880            if record_idx >= config.max_rows {
881                return Err(CsvError::SecurityLimit {
882                limit: config.max_rows,
883                actual: record_idx + 1,
884            });
885            }
886
887            let record = result.map_err(|e| {
888                CsvError::ParseError {
889                line: record_idx + 1,
890                message: e.to_string(),
891            }
892            })?;
893
894            if record.is_empty() {
895                continue;
896            }
897
898            // First column is the ID
899            let id = record.get(0).ok_or_else(|| {
900                CsvError::MissingColumn("id".to_string())
901            })?;
902
903            if id.is_empty() {
904                return Err(CsvError::EmptyId {
905                row: record_idx + 1,
906            });
907            }
908
909            // Parse ALL fields (including ID) per SPEC
910            let mut fields = Vec::new();
911            for (field_idx, field) in record.iter().enumerate() {
912                let value = parse_csv_value(field).map_err(|e| {
913                    e.with_context(format!(
914                        "in column '{}' at line {}",
915                        full_schema.get(field_idx).unwrap_or(&"unknown".to_string()),
916                        record_idx + 1
917                    ))
918                })?;
919                fields.push(value);
920            }
921
922            // Check field count matches full schema (including ID)
923            if fields.len() != full_schema.len() {
924                return Err(CsvError::WidthMismatch {
925                expected: full_schema.len(),
926                actual: fields.len(),
927                row: record_idx + 1,
928            });
929            }
930
931            let node = Node::new(type_name, id, fields);
932            matrix_list.add_row(node);
933        }
934
935        Vec::new()
936    };
937
938    // Add matrix list to document with custom or default key
939    let list_key = config
940        .list_key
941        .unwrap_or_else(|| format!("{}s", type_name.to_lowercase()));
942
943    doc.root.insert(list_key, Item::List(matrix_list));
944
945    Ok(doc)
946}
947
948/// Parse a CSV field value into a HEDL Value.
949///
950/// Type inference rules:
951/// - Empty string → Null
952/// - "true" or "false" → Bool
953/// - Integer pattern → Int
954/// - Float pattern → Float
955/// - Reference pattern (@...) → Reference
956/// - Expression pattern $(...) → Expression
957/// - Otherwise → String
958fn parse_csv_value(field: &str) -> Result<Value> {
959    let trimmed = field.trim();
960
961    // Empty or null
962    if trimmed.is_empty() || trimmed == "~" {
963        return Ok(Value::Null);
964    }
965
966    // Boolean
967    if trimmed == "true" {
968        return Ok(Value::Bool(true));
969    }
970    if trimmed == "false" {
971        return Ok(Value::Bool(false));
972    }
973
974    // Special float values
975    match trimmed {
976        "NaN" => return Ok(Value::Float(f64::NAN)),
977        "Infinity" => return Ok(Value::Float(f64::INFINITY)),
978        "-Infinity" => return Ok(Value::Float(f64::NEG_INFINITY)),
979        _ => {}
980    }
981
982    // Reference
983    if trimmed.starts_with('@') {
984        return parse_reference(trimmed);
985    }
986
987    // Expression
988    if trimmed.starts_with("$(") && trimmed.ends_with(')') {
989        let expr = parse_expression_token(trimmed).map_err(|e| {
990            CsvError::ParseError {
991                line: 0,
992                message: format!("Invalid expression: {}", e),
993            }
994        })?;
995        return Ok(Value::Expression(expr));
996    }
997
998    // Try integer
999    if let Ok(n) = trimmed.parse::<i64>() {
1000        return Ok(Value::Int(n));
1001    }
1002
1003    // Try float
1004    if let Ok(f) = trimmed.parse::<f64>() {
1005        return Ok(Value::Float(f));
1006    }
1007
1008    // Tensor literal (starts with '[' and ends with ']')
1009    if trimmed.starts_with('[') && trimmed.ends_with(']') {
1010        if let Ok(tensor) = parse_tensor(trimmed) {
1011            return Ok(Value::Tensor(tensor));
1012        }
1013        // If parsing fails, fall through to string
1014    }
1015
1016    // Default to string
1017    Ok(Value::String(field.to_string()))
1018}
1019
1020/// Parse a reference string (e.g., "@user1" or "@User:user1").
1021fn parse_reference(s: &str) -> Result<Value> {
1022    let without_at = &s[1..];
1023
1024    if let Some(colon_pos) = without_at.find(':') {
1025        // Qualified reference: @Type:id
1026        let type_name = &without_at[..colon_pos];
1027        let id = &without_at[colon_pos + 1..];
1028
1029        if type_name.is_empty() || id.is_empty() {
1030            return Err(CsvError::ParseError {
1031            line: 0,
1032            message: format!("Invalid reference format: {}", s),
1033        });
1034        }
1035
1036        Ok(Value::Reference(hedl_core::Reference::qualified(
1037            type_name, id,
1038        )))
1039    } else {
1040        // Local reference: @id
1041        if without_at.is_empty() {
1042            return Err(CsvError::ParseError {
1043            line: 0,
1044            message: "Empty reference ID".to_string(),
1045        });
1046        }
1047
1048        Ok(Value::Reference(hedl_core::Reference::local(without_at)))
1049    }
1050}
1051
1052#[cfg(test)]
1053mod tests {
1054    use super::*;
1055    use hedl_core::lex::Tensor;
1056    use hedl_test::expr_value;
1057
1058    // ==================== FromCsvConfig tests ====================
1059
    // Verifies the documented defaults: comma delimiter, headers on,
    // trimming on, and the DEFAULT_MAX_ROWS security cap.
    #[test]
    fn test_from_csv_config_default() {
        let config = FromCsvConfig::default();
        assert_eq!(config.delimiter, b',');
        assert!(config.has_headers);
        assert!(config.trim);
        assert_eq!(config.max_rows, DEFAULT_MAX_ROWS);
    }

    // Debug output should name the struct and its key fields.
    #[test]
    fn test_from_csv_config_debug() {
        let config = FromCsvConfig::default();
        let debug = format!("{:?}", config);
        assert!(debug.contains("FromCsvConfig"));
        assert!(debug.contains("delimiter"));
        assert!(debug.contains("has_headers"));
        assert!(debug.contains("trim"));
    }

    // Clone must reproduce every field, including non-default values.
    #[test]
    fn test_from_csv_config_clone() {
        let config = FromCsvConfig {
            delimiter: b'\t',
            has_headers: false,
            trim: false,
            max_rows: 500_000,
            infer_schema: false,
            sample_rows: 100,
            list_key: None,
        };
        let cloned = config.clone();
        assert_eq!(cloned.delimiter, b'\t');
        assert!(!cloned.has_headers);
        assert!(!cloned.trim);
        assert_eq!(cloned.max_rows, 500_000);
        assert!(!cloned.infer_schema);
        assert_eq!(cloned.sample_rows, 100);
        assert_eq!(cloned.list_key, None);
    }

    // Exercises struct construction with every option set explicitly.
    #[test]
    fn test_from_csv_config_all_options() {
        let config = FromCsvConfig {
            delimiter: b';',
            has_headers: true,
            trim: true,
            max_rows: 2_000_000,
            infer_schema: true,
            sample_rows: 200,
            list_key: Some("custom".to_string()),
        };
        assert_eq!(config.delimiter, b';');
        assert!(config.has_headers);
        assert!(config.trim);
        assert_eq!(config.max_rows, 2_000_000);
        assert!(config.infer_schema);
        assert_eq!(config.sample_rows, 200);
        assert_eq!(config.list_key, Some("custom".to_string()));
    }

    #[test]
    fn test_max_rows_limit_enforcement() {
        // Create CSV with exactly max_rows + 1 rows
        // (the inclusive range 0..=max_rows yields max_rows + 1 lines).
        let mut csv_data = String::from("id,value\n");
        let max_rows = 100;
        for i in 0..=max_rows {
            csv_data.push_str(&format!("{},test{}\n", i, i));
        }

        let config = FromCsvConfig {
            max_rows,
            infer_schema: false,
            sample_rows: 100,
            ..Default::default()
        };

        // The row over the cap must surface as a SecurityLimit error whose
        // message mentions both "Security limit" and the numeric limit.
        let result = from_csv_with_config(&csv_data, "Item", &["value"], config);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, CsvError::SecurityLimit { .. }));
        assert!(err.to_string().contains("Security limit"));
        assert!(err.to_string().contains(&max_rows.to_string()));
    }
1143
1144    #[test]
1145    fn test_max_rows_limit_not_exceeded() {
1146        // Create CSV with exactly max_rows rows
1147        let mut csv_data = String::from("id,value\n");
1148        let max_rows = 100;
1149        for i in 0..(max_rows - 1) {
1150            csv_data.push_str(&format!("{},test{}\n", i, i));
1151        }
1152
1153        let config = FromCsvConfig {
1154            max_rows,
1155            infer_schema: false,
1156            sample_rows: 100,
1157            ..Default::default()
1158        };
1159
1160        let result = from_csv_with_config(&csv_data, "Item", &["value"], config);
1161        assert!(result.is_ok());
1162        let doc = result.unwrap();
1163        let list = doc.get("items").unwrap().as_list().unwrap();
1164        assert_eq!(list.rows.len(), max_rows - 1);
1165    }
1166
1167    // ==================== from_csv basic tests ====================
1168
1169    #[test]
1170    fn test_from_csv_basic() {
1171        let csv_data = "id,name,age,active\n1,Alice,30,true\n2,Bob,25,false\n";
1172        let doc = from_csv(csv_data, "Person", &["name", "age", "active"]).unwrap();
1173
1174        // Check document structure
1175        assert_eq!(doc.version, (1, 0));
1176
1177        // Check schema registration
1178        let schema = doc.get_schema("Person").unwrap();
1179        assert_eq!(schema, &["id", "name", "age", "active"]);
1180
1181        // Check matrix list
1182        let item = doc.get("persons").unwrap();
1183        let list = item.as_list().unwrap();
1184        assert_eq!(list.type_name, "Person");
1185        assert_eq!(list.rows.len(), 2);
1186
1187        // Check first row
1188        let row1 = &list.rows[0];
1189        assert_eq!(row1.id, "1");
1190        assert_eq!(row1.fields.len(), schema.len()); // schema includes ID
1191        assert_eq!(row1.fields[0], Value::Int(1)); // ID field
1192        assert_eq!(row1.fields[1], Value::String("Alice".to_string()));
1193        assert_eq!(row1.fields[2], Value::Int(30));
1194        assert_eq!(row1.fields[3], Value::Bool(true));
1195
1196        // Check second row
1197        let row2 = &list.rows[1];
1198        assert_eq!(row2.id, "2");
1199        assert_eq!(row2.fields.len(), schema.len()); // schema includes ID
1200        assert_eq!(row2.fields[0], Value::Int(2)); // ID field
1201        assert_eq!(row2.fields[1], Value::String("Bob".to_string()));
1202        assert_eq!(row2.fields[2], Value::Int(25));
1203        assert_eq!(row2.fields[3], Value::Bool(false));
1204    }
1205
1206    #[test]
1207    fn test_from_csv_without_headers() {
1208        let csv_data = "1,Alice,30\n2,Bob,25\n";
1209        let config = FromCsvConfig {
1210            has_headers: false,
1211            ..Default::default()
1212        };
1213        let doc = from_csv_with_config(csv_data, "Person", &["name", "age"], config).unwrap();
1214
1215        let item = doc.get("persons").unwrap();
1216        let list = item.as_list().unwrap();
1217        assert_eq!(list.rows.len(), 2);
1218    }
1219
1220    #[test]
1221    fn test_from_csv_custom_delimiter() {
1222        let csv_data = "id\tname\tage\n1\tAlice\t30\n2\tBob\t25\n";
1223        let config = FromCsvConfig {
1224            delimiter: b'\t',
1225            ..Default::default()
1226        };
1227        let doc = from_csv_with_config(csv_data, "Person", &["name", "age"], config).unwrap();
1228
1229        let item = doc.get("persons").unwrap();
1230        let list = item.as_list().unwrap();
1231        assert_eq!(list.rows.len(), 2);
1232    }
1233
1234    #[test]
1235    fn test_from_csv_semicolon_delimiter() {
1236        let csv_data = "id;name;age\n1;Alice;30\n";
1237        let config = FromCsvConfig {
1238            delimiter: b';',
1239            ..Default::default()
1240        };
1241        let doc = from_csv_with_config(csv_data, "Person", &["name", "age"], config).unwrap();
1242
1243        let item = doc.get("persons").unwrap();
1244        let list = item.as_list().unwrap();
1245        assert_eq!(list.rows.len(), 1);
1246        assert_eq!(list.rows[0].fields[1], Value::String("Alice".to_string()));
1247    }
1248
1249    #[test]
1250    fn test_from_csv_empty_file() {
1251        let csv_data = "id,name\n";
1252        let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
1253
1254        let item = doc.get("persons").unwrap();
1255        let list = item.as_list().unwrap();
1256        assert!(list.rows.is_empty());
1257    }
1258
1259    #[test]
1260    fn test_from_csv_single_row() {
1261        let csv_data = "id,name\n1,Alice\n";
1262        let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
1263
1264        let item = doc.get("persons").unwrap();
1265        let list = item.as_list().unwrap();
1266        assert_eq!(list.rows.len(), 1);
1267    }
1268
1269    // ==================== parse_csv_value tests ====================
1270
1271    #[test]
1272    fn test_parse_csv_value_null_empty() {
1273        assert_eq!(parse_csv_value("").unwrap(), Value::Null);
1274    }
1275
1276    #[test]
1277    fn test_parse_csv_value_null_tilde() {
1278        assert_eq!(parse_csv_value("~").unwrap(), Value::Null);
1279    }
1280
1281    #[test]
1282    fn test_parse_csv_value_null_whitespace() {
1283        assert_eq!(parse_csv_value("   ").unwrap(), Value::Null);
1284    }
1285
1286    #[test]
1287    fn test_parse_csv_value_bool_true() {
1288        assert_eq!(parse_csv_value("true").unwrap(), Value::Bool(true));
1289    }
1290
1291    #[test]
1292    fn test_parse_csv_value_bool_false() {
1293        assert_eq!(parse_csv_value("false").unwrap(), Value::Bool(false));
1294    }
1295
1296    #[test]
1297    fn test_parse_csv_value_int_positive() {
1298        assert_eq!(parse_csv_value("42").unwrap(), Value::Int(42));
1299    }
1300
1301    #[test]
1302    fn test_parse_csv_value_int_negative() {
1303        assert_eq!(parse_csv_value("-123").unwrap(), Value::Int(-123));
1304    }
1305
1306    #[test]
1307    fn test_parse_csv_value_int_zero() {
1308        assert_eq!(parse_csv_value("0").unwrap(), Value::Int(0));
1309    }
1310
1311    #[test]
1312    fn test_parse_csv_value_int_large() {
1313        assert_eq!(
1314            parse_csv_value("9223372036854775807").unwrap(),
1315            Value::Int(i64::MAX)
1316        );
1317    }
1318
1319    #[test]
1320    fn test_parse_csv_value_float_positive() {
1321        assert_eq!(parse_csv_value("3.25").unwrap(), Value::Float(3.25));
1322    }
1323
1324    #[test]
1325    fn test_parse_csv_value_float_negative() {
1326        assert_eq!(parse_csv_value("-2.5").unwrap(), Value::Float(-2.5));
1327    }
1328
1329    #[test]
1330    fn test_parse_csv_value_float_zero() {
1331        assert_eq!(parse_csv_value("0.0").unwrap(), Value::Float(0.0));
1332    }
1333
1334    #[test]
1335    fn test_parse_csv_value_float_scientific() {
1336        let val = parse_csv_value("1.5e10").unwrap();
1337        if let Value::Float(f) = val {
1338            assert!((f - 1.5e10).abs() < 1e5);
1339        } else {
1340            panic!("Expected float");
1341        }
1342    }
1343
1344    #[test]
1345    fn test_parse_csv_value_string() {
1346        assert_eq!(
1347            parse_csv_value("hello").unwrap(),
1348            Value::String("hello".to_string())
1349        );
1350    }
1351
1352    #[test]
1353    fn test_parse_csv_value_string_with_spaces() {
1354        assert_eq!(
1355            parse_csv_value("  hello world  ").unwrap(),
1356            Value::String("  hello world  ".to_string())
1357        );
1358    }
1359
1360    #[test]
1361    fn test_parse_csv_value_string_numeric_looking() {
1362        // Strings that look like numbers but have leading zeros
1363        assert_eq!(
1364            parse_csv_value("007").unwrap(),
1365            Value::Int(7) // Parsed as int
1366        );
1367    }
1368
1369    // ==================== Special float values ====================
1370
1371    #[test]
1372    fn test_parse_csv_value_nan() {
1373        let nan = parse_csv_value("NaN").unwrap();
1374        assert!(matches!(nan, Value::Float(f) if f.is_nan()));
1375    }
1376
1377    #[test]
1378    fn test_parse_csv_value_infinity() {
1379        let inf = parse_csv_value("Infinity").unwrap();
1380        assert_eq!(inf, Value::Float(f64::INFINITY));
1381    }
1382
1383    #[test]
1384    fn test_parse_csv_value_neg_infinity() {
1385        let neg_inf = parse_csv_value("-Infinity").unwrap();
1386        assert_eq!(neg_inf, Value::Float(f64::NEG_INFINITY));
1387    }
1388
1389    // ==================== Reference tests ====================
1390
1391    #[test]
1392    fn test_parse_csv_value_reference_local() {
1393        let ref_val = parse_csv_value("@user1").unwrap();
1394        if let Value::Reference(r) = ref_val {
1395            assert_eq!(r.id, "user1");
1396            assert_eq!(r.type_name, None);
1397        } else {
1398            panic!("Expected reference");
1399        }
1400    }
1401
1402    #[test]
1403    fn test_parse_csv_value_reference_qualified() {
1404        let ref_val = parse_csv_value("@User:user1").unwrap();
1405        if let Value::Reference(r) = ref_val {
1406            assert_eq!(r.id, "user1");
1407            assert_eq!(r.type_name, Some("User".to_string()));
1408        } else {
1409            panic!("Expected reference");
1410        }
1411    }
1412
1413    #[test]
1414    fn test_parse_csv_value_reference_with_dashes() {
1415        let ref_val = parse_csv_value("@my-item-123").unwrap();
1416        if let Value::Reference(r) = ref_val {
1417            assert_eq!(r.id, "my-item-123");
1418        } else {
1419            panic!("Expected reference");
1420        }
1421    }
1422
1423    #[test]
1424    fn test_parse_reference_empty_error() {
1425        let result = parse_reference("@");
1426        assert!(result.is_err());
1427        assert!(result
1428            .unwrap_err()
1429            .to_string()
1430            .contains("Empty reference ID"));
1431    }
1432
1433    #[test]
1434    fn test_parse_reference_empty_type_error() {
1435        let result = parse_reference("@:id");
1436        assert!(result.is_err());
1437        assert!(result
1438            .unwrap_err()
1439            .to_string()
1440            .contains("Invalid reference format"));
1441    }
1442
1443    #[test]
1444    fn test_parse_reference_empty_id_error() {
1445        let result = parse_reference("@Type:");
1446        assert!(result.is_err());
1447        assert!(result
1448            .unwrap_err()
1449            .to_string()
1450            .contains("Invalid reference format"));
1451    }
1452
1453    // ==================== Expression tests ====================
1454
1455    #[test]
1456    fn test_parse_csv_value_expression_identifier() {
1457        let expr = parse_csv_value("$(foo)").unwrap();
1458        assert_eq!(expr, expr_value("foo"));
1459    }
1460
1461    #[test]
1462    fn test_parse_csv_value_expression_call() {
1463        let expr = parse_csv_value("$(add(x, y))").unwrap();
1464        assert_eq!(expr, expr_value("add(x, y)"));
1465    }
1466
1467    #[test]
1468    fn test_parse_csv_value_expression_nested() {
1469        let expr = parse_csv_value("$(outer(inner(x)))").unwrap();
1470        if let Value::Expression(e) = expr {
1471            assert_eq!(e.to_string(), "outer(inner(x))");
1472        } else {
1473            panic!("Expected expression");
1474        }
1475    }
1476
1477    // ==================== Tensor tests ====================
1478
1479    #[test]
1480    fn test_parse_csv_value_tensor_1d() {
1481        let val = parse_csv_value("[1, 2, 3]").unwrap();
1482        if let Value::Tensor(Tensor::Array(arr)) = val {
1483            assert_eq!(arr.len(), 3);
1484        } else {
1485            panic!("Expected tensor array");
1486        }
1487    }
1488
1489    #[test]
1490    fn test_parse_csv_value_tensor_2d() {
1491        let val = parse_csv_value("[[1, 2], [3, 4]]").unwrap();
1492        if let Value::Tensor(Tensor::Array(outer)) = val {
1493            assert_eq!(outer.len(), 2);
1494            if let Tensor::Array(inner) = &outer[0] {
1495                assert_eq!(inner.len(), 2);
1496            } else {
1497                panic!("Expected nested array");
1498            }
1499        } else {
1500            panic!("Expected tensor array");
1501        }
1502    }
1503
1504    #[test]
1505    fn test_parse_csv_value_tensor_empty_is_string() {
1506        // Empty tensors are not valid in HEDL (must have at least one element)
1507        // So "[]" falls through to being treated as a string
1508        let val = parse_csv_value("[]").unwrap();
1509        assert_eq!(val, Value::String("[]".to_string()));
1510    }
1511
1512    // ==================== Error cases ====================
1513
1514    #[test]
1515    fn test_empty_id_error() {
1516        let csv_data = "id,name\n,Alice\n";
1517        let result = from_csv(csv_data, "Person", &["name"]);
1518        assert!(result.is_err());
1519        assert!(matches!(result.unwrap_err(), CsvError::EmptyId { .. }));
1520    }
1521
1522    #[test]
1523    fn test_mismatched_field_count() {
1524        let csv_data = "id,name,age\n1,Alice\n";
1525        let result = from_csv(csv_data, "Person", &["name", "age"]);
1526        assert!(result.is_err());
1527        // CSV parser returns Syntax error for malformed records
1528        assert!(matches!(result.unwrap_err(), CsvError::ParseError { .. }));
1529    }
1530
1531    // ==================== Whitespace handling ====================
1532
1533    #[test]
1534    fn test_whitespace_trimming_enabled() {
1535        let csv_data = "id,name,age\n1,  Alice  ,  30  \n";
1536        let doc = from_csv(csv_data, "Person", &["name", "age"]).unwrap();
1537
1538        let item = doc.get("persons").unwrap();
1539        let list = item.as_list().unwrap();
1540        let row = &list.rows[0];
1541
1542        assert_eq!(row.fields[0], Value::Int(1)); // ID field
1543        assert_eq!(row.fields[1], Value::String("Alice".to_string()));
1544        assert_eq!(row.fields[2], Value::Int(30));
1545    }
1546
1547    #[test]
1548    fn test_whitespace_trimming_disabled() {
1549        let csv_data = "id,name\n1,  Alice  \n";
1550        let config = FromCsvConfig {
1551            trim: false,
1552            ..Default::default()
1553        };
1554        let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
1555
1556        let item = doc.get("persons").unwrap();
1557        let list = item.as_list().unwrap();
1558        // With trim disabled, whitespace is preserved
1559        assert_eq!(
1560            list.rows[0].fields[1],
1561            Value::String("  Alice  ".to_string())
1562        );
1563    }
1564
1565    // ==================== from_csv_reader tests ====================
1566
1567    #[test]
1568    fn test_from_csv_reader_basic() {
1569        let csv_data = "id,name\n1,Alice\n".as_bytes();
1570        let doc = from_csv_reader(csv_data, "Person", &["name"]).unwrap();
1571
1572        let item = doc.get("persons").unwrap();
1573        let list = item.as_list().unwrap();
1574        assert_eq!(list.rows.len(), 1);
1575    }
1576
1577    #[test]
1578    fn test_from_csv_reader_with_config() {
1579        let csv_data = "1\tAlice\n".as_bytes();
1580        let config = FromCsvConfig {
1581            delimiter: b'\t',
1582            has_headers: false,
1583            trim: true,
1584            ..Default::default()
1585        };
1586        let doc = from_csv_reader_with_config(csv_data, "Person", &["name"], config).unwrap();
1587
1588        let item = doc.get("persons").unwrap();
1589        let list = item.as_list().unwrap();
1590        assert_eq!(list.rows.len(), 1);
1591    }
1592
1593    // ==================== Type naming tests ====================
1594
1595    #[test]
1596    fn test_type_naming_singularization() {
1597        let csv_data = "id,name\n1,Alice\n";
1598        let doc = from_csv(csv_data, "User", &["name"]).unwrap();
1599
1600        // Matrix list should use "users" as key (lowercase + pluralized)
1601        let item = doc.get("users").unwrap();
1602        let list = item.as_list().unwrap();
1603        assert_eq!(list.type_name, "User");
1604    }
1605
1606    // ==================== Quoted fields ====================
1607
1608    #[test]
1609    fn test_quoted_fields() {
1610        let csv_data = "id,name,bio\n1,Alice,\"Hello, World\"\n";
1611        let doc = from_csv(csv_data, "Person", &["name", "bio"]).unwrap();
1612
1613        let item = doc.get("persons").unwrap();
1614        let list = item.as_list().unwrap();
1615        assert_eq!(
1616            list.rows[0].fields[2],
1617            Value::String("Hello, World".to_string())
1618        );
1619    }
1620
1621    #[test]
1622    fn test_quoted_fields_with_newline() {
1623        let csv_data = "id,name,bio\n1,Alice,\"Line 1\nLine 2\"\n";
1624        let doc = from_csv(csv_data, "Person", &["name", "bio"]).unwrap();
1625
1626        let item = doc.get("persons").unwrap();
1627        let list = item.as_list().unwrap();
1628        assert_eq!(
1629            list.rows[0].fields[2],
1630            Value::String("Line 1\nLine 2".to_string())
1631        );
1632    }
1633
1634    #[test]
1635    fn test_quoted_fields_with_quotes() {
1636        let csv_data = "id,name\n1,\"Alice \"\"Bob\"\" Smith\"\n";
1637        let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
1638
1639        let item = doc.get("persons").unwrap();
1640        let list = item.as_list().unwrap();
1641        assert_eq!(
1642            list.rows[0].fields[1],
1643            Value::String("Alice \"Bob\" Smith".to_string())
1644        );
1645    }
1646
1647    // ==================== Edge cases ====================
1648
1649    #[test]
1650    fn test_unicode_values() {
1651        let csv_data = "id,name\n1,héllo 世界\n";
1652        let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
1653
1654        let item = doc.get("persons").unwrap();
1655        let list = item.as_list().unwrap();
1656        assert_eq!(
1657            list.rows[0].fields[1],
1658            Value::String("héllo 世界".to_string())
1659        );
1660    }
1661
1662    #[test]
1663    fn test_string_id() {
1664        let csv_data = "id,name\nabc,Alice\n";
1665        let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
1666
1667        let item = doc.get("persons").unwrap();
1668        let list = item.as_list().unwrap();
1669        assert_eq!(list.rows[0].id, "abc");
1670        assert_eq!(list.rows[0].fields[0], Value::String("abc".to_string()));
1671    }
1672
1673    #[test]
1674    fn test_many_columns() {
1675        let csv_data = "id,a,b,c,d,e\n1,2,3,4,5,6\n";
1676        let doc = from_csv(csv_data, "Item", &["a", "b", "c", "d", "e"]).unwrap();
1677
1678        let item = doc.get("items").unwrap();
1679        let list = item.as_list().unwrap();
1680        assert_eq!(list.schema.len(), 6); // id + 5 columns
1681        assert_eq!(list.rows[0].fields.len(), 6);
1682    }
1683
1684    // ==================== Custom list_key tests ====================
1685
1686    #[test]
1687    fn test_custom_list_key_basic() {
1688        let csv_data = "id,name\n1,Alice\n";
1689        let config = FromCsvConfig {
1690            list_key: Some("people".to_string()),
1691            ..Default::default()
1692        };
1693        let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
1694
1695        // Custom plural should exist
1696        assert!(doc.get("people").is_some());
1697        // Default plural should not exist
1698        assert!(doc.get("persons").is_none());
1699
1700        let list = doc.get("people").unwrap().as_list().unwrap();
1701        assert_eq!(list.type_name, "Person");
1702        assert_eq!(list.rows.len(), 1);
1703    }
1704
1705    #[test]
1706    fn test_custom_list_key_irregular_plurals() {
1707        // Test common irregular plurals
1708        let test_cases = vec![
1709            ("Person", "people"),
1710            ("Child", "children"),
1711            ("Tooth", "teeth"),
1712            ("Foot", "feet"),
1713            ("Mouse", "mice"),
1714            ("Goose", "geese"),
1715            ("Man", "men"),
1716            ("Woman", "women"),
1717            ("Ox", "oxen"),
1718            ("Datum", "data"),
1719        ];
1720
1721        for (type_name, plural) in test_cases {
1722            let csv_data = format!("id,value\n1,test\n");
1723            let config = FromCsvConfig {
1724                list_key: Some(plural.to_string()),
1725                ..Default::default()
1726            };
1727            let doc = from_csv_with_config(&csv_data, type_name, &["value"], config).unwrap();
1728
1729            assert!(
1730                doc.get(plural).is_some(),
1731                "Failed to find {} for type {}",
1732                plural,
1733                type_name
1734            );
1735        }
1736    }
1737
    // list_key is an arbitrary string: collective nouns work too.
    #[test]
    fn test_custom_list_key_collective_nouns() {
        let csv_data = "id,value\n1,42\n";

        // Test collective nouns
        let test_cases = vec![
            ("Data", "dataset"),
            ("Information", "info_collection"),
            ("Equipment", "gear"),
            ("Furniture", "furnishings"),
        ];

        for (type_name, collective) in test_cases {
            let config = FromCsvConfig {
                list_key: Some(collective.to_string()),
                ..Default::default()
            };
            let doc = from_csv_with_config(&csv_data, type_name, &["value"], config).unwrap();

            assert!(
                doc.get(collective).is_some(),
                "Failed to find {} for type {}",
                collective,
                type_name
            );
        }
    }

    // Document keys are case-sensitive; only the exact spelling matches.
    #[test]
    fn test_custom_list_key_case_sensitive() {
        let csv_data = "id,value\n1,test\n";
        let config = FromCsvConfig {
            list_key: Some("MyCustomList".to_string()),
            ..Default::default()
        };
        let doc = from_csv_with_config(&csv_data, "Item", &["value"], config).unwrap();

        // Exact case should exist
        assert!(doc.get("MyCustomList").is_some());
        // Different case should not exist
        assert!(doc.get("mycustomlist").is_none());
        assert!(doc.get("items").is_none());
    }

    #[test]
    fn test_custom_list_key_empty_string() {
        // Empty string is technically allowed as a key
        let csv_data = "id,value\n1,test\n";
        let config = FromCsvConfig {
            list_key: Some("".to_string()),
            ..Default::default()
        };
        let doc = from_csv_with_config(&csv_data, "Item", &["value"], config).unwrap();

        assert!(doc.get("").is_some());
    }

    // Punctuation in a custom key is passed through unmodified.
    #[test]
    fn test_custom_list_key_with_special_chars() {
        let csv_data = "id,value\n1,test\n";
        let config = FromCsvConfig {
            list_key: Some("my-custom_list.v2".to_string()),
            ..Default::default()
        };
        let doc = from_csv_with_config(&csv_data, "Item", &["value"], config).unwrap();

        assert!(doc.get("my-custom_list.v2").is_some());
    }

    // Non-ASCII keys are supported.
    #[test]
    fn test_custom_list_key_unicode() {
        let csv_data = "id,value\n1,test\n";
        let config = FromCsvConfig {
            list_key: Some("人々".to_string()), // Japanese for "people"
            ..Default::default()
        };
        let doc = from_csv_with_config(&csv_data, "Person", &["value"], config).unwrap();

        assert!(doc.get("人々").is_some());
    }

    // A custom key and schema inference can be combined.
    #[test]
    fn test_custom_list_key_with_schema_inference() {
        let csv_data = "id,value\n1,42\n2,43\n3,44\n";
        let config = FromCsvConfig {
            list_key: Some("people".to_string()),
            infer_schema: true,
            sample_rows: 10,
            ..Default::default()
        };
        let doc = from_csv_with_config(&csv_data, "Person", &["value"], config).unwrap();

        assert!(doc.get("people").is_some());
        let list = doc.get("people").unwrap().as_list().unwrap();
        assert_eq!(list.rows.len(), 3);
        // Schema inference should still work
        assert_eq!(list.rows[0].fields[1], Value::Int(42));
    }

    // list_key: None falls back to the default "<type>s" pluralization.
    #[test]
    fn test_custom_list_key_none_uses_default() {
        let csv_data = "id,name\n1,Alice\n";
        let config = FromCsvConfig {
            list_key: None,
            ..Default::default()
        };
        let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();

        // Should use default pluralization
        assert!(doc.get("persons").is_some());
        assert!(doc.get("people").is_none());
    }

    // The convenience from_csv entry point uses the default config.
    #[test]
    fn test_custom_list_key_default_config() {
        let csv_data = "id,name\n1,Alice\n";
        let doc = from_csv(csv_data, "User", &["name"]).unwrap();

        // Default should use simple pluralization
        assert!(doc.get("users").is_some());
    }

    // A custom key changes only the document key, never the type name.
    #[test]
    fn test_custom_list_key_preserves_type_name() {
        let csv_data = "id,name\n1,Alice\n";
        let config = FromCsvConfig {
            list_key: Some("people".to_string()),
            ..Default::default()
        };
        let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();

        let list = doc.get("people").unwrap().as_list().unwrap();
        // Type name should still be "Person", not "people"
        assert_eq!(list.type_name, "Person");
    }
1873
1874    #[test]
1875    fn test_custom_list_key_with_multiple_types() {
1876        // This test ensures each call can have its own list_key
1877        let csv1 = "id,name\n1,Alice\n";
1878        let config1 = FromCsvConfig {
1879            list_key: Some("people".to_string()),
1880            ..Default::default()
1881        };
1882        let doc1 = from_csv_with_config(csv1, "Person", &["name"], config1).unwrap();
1883
1884        let csv2 = "id,name\n1,Fluffy\n";
1885        let config2 = FromCsvConfig {
1886            list_key: Some("mice".to_string()),
1887            ..Default::default()
1888        };
1889        let doc2 = from_csv_with_config(csv2, "Mouse", &["name"], config2).unwrap();
1890
1891        assert!(doc1.get("people").is_some());
1892        assert!(doc1.get("persons").is_none());
1893
1894        assert!(doc2.get("mice").is_some());
1895        assert!(doc2.get("mouses").is_none());
1896    }
1897
1898    #[test]
1899    fn test_custom_list_key_numbers_in_name() {
1900        let csv_data = "id,value\n1,test\n";
1901        let config = FromCsvConfig {
1902            list_key: Some("items_v2".to_string()),
1903            ..Default::default()
1904        };
1905        let doc = from_csv_with_config(&csv_data, "Item", &["value"], config).unwrap();
1906
1907        assert!(doc.get("items_v2").is_some());
1908    }
1909
1910    #[test]
1911    fn test_custom_list_key_round_trip_compatibility() {
1912        // Ensure custom list keys work with to_csv_list
1913        let csv_data = "id,name\n1,Alice\n2,Bob\n";
1914        let config = FromCsvConfig {
1915            list_key: Some("people".to_string()),
1916            ..Default::default()
1917        };
1918        let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
1919
1920        // Export the list using the custom key
1921        use crate::to_csv_list;
1922        let exported_csv = to_csv_list(&doc, "people").unwrap();
1923        assert!(exported_csv.contains("Alice"));
1924        assert!(exported_csv.contains("Bob"));
1925
1926        // Should not be accessible via default key
1927        assert!(to_csv_list(&doc, "persons").is_err());
1928    }
1929
1930    #[test]
1931    fn test_from_csv_config_clone_with_list_key() {
1932        let config = FromCsvConfig {
1933            delimiter: b',',
1934            has_headers: true,
1935            trim: true,
1936            max_rows: 1000,
1937            infer_schema: false,
1938            sample_rows: 50,
1939            list_key: Some("people".to_string()),
1940        };
1941        let cloned = config.clone();
1942        assert_eq!(cloned.list_key, Some("people".to_string()));
1943    }
1944
1945    #[test]
1946    fn test_from_csv_config_debug_with_list_key() {
1947        let config = FromCsvConfig {
1948            list_key: Some("people".to_string()),
1949            ..Default::default()
1950        };
1951        let debug = format!("{:?}", config);
1952        assert!(debug.contains("list_key"));
1953        assert!(debug.contains("people"));
1954    }
1955}