// Dweve HEDL - Hierarchical Entity Data Language
//
// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file at the
// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
18//! Convert CSV files to HEDL documents.
19
20use crate::error::{CsvError, Result};
21use hedl_core::lex::parse_expression_token;
22use hedl_core::lex::parse_tensor;
23use hedl_core::{Document, Item, MatrixList, Node, Value};
24use std::io::Read;
25
/// Default maximum number of rows to prevent memory exhaustion.
///
/// This limit prevents Denial-of-Service attacks from maliciously large CSV files.
/// The default is 1 million rows, which allows processing reasonably large datasets
/// while preventing unbounded memory allocation.
///
/// # Security Considerations
///
/// - **Memory exhaustion**: Without a limit, attackers could provide CSV files with
///   billions of rows, causing the application to allocate excessive memory and crash.
/// - **Configurable**: The limit can be adjusted via `FromCsvConfig::max_rows` based on
///   deployment context and available resources.
/// - **Trade-off**: Higher limits allow larger datasets but increase `DoS` risk.
///
/// # Examples
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// // Use default 1M row limit
/// let config = FromCsvConfig::default();
/// assert_eq!(config.max_rows, 1_000_000);
///
/// // Increase limit for large dataset processing
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // 10 million rows
///     ..Default::default()
/// };
/// ```
pub const DEFAULT_MAX_ROWS: usize = 1_000_000;

/// Default maximum number of columns to prevent column bomb attacks.
///
/// This limit prevents Denial-of-Service attacks from CSV files with excessive columns.
/// The default is 10,000 columns, which is generous but prevents abuse.
///
/// # Security Considerations
///
/// - **Column bomb**: Without a limit, attackers could provide CSV files with
///   hundreds of thousands of columns, causing memory exhaustion and slow processing.
/// - **Industry standards**: Excel limits to 16,384 columns, Google Sheets to 18,278.
/// - **Trade-off**: Higher limits allow wider datasets but increase `DoS` risk.
pub const DEFAULT_MAX_COLUMNS: usize = 10_000;

/// Default maximum cell size in bytes to prevent cell bomb attacks.
///
/// This limit prevents Denial-of-Service attacks from CSV files with enormous cells.
/// The default is 1MB per cell, which is reasonable for most legitimate use cases.
///
/// # Security Considerations
///
/// - **Cell bomb**: Without a limit, attackers could provide CSV files with
///   gigabyte-sized cells, causing memory exhaustion.
/// - **Cumulative effect**: Multiple large cells multiply the impact.
/// - **Trade-off**: Higher limits allow larger text fields but increase `DoS` risk.
pub const DEFAULT_MAX_CELL_SIZE: usize = 1_048_576; // 1MB

/// Default maximum total CSV size in bytes to prevent decompression bombs.
///
/// This limit prevents Denial-of-Service attacks from compressed CSV files that
/// decompress to enormous sizes. The default is 100MB.
///
/// # Security Considerations
///
/// - **Decompression bomb**: A 1MB gzipped file could decompress to 1GB+.
/// - **Memory exhaustion**: Prevents attackers from filling server memory.
/// - **Trade-off**: Higher limits allow larger datasets but increase `DoS` risk.
pub const DEFAULT_MAX_TOTAL_SIZE: usize = 104_857_600; // 100MB

/// Default maximum header size in bytes to prevent header bombs.
///
/// This limit prevents Denial-of-Service attacks from CSV files with enormous headers.
/// The default is 1MB for the total header size.
///
/// # Security Considerations
///
/// - **Header bomb**: Prevents attackers from using huge column names.
/// - **Per-column**: Also enforced per-column via `max_cell_size`.
/// - **Trade-off**: Higher limits allow longer column names but increase `DoS` risk.
pub const DEFAULT_MAX_HEADER_SIZE: usize = 1_048_576; // 1MB
105
/// Configuration for CSV parsing.
///
/// This structure controls all aspects of CSV parsing behavior, including delimiters,
/// headers, whitespace handling, security limits, and custom list naming.
///
/// # Examples
///
/// ## Default Configuration
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig::default();
/// assert_eq!(config.delimiter, b',');
/// assert!(config.has_headers);
/// assert!(config.trim);
/// assert_eq!(config.max_rows, 1_000_000);
/// assert_eq!(config.list_key, None);
/// ```
///
/// ## Tab-Delimited without Headers
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     delimiter: b'\t',
///     has_headers: false,
///     ..Default::default()
/// };
/// ```
///
/// ## Custom Row Limit for Large Datasets
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // Allow up to 10M rows
///     ..Default::default()
/// };
/// ```
///
/// ## Disable Whitespace Trimming
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     trim: false,
///     ..Default::default()
/// };
/// ```
///
/// ## Enable Schema Inference
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     infer_schema: true,
///     sample_rows: 200, // Sample first 200 rows
///     ..Default::default()
/// };
/// ```
///
/// ## Custom List Key for Irregular Plurals
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// // For "Person" type, use "people" instead of default "persons"
/// let config = FromCsvConfig {
///     list_key: Some("people".to_string()),
///     ..Default::default()
/// };
/// ```
#[derive(Debug, Clone)]
pub struct FromCsvConfig {
    /// Field delimiter character (default: `,`).
    ///
    /// Common alternatives:
    /// - `b'\t'` - Tab-separated values (TSV)
    /// - `b';'` - Semicolon-separated (common in European locales)
    /// - `b'|'` - Pipe-separated
    pub delimiter: u8,

    /// Whether the first row contains column headers (default: `true`).
    ///
    /// When `true`, the first row is interpreted as column names and not included
    /// in the data. When `false`, all rows are treated as data.
    pub has_headers: bool,

    /// Whether to trim leading/trailing whitespace from fields (default: `true`).
    ///
    /// When `true`, fields like `" value "` become `"value"`. This is generally
    /// recommended to handle inconsistently formatted CSV files.
    pub trim: bool,

    /// Maximum number of rows to parse (default: 1,000,000).
    ///
    /// This security limit prevents memory exhaustion from maliciously large CSV files.
    /// Processing stops with an error if more rows are encountered.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from causing memory exhaustion
    /// - **Memory Bound**: Limits worst-case memory usage to approximately
    ///   `max_rows × avg_row_size × columns`
    /// - **Recommended Values**:
    ///   - Small deployments: 100,000 - 1,000,000 rows
    ///   - Large deployments: 1,000,000 - 10,000,000 rows
    ///   - Batch processing: Adjust based on available RAM
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing very large datasets on a high-memory server
    /// let config = FromCsvConfig {
    ///     max_rows: 50_000_000,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_rows: usize,

    /// Whether to automatically infer column types from data (default: `false`).
    ///
    /// When `true`, the parser samples the first `sample_rows` to determine the
    /// most specific type for each column. When `false`, uses standard per-value
    /// type inference.
    ///
    /// # Type Inference Hierarchy (most to least specific)
    ///
    /// 1. **Null**: All values are empty/null
    /// 2. **Bool**: All values are "true" or "false"
    /// 3. **Int**: All values parse as integers
    /// 4. **Float**: All values parse as floats
    /// 5. **String**: Fallback for all other cases
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// let config = FromCsvConfig {
    ///     infer_schema: true,
    ///     sample_rows: 100,
    ///     ..Default::default()
    /// };
    /// ```
    pub infer_schema: bool,

    /// Number of rows to sample for schema inference (default: 100).
    ///
    /// Only used when `infer_schema` is `true`. Larger sample sizes provide
    /// more accurate type detection but slower initial processing.
    ///
    /// # Trade-offs
    ///
    /// - **Small (10-50)**: Fast inference, may miss edge cases
    /// - **Medium (100-500)**: Balanced accuracy and performance
    /// - **Large (1000+)**: High accuracy, slower for large datasets
    pub sample_rows: usize,

    /// Custom key name for the matrix list in the document (default: `None`).
    ///
    /// When `None`, the list key is automatically generated by adding 's' to the
    /// lowercased type name (e.g., "Person" → "persons"). When `Some`, uses the
    /// specified custom key instead.
    ///
    /// # Use Cases
    ///
    /// - **Irregular Plurals**: "Person" → "people" instead of "persons"
    /// - **Collective Nouns**: "Data" → "dataset" instead of "datas"
    /// - **Custom Naming**: Any non-standard naming convention
    /// - **Case-Sensitive Keys**: Preserve specific casing requirements
    ///
    /// # Examples
    ///
    /// ## Irregular Plural
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,name\n1,Alice\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("people".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Person", &["name"], config).unwrap();
    /// assert!(doc.get("people").is_some()); // Uses custom plural
    /// assert!(doc.get("persons").is_none()); // Default plural not used
    /// ```
    ///
    /// ## Collective Noun
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,value\n1,42\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("dataset".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Data", &["value"], config).unwrap();
    /// assert!(doc.get("dataset").is_some());
    /// ```
    ///
    /// ## Case-Sensitive Key
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,value\n1,test\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("MyCustomList".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Item", &["value"], config).unwrap();
    /// assert!(doc.get("MyCustomList").is_some());
    /// ```
    pub list_key: Option<String>,

    /// Maximum number of columns allowed (default: 10,000).
    ///
    /// This security limit prevents "column bomb" attacks where malicious CSV files
    /// contain excessive columns that cause memory exhaustion and slow processing.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from creating CSVs with 50,000+ columns
    /// - **Memory Bound**: Limits worst-case memory usage for column metadata
    /// - **Industry Comparison**: Excel (16,384), Google Sheets (18,278), `PostgreSQL` (~1,600)
    /// - **Recommended Values**:
    ///   - Web uploads: 1,000 - 10,000 columns
    ///   - Internal processing: 10,000 - 50,000 columns
    ///   - Scientific data: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing wide scientific datasets
    /// let config = FromCsvConfig {
    ///     max_columns: 50_000,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_columns: usize,

    /// Maximum size of a single cell in bytes (default: 1MB).
    ///
    /// This security limit prevents "cell bomb" attacks where malicious CSV files
    /// contain enormous individual cells that cause memory exhaustion.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from using 10MB+ cells
    /// - **Memory Bound**: Each cell is read into memory as a String
    /// - **Cumulative**: Multiple large cells multiply the impact
    /// - **Recommended Values**:
    ///   - Web uploads: 64KB - 1MB
    ///   - Internal processing: 1MB - 10MB
    ///   - Text-heavy data: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing long text fields (e.g., descriptions, comments)
    /// let config = FromCsvConfig {
    ///     max_cell_size: 5_242_880, // 5MB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_cell_size: usize,

    /// Maximum total CSV size in bytes after decompression (default: 100MB).
    ///
    /// This security limit prevents "decompression bomb" attacks where compressed
    /// CSV files decompress to enormous sizes. A 1MB gzipped file could decompress
    /// to 1GB+, bypassing file size checks.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents decompression bombs
    /// - **Memory Bound**: Tracks total bytes read during parsing
    /// - **Transparent**: Works even if CSV library handles decompression
    /// - **Recommended Values**:
    ///   - Web uploads: 10MB - 100MB
    ///   - Internal processing: 100MB - 1GB
    ///   - Big data: Adjust based on available RAM
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing large datasets on high-memory servers
    /// let config = FromCsvConfig {
    ///     max_total_size: 1_073_741_824, // 1GB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_total_size: usize,

    /// Maximum size of header row in bytes (default: 1MB).
    ///
    /// This security limit prevents "header bomb" attacks where malicious CSV files
    /// have enormous column names or excessive total header size.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents huge column names (e.g., 1MB per column)
    /// - **Memory Bound**: Limits memory for header parsing
    /// - **Combined with `max_columns`**: Total size = `column_count` × `avg_name_length`
    /// - **Recommended Values**:
    ///   - Web uploads: 64KB - 1MB
    ///   - Internal processing: 1MB - 10MB
    ///   - Verbose column naming: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For datasets with very descriptive column names
    /// let config = FromCsvConfig {
    ///     max_header_size: 5_242_880, // 5MB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_header_size: usize,
}
429
430impl Default for FromCsvConfig {
431 fn default() -> Self {
432 Self {
433 delimiter: b',',
434 has_headers: true,
435 trim: true,
436 max_rows: DEFAULT_MAX_ROWS,
437 infer_schema: false,
438 sample_rows: 100,
439 list_key: None,
440 max_columns: DEFAULT_MAX_COLUMNS,
441 max_cell_size: DEFAULT_MAX_CELL_SIZE,
442 max_total_size: DEFAULT_MAX_TOTAL_SIZE,
443 max_header_size: DEFAULT_MAX_HEADER_SIZE,
444 }
445 }
446}
447
448impl FromCsvConfig {
449 /// Creates a config with NO security limits (use for trusted input only).
450 ///
451 /// # Security Warning
452 ///
453 /// This configuration disables ALL security limits. Only use this for:
454 /// - Trusted internal data sources
455 /// - Controlled batch processing environments
456 /// - Known-good CSV files
457 ///
458 /// **DO NOT** use this for:
459 /// - User uploads
460 /// - Web service inputs
461 /// - Untrusted data sources
462 ///
463 /// # Examples
464 ///
465 /// ```
466 /// # use hedl_csv::FromCsvConfig;
467 /// // For internal batch processing with trusted data
468 /// let config = FromCsvConfig::unlimited();
469 /// ```
470 #[must_use]
471 pub fn unlimited() -> Self {
472 Self {
473 max_rows: usize::MAX,
474 max_columns: usize::MAX,
475 max_cell_size: usize::MAX,
476 max_total_size: usize::MAX,
477 max_header_size: usize::MAX,
478 ..Default::default()
479 }
480 }
481
482 /// Creates a config with strict limits for untrusted input.
483 ///
484 /// # Security
485 ///
486 /// This configuration provides stricter limits suitable for:
487 /// - Web service uploads
488 /// - User-submitted CSV files
489 /// - Untrusted data sources
490 /// - Rate-limited APIs
491 ///
492 /// # Limits
493 ///
494 /// - `max_rows`: 1,000,000 (same as default)
495 /// - `max_columns`: 1,000 (stricter than default 10,000)
496 /// - `max_cell_size`: 64KB (stricter than default 1MB)
497 /// - `max_total_size`: 10MB (stricter than default 100MB)
498 /// - `max_header_size`: 64KB (stricter than default 1MB)
499 ///
500 /// # Examples
501 ///
502 /// ```
503 /// # use hedl_csv::FromCsvConfig;
504 /// // For user uploads in a web service
505 /// let config = FromCsvConfig::strict();
506 /// ```
507 #[must_use]
508 pub fn strict() -> Self {
509 Self {
510 max_rows: 1_000_000,
511 max_columns: 1_000,
512 max_cell_size: 65_536,
513 max_total_size: 10_485_760,
514 max_header_size: 65_536,
515 ..Default::default()
516 }
517 }
518}
519
/// Parse CSV string into a HEDL document with default configuration.
///
/// This is the primary entry point for CSV parsing. It uses sensible defaults:
/// - Comma delimiter
/// - Headers expected in first row
/// - Whitespace trimming enabled
/// - 1 million row limit for security
///
/// # Arguments
///
/// * `csv` - The CSV string to parse
/// * `type_name` - The HEDL type name for rows (e.g., "Person")
/// * `schema` - Column names excluding the 'id' column (which is always first)
///
/// # Returns
///
/// A `Document` containing a single matrix list with the parsed data, or an error
/// if parsing fails.
///
/// # Errors
///
/// Returns `HedlError` in the following cases:
///
/// - `Syntax`: Malformed CSV records or invalid UTF-8
/// - `Schema`: Missing ID column or field count mismatch
/// - `Semantic`: Empty ID field
/// - `Security`: Row count exceeds maximum (default 1M rows)
///
/// # Type Inference
///
/// Values are automatically inferred from CSV text:
///
/// - Empty string or `~` → `Value::Null`
/// - `true`/`false` → `Value::Bool`
/// - Integer pattern → `Value::Int` (e.g., "42", "-123")
/// - Float pattern → `Value::Float` (e.g., "3.14", "1.5e10")
/// - Special floats: `NaN`, `Infinity`, `-Infinity`
/// - `@id` or `@Type:id` → `Value::Reference`
/// - `$(expr)` → `Value::Expression`
/// - `[1,2,3]` → `Value::Tensor`
/// - Otherwise → `Value::String`
///
/// # Examples
///
/// ## Basic Usage
///
/// ```
/// use hedl_csv::from_csv;
/// use hedl_core::Value;
///
/// let csv_data = "id,name,age\n1,Alice,30\n2,Bob,25";
/// let doc = from_csv(csv_data, "Person", &["name", "age"]).unwrap();
///
/// // Access the parsed data
/// let list = doc.get("persons").unwrap().as_list().unwrap();
/// assert_eq!(list.rows.len(), 2);
/// assert_eq!(list.rows[0].id, "1");
/// ```
///
/// ## Mixed Type Inference
///
/// ```
/// use hedl_csv::from_csv;
/// use hedl_core::Value;
///
/// let csv_data = "id,value\n1,42\n2,3.14\n3,true\n4,hello";
/// let doc = from_csv(csv_data, "Item", &["value"]).unwrap();
///
/// let list = doc.get("items").unwrap().as_list().unwrap();
/// assert!(matches!(list.rows[0].fields[1], Value::Int(42)));
/// assert!(matches!(list.rows[1].fields[1], Value::Float(f) if (f - 3.14).abs() < 0.001));
/// assert!(matches!(list.rows[2].fields[1], Value::Bool(true)));
/// assert!(matches!(list.rows[3].fields[1], Value::String(_)));
/// ```
///
/// ## References
///
/// ```
/// use hedl_csv::from_csv;
///
/// let csv_data = "id,owner\n1,@user1\n2,@User:alice";
/// let doc = from_csv(csv_data, "Item", &["owner"]).unwrap();
///
/// let list = doc.get("items").unwrap().as_list().unwrap();
/// let ref1 = list.rows[0].fields[1].as_reference().unwrap();
/// assert_eq!(&*ref1.id, "user1");
/// assert_eq!(ref1.type_name, None); // Local reference
///
/// let ref2 = list.rows[1].fields[1].as_reference().unwrap();
/// assert_eq!(&*ref2.id, "alice");
/// assert_eq!(ref2.type_name.as_deref(), Some("User")); // Qualified reference
/// ```
///
/// # Performance
///
/// - **Streaming**: Processes CSV row-by-row to minimize memory usage
/// - **Memory bound**: O(rows × columns) space complexity
/// - **Time complexity**: O(rows × columns) with efficient parsing
///
/// For very large files, consider using `from_csv_reader` for file I/O or
/// increasing `max_rows` via `from_csv_with_config`.
///
/// # See Also
///
/// - `from_csv_with_config` - For custom delimiters, row limits, etc.
/// - `from_csv_reader` - For parsing from files or network streams
pub fn from_csv(csv: &str, type_name: &str, schema: &[&str]) -> Result<Document> {
    // Thin convenience wrapper: delegate with the default configuration.
    from_csv_with_config(csv, type_name, schema, FromCsvConfig::default())
}
629
/// Parse CSV string into a HEDL document with custom configuration.
///
/// This function provides full control over CSV parsing behavior through `FromCsvConfig`.
///
/// # Arguments
///
/// * `csv` - The CSV string to parse
/// * `type_name` - The HEDL type name for rows
/// * `schema` - Column names excluding the 'id' column
/// * `config` - Configuration controlling delimiter, headers, trimming, and row limits
///
/// # Examples
///
/// ## Tab-Separated Values (TSV)
///
/// ```
/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
///
/// let tsv_data = "id\tname\tage\n1\tAlice\t30";
/// let config = FromCsvConfig {
///     delimiter: b'\t',
///     ..Default::default()
/// };
/// let doc = from_csv_with_config(tsv_data, "Person", &["name", "age"], config).unwrap();
/// ```
///
/// ## Custom Row Limit
///
/// ```
/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
///
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // Allow up to 10M rows
///     ..Default::default()
/// };
/// let csv_data = "id,value\n1,test";
/// let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
/// ```
///
/// ## Disable Whitespace Trimming
///
/// ```
/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
/// use hedl_core::Value;
///
/// let csv_data = "id,name\n1, Alice ";
/// let config = FromCsvConfig {
///     trim: false,
///     ..Default::default()
/// };
/// let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
///
/// let list = doc.get("persons").unwrap().as_list().unwrap();
/// assert_eq!(list.rows[0].fields[1], Value::String(" Alice ".to_string().into()));
/// ```
///
/// # See Also
///
/// - `from_csv` - Convenience function with default configuration
/// - `from_csv_reader_with_config` - For streaming from files/network
pub fn from_csv_with_config(
    csv: &str,
    type_name: &str,
    schema: &[&str],
    config: FromCsvConfig,
) -> Result<Document> {
    // All string parsing funnels through the reader-based implementation,
    // treating the string's bytes as the input stream.
    from_csv_reader_with_config(csv.as_bytes(), type_name, schema, config)
}
698
/// Parse CSV from a reader into a HEDL document with default configuration.
///
/// This function is useful for processing CSV files or network streams without
/// loading the entire content into memory first.
///
/// # Arguments
///
/// * `reader` - Any type implementing `Read` (e.g., `File`, `TcpStream`, `&[u8]`)
/// * `type_name` - The HEDL type name for rows
/// * `schema` - Column names excluding the 'id' column
///
/// # Examples
///
/// ## Reading from a File
///
/// ```no_run
/// use hedl_csv::from_csv_reader;
/// use std::fs::File;
///
/// let file = File::open("data.csv").unwrap();
/// let doc = from_csv_reader(file, "Person", &["name", "age"]).unwrap();
/// ```
///
/// ## Reading from a Byte Slice
///
/// ```
/// use hedl_csv::from_csv_reader;
///
/// let csv_bytes = b"id,name\n1,Alice";
/// let doc = from_csv_reader(&csv_bytes[..], "Person", &["name"]).unwrap();
/// ```
///
/// ## Reading from Standard Input
///
/// ```no_run
/// use hedl_csv::from_csv_reader;
/// use std::io;
///
/// let stdin = io::stdin();
/// let doc = from_csv_reader(stdin.lock(), "Record", &["field1", "field2"]).unwrap();
/// ```
///
/// # Performance
///
/// This function uses streaming I/O to minimize memory usage. The CSV data is
/// processed row-by-row without buffering the entire file.
///
/// # See Also
///
/// - `from_csv_reader_with_config` - For custom delimiters and limits
/// - `from_csv` - For parsing CSV strings
pub fn from_csv_reader<R: Read>(reader: R, type_name: &str, schema: &[&str]) -> Result<Document> {
    // Thin convenience wrapper: delegate with the default configuration.
    from_csv_reader_with_config(reader, type_name, schema, FromCsvConfig::default())
}
753
/// Inferred column type from sampling CSV data.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ColumnType {
    /// All sampled values are null/empty
    Null,
    /// All sampled values are "true" or "false"
    Bool,
    /// All sampled values parse as integers
    Int,
    /// All sampled values parse as floats (but not all as integers)
    Float,
    /// Default fallback for mixed or string data
    String,
}

/// Infer the type of a single column from sampled values.
///
/// Null-like entries (empty, `~`, or `null` after trimming) are skipped and do
/// not affect the result. Among the remaining values, the most specific type
/// that fits every one of them wins, in order: `Bool`, then `Int`, then
/// `Float`, with `String` as the fallback. A column containing only null-like
/// values (or nothing at all) is reported as `Null`.
///
/// # Arguments
///
/// * `values` - Iterator over string values from a column
///
/// # Examples
///
/// ```text
/// let values = vec!["1", "2", "3"];
/// let col_type = infer_column_type(values.iter().map(|s| s.as_str()));
/// assert_eq!(col_type, ColumnType::Int);
/// ```
fn infer_column_type<'a, I>(values: I) -> ColumnType
where
    I: Iterator<Item = &'a str>,
{
    // Each flag remains true only while every non-null value seen so far
    // fits the corresponding candidate type.
    let mut saw_value = false;
    let mut bool_ok = true;
    let mut int_ok = true;
    let mut float_ok = true;

    for raw in values {
        let v = raw.trim();

        // Null-like values never influence inference.
        if matches!(v, "" | "~" | "null") {
            continue;
        }
        saw_value = true;

        bool_ok &= matches!(v, "true" | "false");
        int_ok &= v.parse::<i64>().is_ok();
        float_ok &= v.parse::<f64>().is_ok();

        // Once every candidate is ruled out the answer cannot change.
        if !(bool_ok || int_ok || float_ok) {
            return ColumnType::String;
        }
    }

    // Report the most specific surviving candidate.
    if !saw_value {
        ColumnType::Null
    } else if bool_ok {
        ColumnType::Bool
    } else if int_ok {
        ColumnType::Int
    } else if float_ok {
        ColumnType::Float
    } else {
        ColumnType::String
    }
}
845
846/// Infer types for all columns by sampling CSV records.
847///
848/// # Arguments
849///
850/// * `records` - Slice of CSV records (each record is a Vec<String>)
851/// * `sample_size` - Maximum number of records to sample
852///
853/// # Returns
854///
855/// A vector of `ColumnType` for each column in the CSV.
856///
857/// # Examples
858///
859/// ```text
860/// let records = vec![
861/// vec!["1".to_string(), "Alice".to_string(), "30".to_string()],
862/// vec!["2".to_string(), "Bob".to_string(), "25".to_string()],
863/// ];
864/// let types = infer_column_types(&records, 100);
865/// assert_eq!(types, vec![ColumnType::Int, ColumnType::String, ColumnType::Int]);
866/// ```
867fn infer_column_types(records: &[Vec<String>], sample_size: usize) -> Vec<ColumnType> {
868 if records.is_empty() {
869 return Vec::new();
870 }
871
872 let num_columns = records[0].len();
873 let sample_count = sample_size.min(records.len());
874
875 (0..num_columns)
876 .map(|col_idx| {
877 let column_values = records
878 .iter()
879 .take(sample_count)
880 .filter_map(|row| row.get(col_idx).map(std::string::String::as_str));
881
882 infer_column_type(column_values)
883 })
884 .collect()
885}
886
887/// Parse a CSV value using a specific inferred type.
888///
889/// This function forces type conversion based on the inferred schema,
890/// falling back to string on conversion failure.
891///
892/// # Arguments
893///
894/// * `field` - The string value to parse
895/// * `col_type` - The inferred column type
896///
897/// # Returns
898///
899/// A HEDL `Value` of the specified type, or `Value::String` if conversion fails.
900fn parse_csv_value_with_type(field: &str, col_type: ColumnType) -> Result<Value> {
901 let trimmed = field.trim();
902
903 // Always handle null values regardless of inferred type
904 if trimmed.is_empty() || trimmed == "~" {
905 return Ok(Value::Null);
906 }
907
908 match col_type {
909 ColumnType::Null => Ok(Value::Null),
910 ColumnType::Bool => {
911 if trimmed == "true" {
912 Ok(Value::Bool(true))
913 } else if trimmed == "false" {
914 Ok(Value::Bool(false))
915 } else {
916 // Fallback to string if not a valid bool
917 Ok(Value::String(field.to_string().into()))
918 }
919 }
920 ColumnType::Int => {
921 if let Ok(n) = trimmed.parse::<i64>() {
922 Ok(Value::Int(n))
923 } else {
924 // Fallback to string if not a valid int
925 Ok(Value::String(field.to_string().into()))
926 }
927 }
928 ColumnType::Float => {
929 if let Ok(f) = trimmed.parse::<f64>() {
930 Ok(Value::Float(f))
931 } else {
932 // Fallback to string if not a valid float
933 Ok(Value::String(field.to_string().into()))
934 }
935 }
936 ColumnType::String => {
937 // Use the original parse_csv_value for full type detection
938 // (handles references, expressions, tensors, etc.)
939 parse_csv_value(field)
940 }
941 }
942}
943
944/// Validate CSV headers against security limits.
945///
946/// This function checks:
947/// - Column count does not exceed `max_columns`
948/// - Total header size does not exceed `max_header_size`
949/// - Individual column name size does not exceed `max_cell_size`
950///
951/// # Arguments
952///
953/// * `headers` - The CSV header record
954/// * `config` - Configuration containing security limits
955///
956/// # Returns
957///
958/// `Ok(())` if all checks pass, otherwise an error.
959fn validate_headers(headers: &csv::StringRecord, config: &FromCsvConfig) -> Result<()> {
960 // Check column count
961 let column_count = headers.len();
962 if column_count > config.max_columns {
963 return Err(CsvError::Security {
964 limit_type: "column count".to_string(),
965 limit: config.max_columns,
966 actual: column_count,
967 message: format!(
968 "CSV has {} columns, exceeds limit of {}",
969 column_count, config.max_columns
970 ),
971 });
972 }
973
974 // Check total header size
975 let header_size: usize = headers.iter().map(str::len).sum();
976 if header_size > config.max_header_size {
977 return Err(CsvError::Security {
978 limit_type: "header size".to_string(),
979 limit: config.max_header_size,
980 actual: header_size,
981 message: format!(
982 "CSV header size {} bytes, exceeds limit of {} bytes",
983 header_size, config.max_header_size
984 ),
985 });
986 }
987
988 // Check for individual column name size (prevent single huge name)
989 for (i, header) in headers.iter().enumerate() {
990 if header.len() > config.max_cell_size {
991 // Safely create preview by finding the last complete character before byte 100
992 let preview = if header.len() > 100 {
993 // Find the last character boundary before position 100
994 let mut preview_end = 100;
995 while !header.is_char_boundary(preview_end) && preview_end > 0 {
996 preview_end -= 1;
997 }
998 format!("{}...", &header[..preview_end])
999 } else {
1000 header.to_string()
1001 };
1002 return Err(CsvError::Security {
1003 limit_type: "column name size".to_string(),
1004 limit: config.max_cell_size,
1005 actual: header.len(),
1006 message: format!(
1007 "Column name '{}' at index {} is {} bytes, exceeds cell size limit of {} bytes",
1008 preview,
1009 i,
1010 header.len(),
1011 config.max_cell_size
1012 ),
1013 });
1014 }
1015 }
1016
1017 Ok(())
1018}
1019
1020/// Validate a single cell against security limits.
1021///
1022/// This function checks that the cell size does not exceed `max_cell_size`.
1023///
1024/// # Arguments
1025///
1026/// * `cell` - The cell content to validate
1027/// * `row` - Row number (1-based, for error messages)
1028/// * `column` - Column index (0-based, for error messages)
1029/// * `config` - Configuration containing security limits
1030///
1031/// # Returns
1032///
1033/// `Ok(())` if the cell is within limits, otherwise an error.
1034fn validate_cell(cell: &str, row: usize, column: usize, config: &FromCsvConfig) -> Result<()> {
1035 if cell.len() > config.max_cell_size {
1036 // Safely create preview by finding the last complete character before byte 100
1037 let preview = if cell.len() > 100 {
1038 // Find the last character boundary before position 100
1039 let mut preview_end = 100;
1040 while !cell.is_char_boundary(preview_end) && preview_end > 0 {
1041 preview_end -= 1;
1042 }
1043 format!("{}...", &cell[..preview_end])
1044 } else {
1045 cell.to_string()
1046 };
1047 return Err(CsvError::Security {
1048 limit_type: "cell size".to_string(),
1049 limit: config.max_cell_size,
1050 actual: cell.len(),
1051 message: format!(
1052 "Cell at row {}, column {} is {} bytes, exceeds limit of {} bytes. Content preview: '{}'",
1053 row,
1054 column,
1055 cell.len(),
1056 config.max_cell_size,
1057 preview
1058 ),
1059 });
1060 }
1061 Ok(())
1062}
1063
/// Tracker for CSV size during parsing.
///
/// This struct tracks the total bytes read during CSV parsing to prevent
/// decompression bomb attacks.
struct CsvSizeTracker {
    // Running total of field bytes seen so far (header bytes are added
    // directly by the caller; records via `track_record`).
    bytes_read: usize,
    // Inclusive ceiling; `track_record` errors once `bytes_read` exceeds it.
    max_total_size: usize,
}
1072
1073impl CsvSizeTracker {
1074 /// Create a new size tracker with the specified maximum.
1075 fn new(max_total_size: usize) -> Self {
1076 Self {
1077 bytes_read: 0,
1078 max_total_size,
1079 }
1080 }
1081
1082 /// Track a record and check if the total size exceeds the limit.
1083 ///
1084 /// # Arguments
1085 ///
1086 /// * `record` - The CSV record to track
1087 ///
1088 /// # Returns
1089 ///
1090 /// `Ok(())` if within limits, otherwise an error.
1091 fn track_record(&mut self, record: &csv::StringRecord) -> Result<()> {
1092 let record_size: usize = record.iter().map(str::len).sum();
1093 self.bytes_read += record_size;
1094
1095 if self.bytes_read > self.max_total_size {
1096 return Err(CsvError::Security {
1097 limit_type: "total size".to_string(),
1098 limit: self.max_total_size,
1099 actual: self.bytes_read,
1100 message: format!(
1101 "CSV total size {} bytes exceeds limit of {} bytes",
1102 self.bytes_read, self.max_total_size
1103 ),
1104 });
1105 }
1106
1107 Ok(())
1108 }
1109
1110 /// Get the current total bytes read.
1111 #[allow(dead_code)]
1112 fn bytes_read(&self) -> usize {
1113 self.bytes_read
1114 }
1115}
1116
1117/// Parse CSV from a reader into a HEDL document with custom configuration.
1118///
1119/// This is the most flexible CSV parsing function, supporting both custom I/O sources
1120/// and custom parsing configuration.
1121///
1122/// # Arguments
1123///
1124/// * `reader` - Any type implementing `Read`
1125/// * `type_name` - The HEDL type name for rows
1126/// * `schema` - Column names excluding the 'id' column
1127/// * `config` - Configuration controlling all parsing behavior
1128///
1129/// # Examples
1130///
1131/// ## Large File with Custom Limit
1132///
1133/// ```no_run
1134/// use hedl_csv::{from_csv_reader_with_config, FromCsvConfig};
1135/// use std::fs::File;
1136///
1137/// let file = File::open("large_dataset.csv").unwrap();
1138/// let config = FromCsvConfig {
1139/// max_rows: 50_000_000, // 50M rows for high-memory server
1140/// ..Default::default()
1141/// };
1142/// let doc = from_csv_reader_with_config(file, "Record", &["value"], config).unwrap();
1143/// ```
1144///
1145/// ## TSV from Network Stream
1146///
1147/// ```no_run
1148/// use hedl_csv::{from_csv_reader_with_config, FromCsvConfig};
1149/// use std::net::TcpStream;
1150///
1151/// let stream = TcpStream::connect("example.com:8080").unwrap();
1152/// let config = FromCsvConfig {
1153/// delimiter: b'\t',
1154/// ..Default::default()
1155/// };
1156/// let doc = from_csv_reader_with_config(stream, "Data", &["col1", "col2"], config).unwrap();
1157/// ```
1158///
1159/// # Implementation Details
1160///
1161/// The function performs the following steps:
1162///
1163/// 1. Creates a CSV reader with the specified configuration
1164/// 2. Initializes a new HEDL document with version (1, 0)
1165/// 3. Constructs the full schema (ID column + provided columns)
1166/// 4. Registers the struct type in the document
1167/// 5. Iterates through CSV records:
1168/// - Checks row count against `max_rows` security limit
1169/// - Parses each field using type inference
1170/// - Validates field count matches schema
1171/// - Creates `Node` instances and adds to matrix list
1172/// 6. Inserts the completed matrix list into the document
1173///
1174/// # See Also
1175///
1176/// - `from_csv_with_config` - For parsing CSV strings
1177/// - `FromCsvConfig` - Configuration options documentation
pub fn from_csv_reader_with_config<R: Read>(
    reader: R,
    type_name: &str,
    schema: &[&str],
    config: FromCsvConfig,
) -> Result<Document> {
    // Build the CSV reader from the config; trimming is all-or-nothing.
    let mut csv_reader = csv::ReaderBuilder::new()
        .delimiter(config.delimiter)
        .has_headers(config.has_headers)
        .trim(if config.trim {
            csv::Trim::All
        } else {
            csv::Trim::None
        })
        .from_reader(reader);

    let mut doc = Document::new((1, 0));

    // Create schema with 'id' column (prepended; callers pass only the rest)
    let mut full_schema = vec!["id".to_string()];
    full_schema.extend(schema.iter().map(|s| (*s).to_string()));

    // Register the struct type
    doc.structs
        .insert(type_name.to_string(), full_schema.clone());

    // Create matrix list
    let mut matrix_list = MatrixList::new(type_name, full_schema.clone());

    // VALIDATE HEADERS if has_headers is enabled
    // NOTE(review): the csv crate's `headers()` returns the first record even
    // when `has_headers` is false; in that case the first data row is
    // validated as if it were a header, and its bytes are counted both here
    // and again by `track_record` in the loop below — confirm this
    // double-count toward `max_total_size` is intended.
    let headers = csv_reader.headers().map_err(|e| CsvError::ParseError {
        line: 0,
        message: e.to_string(),
    })?;

    validate_headers(headers, &config)?;

    // Initialize size tracker
    let mut size_tracker = CsvSizeTracker::new(config.max_total_size);

    // Track header size
    let header_size: usize = headers.iter().map(str::len).sum();
    size_tracker.bytes_read += header_size;

    // If schema inference is enabled, collect records first.
    // Inference needs a sampling pass before typing, so the whole file is
    // buffered in memory; the `max_rows` check inside the loop bounds that
    // buffer.
    let _inferred_types = if config.infer_schema {
        // Collect records for sampling
        let mut all_records = Vec::new();
        for (record_idx, result) in csv_reader.records().enumerate() {
            // Security: Limit row count to prevent memory exhaustion
            if record_idx >= config.max_rows {
                return Err(CsvError::SecurityLimit {
                    limit: config.max_rows,
                    actual: record_idx + 1,
                });
            }

            let record = result.map_err(|e| CsvError::ParseError {
                line: record_idx + 1,
                message: e.to_string(),
            })?;

            // Skip fully empty records rather than erroring on them.
            if record.is_empty() {
                continue;
            }

            // VALIDATE TOTAL SIZE
            size_tracker.track_record(&record)?;

            // VALIDATE EACH CELL
            for (col_idx, cell) in record.iter().enumerate() {
                validate_cell(cell, record_idx + 1, col_idx, &config)?;
            }

            // Convert StringRecord to Vec<String>
            let row: Vec<String> = record
                .iter()
                .map(std::string::ToString::to_string)
                .collect();
            all_records.push(row);
        }

        // Infer column types from sampled records
        let types = infer_column_types(&all_records, config.sample_rows);

        // Process all records with inferred types
        for (record_idx, row) in all_records.iter().enumerate() {
            // First column is the ID
            let id = row
                .first()
                .ok_or_else(|| CsvError::MissingColumn("id".to_string()))?;

            if id.is_empty() {
                return Err(CsvError::EmptyId {
                    row: record_idx + 1,
                });
            }

            // Parse ALL fields (including ID) with inferred types
            let mut fields = Vec::new();
            for (field_idx, field) in row.iter().enumerate() {
                // Columns beyond the inferred set default to String handling.
                let col_type = types.get(field_idx).copied().unwrap_or(ColumnType::String);
                let value = parse_csv_value_with_type(field, col_type).map_err(|e| {
                    e.with_context(format!(
                        "in column '{}' at line {}",
                        full_schema.get(field_idx).unwrap_or(&"unknown".to_string()),
                        record_idx + 1
                    ))
                })?;
                fields.push(value);
            }

            // Check field count matches full schema (including ID)
            if fields.len() != full_schema.len() {
                return Err(CsvError::WidthMismatch {
                    expected: full_schema.len(),
                    actual: fields.len(),
                    row: record_idx + 1,
                });
            }

            let node = Node::new(type_name, id, fields);
            matrix_list.add_row(node);
        }

        types
    } else {
        // Standard parsing without schema inference (single streaming pass;
        // mirrors the checks above but types each cell independently).
        for (record_idx, result) in csv_reader.records().enumerate() {
            // Security: Limit row count to prevent memory exhaustion
            if record_idx >= config.max_rows {
                return Err(CsvError::SecurityLimit {
                    limit: config.max_rows,
                    actual: record_idx + 1,
                });
            }

            let record = result.map_err(|e| CsvError::ParseError {
                line: record_idx + 1,
                message: e.to_string(),
            })?;

            if record.is_empty() {
                continue;
            }

            // VALIDATE TOTAL SIZE
            size_tracker.track_record(&record)?;

            // VALIDATE EACH CELL
            for (col_idx, cell) in record.iter().enumerate() {
                validate_cell(cell, record_idx + 1, col_idx, &config)?;
            }

            // First column is the ID
            let id = record
                .get(0)
                .ok_or_else(|| CsvError::MissingColumn("id".to_string()))?;

            if id.is_empty() {
                return Err(CsvError::EmptyId {
                    row: record_idx + 1,
                });
            }

            // Parse ALL fields (including ID) per SPEC
            let mut fields = Vec::new();
            for (field_idx, field) in record.iter().enumerate() {
                let value = parse_csv_value(field).map_err(|e| {
                    e.with_context(format!(
                        "in column '{}' at line {}",
                        full_schema.get(field_idx).unwrap_or(&"unknown".to_string()),
                        record_idx + 1
                    ))
                })?;
                fields.push(value);
            }

            // Check field count matches full schema (including ID)
            if fields.len() != full_schema.len() {
                return Err(CsvError::WidthMismatch {
                    expected: full_schema.len(),
                    actual: fields.len(),
                    row: record_idx + 1,
                });
            }

            let node = Node::new(type_name, id, fields);
            matrix_list.add_row(node);
        }

        Vec::new()
    };

    // Add matrix list to document with custom or default key
    // (default key is the lowercased type name naively pluralized with "s").
    let list_key = config
        .list_key
        .unwrap_or_else(|| format!("{}s", type_name.to_lowercase()));

    doc.root.insert(list_key, Item::List(matrix_list));

    Ok(doc)
}
1381
1382/// Parse a CSV field value into a HEDL Value.
1383///
1384/// Type inference rules:
1385/// - Empty string → Null
1386/// - "true" or "false" → Bool
1387/// - Integer pattern → Int
1388/// - Float pattern → Float
1389/// - Reference pattern (@...) → Reference
1390/// - Expression pattern $(...) → Expression
1391/// - Otherwise → String
1392fn parse_csv_value(field: &str) -> Result<Value> {
1393 let trimmed = field.trim();
1394
1395 // Empty or null
1396 if trimmed.is_empty() || trimmed == "~" {
1397 return Ok(Value::Null);
1398 }
1399
1400 // Boolean
1401 if trimmed == "true" {
1402 return Ok(Value::Bool(true));
1403 }
1404 if trimmed == "false" {
1405 return Ok(Value::Bool(false));
1406 }
1407
1408 // Special float values
1409 match trimmed {
1410 "NaN" => return Ok(Value::Float(f64::NAN)),
1411 "Infinity" => return Ok(Value::Float(f64::INFINITY)),
1412 "-Infinity" => return Ok(Value::Float(f64::NEG_INFINITY)),
1413 _ => {}
1414 }
1415
1416 // Reference
1417 if trimmed.starts_with('@') {
1418 return parse_reference(trimmed);
1419 }
1420
1421 // Expression
1422 if trimmed.starts_with("$(") && trimmed.ends_with(')') {
1423 let expr = parse_expression_token(trimmed).map_err(|e| CsvError::ParseError {
1424 line: 0,
1425 message: format!("Invalid expression: {e}"),
1426 })?;
1427 return Ok(Value::Expression(Box::new(expr)));
1428 }
1429
1430 // Try integer
1431 if let Ok(n) = trimmed.parse::<i64>() {
1432 return Ok(Value::Int(n));
1433 }
1434
1435 // Try float
1436 if let Ok(f) = trimmed.parse::<f64>() {
1437 return Ok(Value::Float(f));
1438 }
1439
1440 // Tensor literal (starts with '[' and ends with ']')
1441 if trimmed.starts_with('[') && trimmed.ends_with(']') {
1442 if let Ok(tensor) = parse_tensor(trimmed) {
1443 return Ok(Value::Tensor(Box::new(tensor)));
1444 }
1445 // If parsing fails, fall through to string
1446 }
1447
1448 // Default to string
1449 Ok(Value::String(field.to_string().into()))
1450}
1451
1452/// Parse a reference string (e.g., "@user1" or "@User:user1").
1453fn parse_reference(s: &str) -> Result<Value> {
1454 let without_at = &s[1..];
1455
1456 if let Some(colon_pos) = without_at.find(':') {
1457 // Qualified reference: @Type:id
1458 let type_name = &without_at[..colon_pos];
1459 let id = &without_at[colon_pos + 1..];
1460
1461 if type_name.is_empty() || id.is_empty() {
1462 return Err(CsvError::ParseError {
1463 line: 0,
1464 message: format!("Invalid reference format: {s}"),
1465 });
1466 }
1467
1468 Ok(Value::Reference(hedl_core::Reference::qualified(
1469 type_name, id,
1470 )))
1471 } else {
1472 // Local reference: @id
1473 if without_at.is_empty() {
1474 return Err(CsvError::ParseError {
1475 line: 0,
1476 message: "Empty reference ID".to_string(),
1477 });
1478 }
1479
1480 Ok(Value::Reference(hedl_core::Reference::local(without_at)))
1481 }
1482}
1483
1484#[cfg(test)]
1485mod tests {
1486 use super::*;
1487 use hedl_core::lex::Tensor;
1488 use hedl_test::expr_value;
1489
1490 // ==================== FromCsvConfig tests ====================
1491
1492 #[test]
1493 fn test_from_csv_config_default() {
1494 let config = FromCsvConfig::default();
1495 assert_eq!(config.delimiter, b',');
1496 assert!(config.has_headers);
1497 assert!(config.trim);
1498 assert_eq!(config.max_rows, DEFAULT_MAX_ROWS);
1499 }
1500
1501 #[test]
1502 fn test_from_csv_config_debug() {
1503 let config = FromCsvConfig::default();
1504 let debug = format!("{config:?}");
1505 assert!(debug.contains("FromCsvConfig"));
1506 assert!(debug.contains("delimiter"));
1507 assert!(debug.contains("has_headers"));
1508 assert!(debug.contains("trim"));
1509 }
1510
1511 #[test]
1512 fn test_from_csv_config_clone() {
1513 let config = FromCsvConfig {
1514 delimiter: b'\t',
1515 has_headers: false,
1516 trim: false,
1517 max_rows: 500_000,
1518 infer_schema: false,
1519 sample_rows: 100,
1520 list_key: None,
1521 max_columns: 5_000,
1522 max_cell_size: 2_000_000,
1523 max_total_size: 200_000_000,
1524 max_header_size: 2_000_000,
1525 };
1526 let cloned = config.clone();
1527 assert_eq!(cloned.delimiter, b'\t');
1528 assert!(!cloned.has_headers);
1529 assert!(!cloned.trim);
1530 assert_eq!(cloned.max_rows, 500_000);
1531 assert!(!cloned.infer_schema);
1532 assert_eq!(cloned.sample_rows, 100);
1533 assert_eq!(cloned.list_key, None);
1534 assert_eq!(cloned.max_columns, 5_000);
1535 assert_eq!(cloned.max_cell_size, 2_000_000);
1536 assert_eq!(cloned.max_total_size, 200_000_000);
1537 assert_eq!(cloned.max_header_size, 2_000_000);
1538 }
1539
1540 #[test]
1541 fn test_from_csv_config_all_options() {
1542 let config = FromCsvConfig {
1543 delimiter: b';',
1544 has_headers: true,
1545 trim: true,
1546 max_rows: 2_000_000,
1547 infer_schema: true,
1548 sample_rows: 200,
1549 list_key: Some("custom".to_string()),
1550 max_columns: 15_000,
1551 max_cell_size: 3_000_000,
1552 max_total_size: 300_000_000,
1553 max_header_size: 3_000_000,
1554 };
1555 assert_eq!(config.delimiter, b';');
1556 assert!(config.has_headers);
1557 assert!(config.trim);
1558 assert_eq!(config.max_rows, 2_000_000);
1559 assert!(config.infer_schema);
1560 assert_eq!(config.sample_rows, 200);
1561 assert_eq!(config.list_key, Some("custom".to_string()));
1562 assert_eq!(config.max_columns, 15_000);
1563 assert_eq!(config.max_cell_size, 3_000_000);
1564 assert_eq!(config.max_total_size, 300_000_000);
1565 assert_eq!(config.max_header_size, 3_000_000);
1566 }
1567
1568 #[test]
1569 fn test_max_rows_limit_enforcement() {
1570 // Create CSV with exactly max_rows + 1 rows
1571 let mut csv_data = String::from("id,value\n");
1572 let max_rows = 100;
1573 for i in 0..=max_rows {
1574 csv_data.push_str(&format!("{i},test{i}\n"));
1575 }
1576
1577 let config = FromCsvConfig {
1578 max_rows,
1579 infer_schema: false,
1580 sample_rows: 100,
1581 ..Default::default()
1582 };
1583
1584 let result = from_csv_with_config(&csv_data, "Item", &["value"], config);
1585 assert!(result.is_err());
1586 let err = result.unwrap_err();
1587 assert!(matches!(err, CsvError::SecurityLimit { .. }));
1588 assert!(err.to_string().contains("Security limit"));
1589 assert!(err.to_string().contains(&max_rows.to_string()));
1590 }
1591
1592 #[test]
1593 fn test_max_rows_limit_not_exceeded() {
1594 // Create CSV with exactly max_rows rows
1595 let mut csv_data = String::from("id,value\n");
1596 let max_rows = 100;
1597 for i in 0..(max_rows - 1) {
1598 csv_data.push_str(&format!("{i},test{i}\n"));
1599 }
1600
1601 let config = FromCsvConfig {
1602 max_rows,
1603 infer_schema: false,
1604 sample_rows: 100,
1605 ..Default::default()
1606 };
1607
1608 let result = from_csv_with_config(&csv_data, "Item", &["value"], config);
1609 assert!(result.is_ok());
1610 let doc = result.unwrap();
1611 let list = doc.get("items").unwrap().as_list().unwrap();
1612 assert_eq!(list.rows.len(), max_rows - 1);
1613 }
1614
1615 // ==================== from_csv basic tests ====================
1616
1617 #[test]
1618 fn test_from_csv_basic() {
1619 let csv_data = "id,name,age,active\n1,Alice,30,true\n2,Bob,25,false\n";
1620 let doc = from_csv(csv_data, "Person", &["name", "age", "active"]).unwrap();
1621
1622 // Check document structure
1623 assert_eq!(doc.version, (1, 0));
1624
1625 // Check schema registration
1626 let schema = doc.get_schema("Person").unwrap();
1627 assert_eq!(schema, &["id", "name", "age", "active"]);
1628
1629 // Check matrix list
1630 let item = doc.get("persons").unwrap();
1631 let list = item.as_list().unwrap();
1632 assert_eq!(list.type_name, "Person");
1633 assert_eq!(list.rows.len(), 2);
1634
1635 // Check first row
1636 let row1 = &list.rows[0];
1637 assert_eq!(row1.id, "1");
1638 assert_eq!(row1.fields.len(), schema.len()); // schema includes ID
1639 assert_eq!(row1.fields[0], Value::Int(1)); // ID field
1640 assert_eq!(row1.fields[1], Value::String("Alice".into()));
1641 assert_eq!(row1.fields[2], Value::Int(30));
1642 assert_eq!(row1.fields[3], Value::Bool(true));
1643
1644 // Check second row
1645 let row2 = &list.rows[1];
1646 assert_eq!(row2.id, "2");
1647 assert_eq!(row2.fields.len(), schema.len()); // schema includes ID
1648 assert_eq!(row2.fields[0], Value::Int(2)); // ID field
1649 assert_eq!(row2.fields[1], Value::String("Bob".into()));
1650 assert_eq!(row2.fields[2], Value::Int(25));
1651 assert_eq!(row2.fields[3], Value::Bool(false));
1652 }
1653
1654 #[test]
1655 fn test_from_csv_without_headers() {
1656 let csv_data = "1,Alice,30\n2,Bob,25\n";
1657 let config = FromCsvConfig {
1658 has_headers: false,
1659 ..Default::default()
1660 };
1661 let doc = from_csv_with_config(csv_data, "Person", &["name", "age"], config).unwrap();
1662
1663 let item = doc.get("persons").unwrap();
1664 let list = item.as_list().unwrap();
1665 assert_eq!(list.rows.len(), 2);
1666 }
1667
1668 #[test]
1669 fn test_from_csv_custom_delimiter() {
1670 let csv_data = "id\tname\tage\n1\tAlice\t30\n2\tBob\t25\n";
1671 let config = FromCsvConfig {
1672 delimiter: b'\t',
1673 ..Default::default()
1674 };
1675 let doc = from_csv_with_config(csv_data, "Person", &["name", "age"], config).unwrap();
1676
1677 let item = doc.get("persons").unwrap();
1678 let list = item.as_list().unwrap();
1679 assert_eq!(list.rows.len(), 2);
1680 }
1681
1682 #[test]
1683 fn test_from_csv_semicolon_delimiter() {
1684 let csv_data = "id;name;age\n1;Alice;30\n";
1685 let config = FromCsvConfig {
1686 delimiter: b';',
1687 ..Default::default()
1688 };
1689 let doc = from_csv_with_config(csv_data, "Person", &["name", "age"], config).unwrap();
1690
1691 let item = doc.get("persons").unwrap();
1692 let list = item.as_list().unwrap();
1693 assert_eq!(list.rows.len(), 1);
1694 assert_eq!(list.rows[0].fields[1], Value::String("Alice".into()));
1695 }
1696
1697 #[test]
1698 fn test_from_csv_empty_file() {
1699 let csv_data = "id,name\n";
1700 let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
1701
1702 let item = doc.get("persons").unwrap();
1703 let list = item.as_list().unwrap();
1704 assert!(list.rows.is_empty());
1705 }
1706
1707 #[test]
1708 fn test_from_csv_single_row() {
1709 let csv_data = "id,name\n1,Alice\n";
1710 let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
1711
1712 let item = doc.get("persons").unwrap();
1713 let list = item.as_list().unwrap();
1714 assert_eq!(list.rows.len(), 1);
1715 }
1716
1717 // ==================== parse_csv_value tests ====================
1718
1719 #[test]
1720 fn test_parse_csv_value_null_empty() {
1721 assert_eq!(parse_csv_value("").unwrap(), Value::Null);
1722 }
1723
1724 #[test]
1725 fn test_parse_csv_value_null_tilde() {
1726 assert_eq!(parse_csv_value("~").unwrap(), Value::Null);
1727 }
1728
1729 #[test]
1730 fn test_parse_csv_value_null_whitespace() {
1731 assert_eq!(parse_csv_value(" ").unwrap(), Value::Null);
1732 }
1733
1734 #[test]
1735 fn test_parse_csv_value_bool_true() {
1736 assert_eq!(parse_csv_value("true").unwrap(), Value::Bool(true));
1737 }
1738
1739 #[test]
1740 fn test_parse_csv_value_bool_false() {
1741 assert_eq!(parse_csv_value("false").unwrap(), Value::Bool(false));
1742 }
1743
1744 #[test]
1745 fn test_parse_csv_value_int_positive() {
1746 assert_eq!(parse_csv_value("42").unwrap(), Value::Int(42));
1747 }
1748
1749 #[test]
1750 fn test_parse_csv_value_int_negative() {
1751 assert_eq!(parse_csv_value("-123").unwrap(), Value::Int(-123));
1752 }
1753
1754 #[test]
1755 fn test_parse_csv_value_int_zero() {
1756 assert_eq!(parse_csv_value("0").unwrap(), Value::Int(0));
1757 }
1758
1759 #[test]
1760 fn test_parse_csv_value_int_large() {
1761 assert_eq!(
1762 parse_csv_value("9223372036854775807").unwrap(),
1763 Value::Int(i64::MAX)
1764 );
1765 }
1766
1767 #[test]
1768 fn test_parse_csv_value_float_positive() {
1769 assert_eq!(parse_csv_value("3.25").unwrap(), Value::Float(3.25));
1770 }
1771
1772 #[test]
1773 fn test_parse_csv_value_float_negative() {
1774 assert_eq!(parse_csv_value("-2.5").unwrap(), Value::Float(-2.5));
1775 }
1776
1777 #[test]
1778 fn test_parse_csv_value_float_zero() {
1779 assert_eq!(parse_csv_value("0.0").unwrap(), Value::Float(0.0));
1780 }
1781
1782 #[test]
1783 fn test_parse_csv_value_float_scientific() {
1784 let val = parse_csv_value("1.5e10").unwrap();
1785 if let Value::Float(f) = val {
1786 assert!((f - 1.5e10).abs() < 1e5);
1787 } else {
1788 panic!("Expected float");
1789 }
1790 }
1791
1792 #[test]
1793 fn test_parse_csv_value_string() {
1794 assert_eq!(
1795 parse_csv_value("hello").unwrap(),
1796 Value::String("hello".into())
1797 );
1798 }
1799
1800 #[test]
1801 fn test_parse_csv_value_string_with_spaces() {
1802 assert_eq!(
1803 parse_csv_value(" hello world ").unwrap(),
1804 Value::String(" hello world ".into())
1805 );
1806 }
1807
1808 #[test]
1809 fn test_parse_csv_value_string_numeric_looking() {
1810 // Strings that look like numbers but have leading zeros
1811 assert_eq!(
1812 parse_csv_value("007").unwrap(),
1813 Value::Int(7) // Parsed as int
1814 );
1815 }
1816
1817 // ==================== Special float values ====================
1818
1819 #[test]
1820 fn test_parse_csv_value_nan() {
1821 let nan = parse_csv_value("NaN").unwrap();
1822 assert!(matches!(nan, Value::Float(f) if f.is_nan()));
1823 }
1824
1825 #[test]
1826 fn test_parse_csv_value_infinity() {
1827 let inf = parse_csv_value("Infinity").unwrap();
1828 assert_eq!(inf, Value::Float(f64::INFINITY));
1829 }
1830
1831 #[test]
1832 fn test_parse_csv_value_neg_infinity() {
1833 let neg_inf = parse_csv_value("-Infinity").unwrap();
1834 assert_eq!(neg_inf, Value::Float(f64::NEG_INFINITY));
1835 }
1836
1837 // ==================== Reference tests ====================
1838
1839 #[test]
1840 fn test_parse_csv_value_reference_local() {
1841 let ref_val = parse_csv_value("@user1").unwrap();
1842 if let Value::Reference(r) = ref_val {
1843 assert_eq!(&*r.id, "user1");
1844 assert_eq!(r.type_name, None);
1845 } else {
1846 panic!("Expected reference");
1847 }
1848 }
1849
1850 #[test]
1851 fn test_parse_csv_value_reference_qualified() {
1852 let ref_val = parse_csv_value("@User:user1").unwrap();
1853 if let Value::Reference(r) = ref_val {
1854 assert_eq!(&*r.id, "user1");
1855 assert_eq!(r.type_name.as_deref(), Some("User"));
1856 } else {
1857 panic!("Expected reference");
1858 }
1859 }
1860
1861 #[test]
1862 fn test_parse_csv_value_reference_with_dashes() {
1863 let ref_val = parse_csv_value("@my-item-123").unwrap();
1864 if let Value::Reference(r) = ref_val {
1865 assert_eq!(&*r.id, "my-item-123");
1866 } else {
1867 panic!("Expected reference");
1868 }
1869 }
1870
1871 #[test]
1872 fn test_parse_reference_empty_error() {
1873 let result = parse_reference("@");
1874 assert!(result.is_err());
1875 assert!(result
1876 .unwrap_err()
1877 .to_string()
1878 .contains("Empty reference ID"));
1879 }
1880
1881 #[test]
1882 fn test_parse_reference_empty_type_error() {
1883 let result = parse_reference("@:id");
1884 assert!(result.is_err());
1885 assert!(result
1886 .unwrap_err()
1887 .to_string()
1888 .contains("Invalid reference format"));
1889 }
1890
1891 #[test]
1892 fn test_parse_reference_empty_id_error() {
1893 let result = parse_reference("@Type:");
1894 assert!(result.is_err());
1895 assert!(result
1896 .unwrap_err()
1897 .to_string()
1898 .contains("Invalid reference format"));
1899 }
1900
1901 // ==================== Expression tests ====================
1902
1903 #[test]
1904 fn test_parse_csv_value_expression_identifier() {
1905 let expr = parse_csv_value("$(foo)").unwrap();
1906 assert_eq!(expr, expr_value("foo"));
1907 }
1908
1909 #[test]
1910 fn test_parse_csv_value_expression_call() {
1911 let expr = parse_csv_value("$(add(x, y))").unwrap();
1912 assert_eq!(expr, expr_value("add(x, y)"));
1913 }
1914
1915 #[test]
1916 fn test_parse_csv_value_expression_nested() {
1917 let expr = parse_csv_value("$(outer(inner(x)))").unwrap();
1918 if let Value::Expression(e) = expr {
1919 assert_eq!(e.to_string(), "outer(inner(x))");
1920 } else {
1921 panic!("Expected expression");
1922 }
1923 }
1924
1925 // ==================== Tensor tests ====================
1926
1927 #[test]
1928 fn test_parse_csv_value_tensor_1d() {
1929 let val = parse_csv_value("[1, 2, 3]").unwrap();
1930 if let Value::Tensor(tensor) = val {
1931 if let Tensor::Array(arr) = tensor.as_ref() {
1932 assert_eq!(arr.len(), 3);
1933 } else {
1934 panic!("Expected tensor array");
1935 }
1936 } else {
1937 panic!("Expected tensor");
1938 }
1939 }
1940
1941 #[test]
1942 fn test_parse_csv_value_tensor_2d() {
1943 let val = parse_csv_value("[[1, 2], [3, 4]]").unwrap();
1944 if let Value::Tensor(tensor) = val {
1945 if let Tensor::Array(outer) = tensor.as_ref() {
1946 assert_eq!(outer.len(), 2);
1947 if let Tensor::Array(inner) = &outer[0] {
1948 assert_eq!(inner.len(), 2);
1949 } else {
1950 panic!("Expected nested array");
1951 }
1952 } else {
1953 panic!("Expected tensor array");
1954 }
1955 } else {
1956 panic!("Expected tensor");
1957 }
1958 }
1959
1960 #[test]
1961 fn test_parse_csv_value_tensor_empty_is_string() {
1962 // Empty tensors are not valid in HEDL (must have at least one element)
1963 // So "[]" falls through to being treated as a string
1964 let val = parse_csv_value("[]").unwrap();
1965 assert_eq!(val, Value::String("[]".into()));
1966 }
1967
1968 // ==================== Error cases ====================
1969
1970 #[test]
1971 fn test_empty_id_error() {
1972 let csv_data = "id,name\n,Alice\n";
1973 let result = from_csv(csv_data, "Person", &["name"]);
1974 assert!(result.is_err());
1975 assert!(matches!(result.unwrap_err(), CsvError::EmptyId { .. }));
1976 }
1977
1978 #[test]
1979 fn test_mismatched_field_count() {
1980 let csv_data = "id,name,age\n1,Alice\n";
1981 let result = from_csv(csv_data, "Person", &["name", "age"]);
1982 assert!(result.is_err());
1983 // CSV parser returns Syntax error for malformed records
1984 assert!(matches!(result.unwrap_err(), CsvError::ParseError { .. }));
1985 }
1986
1987 // ==================== Whitespace handling ====================
1988
1989 #[test]
1990 fn test_whitespace_trimming_enabled() {
1991 let csv_data = "id,name,age\n1, Alice , 30 \n";
1992 let doc = from_csv(csv_data, "Person", &["name", "age"]).unwrap();
1993
1994 let item = doc.get("persons").unwrap();
1995 let list = item.as_list().unwrap();
1996 let row = &list.rows[0];
1997
1998 assert_eq!(row.fields[0], Value::Int(1)); // ID field
1999 assert_eq!(row.fields[1], Value::String("Alice".into()));
2000 assert_eq!(row.fields[2], Value::Int(30));
2001 }
2002
2003 #[test]
2004 fn test_whitespace_trimming_disabled() {
2005 let csv_data = "id,name\n1, Alice \n";
2006 let config = FromCsvConfig {
2007 trim: false,
2008 ..Default::default()
2009 };
2010 let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
2011
2012 let item = doc.get("persons").unwrap();
2013 let list = item.as_list().unwrap();
2014 // With trim disabled, whitespace is preserved
2015 assert_eq!(list.rows[0].fields[1], Value::String(" Alice ".into()));
2016 }
2017
2018 // ==================== from_csv_reader tests ====================
2019
2020 #[test]
2021 fn test_from_csv_reader_basic() {
2022 let csv_data = "id,name\n1,Alice\n".as_bytes();
2023 let doc = from_csv_reader(csv_data, "Person", &["name"]).unwrap();
2024
2025 let item = doc.get("persons").unwrap();
2026 let list = item.as_list().unwrap();
2027 assert_eq!(list.rows.len(), 1);
2028 }
2029
2030 #[test]
2031 fn test_from_csv_reader_with_config() {
2032 let csv_data = "1\tAlice\n".as_bytes();
2033 let config = FromCsvConfig {
2034 delimiter: b'\t',
2035 has_headers: false,
2036 trim: true,
2037 ..Default::default()
2038 };
2039 let doc = from_csv_reader_with_config(csv_data, "Person", &["name"], config).unwrap();
2040
2041 let item = doc.get("persons").unwrap();
2042 let list = item.as_list().unwrap();
2043 assert_eq!(list.rows.len(), 1);
2044 }
2045
2046 // ==================== Type naming tests ====================
2047
2048 #[test]
2049 fn test_type_naming_singularization() {
2050 let csv_data = "id,name\n1,Alice\n";
2051 let doc = from_csv(csv_data, "User", &["name"]).unwrap();
2052
2053 // Matrix list should use "users" as key (lowercase + pluralized)
2054 let item = doc.get("users").unwrap();
2055 let list = item.as_list().unwrap();
2056 assert_eq!(list.type_name, "User");
2057 }
2058
2059 // ==================== Quoted fields ====================
2060
2061 #[test]
2062 fn test_quoted_fields() {
2063 let csv_data = "id,name,bio\n1,Alice,\"Hello, World\"\n";
2064 let doc = from_csv(csv_data, "Person", &["name", "bio"]).unwrap();
2065
2066 let item = doc.get("persons").unwrap();
2067 let list = item.as_list().unwrap();
2068 assert_eq!(list.rows[0].fields[2], Value::String("Hello, World".into()));
2069 }
2070
2071 #[test]
2072 fn test_quoted_fields_with_newline() {
2073 let csv_data = "id,name,bio\n1,Alice,\"Line 1\nLine 2\"\n";
2074 let doc = from_csv(csv_data, "Person", &["name", "bio"]).unwrap();
2075
2076 let item = doc.get("persons").unwrap();
2077 let list = item.as_list().unwrap();
2078 assert_eq!(
2079 list.rows[0].fields[2],
2080 Value::String("Line 1\nLine 2".into())
2081 );
2082 }
2083
2084 #[test]
2085 fn test_quoted_fields_with_quotes() {
2086 let csv_data = "id,name\n1,\"Alice \"\"Bob\"\" Smith\"\n";
2087 let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
2088
2089 let item = doc.get("persons").unwrap();
2090 let list = item.as_list().unwrap();
2091 assert_eq!(
2092 list.rows[0].fields[1],
2093 Value::String("Alice \"Bob\" Smith".into())
2094 );
2095 }
2096
2097 // ==================== Edge cases ====================
2098
2099 #[test]
2100 fn test_unicode_values() {
2101 let csv_data = "id,name\n1,héllo 世界\n";
2102 let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
2103
2104 let item = doc.get("persons").unwrap();
2105 let list = item.as_list().unwrap();
2106 assert_eq!(list.rows[0].fields[1], Value::String("héllo 世界".into()));
2107 }
2108
2109 #[test]
2110 fn test_string_id() {
2111 let csv_data = "id,name\nabc,Alice\n";
2112 let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
2113
2114 let item = doc.get("persons").unwrap();
2115 let list = item.as_list().unwrap();
2116 assert_eq!(list.rows[0].id, "abc");
2117 assert_eq!(list.rows[0].fields[0], Value::String("abc".into()));
2118 }
2119
2120 #[test]
2121 fn test_many_columns() {
2122 let csv_data = "id,a,b,c,d,e\n1,2,3,4,5,6\n";
2123 let doc = from_csv(csv_data, "Item", &["a", "b", "c", "d", "e"]).unwrap();
2124
2125 let item = doc.get("items").unwrap();
2126 let list = item.as_list().unwrap();
2127 assert_eq!(list.schema.len(), 6); // id + 5 columns
2128 assert_eq!(list.rows[0].fields.len(), 6);
2129 }
2130
2131 // ==================== Custom list_key tests ====================
2132
2133 #[test]
2134 fn test_custom_list_key_basic() {
2135 let csv_data = "id,name\n1,Alice\n";
2136 let config = FromCsvConfig {
2137 list_key: Some("people".to_string()),
2138 ..Default::default()
2139 };
2140 let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
2141
2142 // Custom plural should exist
2143 assert!(doc.get("people").is_some());
2144 // Default plural should not exist
2145 assert!(doc.get("persons").is_none());
2146
2147 let list = doc.get("people").unwrap().as_list().unwrap();
2148 assert_eq!(list.type_name, "Person");
2149 assert_eq!(list.rows.len(), 1);
2150 }
2151
2152 #[test]
2153 fn test_custom_list_key_irregular_plurals() {
2154 // Test common irregular plurals
2155 let test_cases = vec![
2156 ("Person", "people"),
2157 ("Child", "children"),
2158 ("Tooth", "teeth"),
2159 ("Foot", "feet"),
2160 ("Mouse", "mice"),
2161 ("Goose", "geese"),
2162 ("Man", "men"),
2163 ("Woman", "women"),
2164 ("Ox", "oxen"),
2165 ("Datum", "data"),
2166 ];
2167
2168 for (type_name, plural) in test_cases {
2169 let csv_data = "id,value\n1,test\n".to_string();
2170 let config = FromCsvConfig {
2171 list_key: Some(plural.to_string()),
2172 ..Default::default()
2173 };
2174 let doc = from_csv_with_config(&csv_data, type_name, &["value"], config).unwrap();
2175
2176 assert!(
2177 doc.get(plural).is_some(),
2178 "Failed to find {plural} for type {type_name}"
2179 );
2180 }
2181 }
2182
2183 #[test]
2184 fn test_custom_list_key_collective_nouns() {
2185 let csv_data = "id,value\n1,42\n";
2186
2187 // Test collective nouns
2188 let test_cases = vec![
2189 ("Data", "dataset"),
2190 ("Information", "info_collection"),
2191 ("Equipment", "gear"),
2192 ("Furniture", "furnishings"),
2193 ];
2194
2195 for (type_name, collective) in test_cases {
2196 let config = FromCsvConfig {
2197 list_key: Some(collective.to_string()),
2198 ..Default::default()
2199 };
2200 let doc = from_csv_with_config(csv_data, type_name, &["value"], config).unwrap();
2201
2202 assert!(
2203 doc.get(collective).is_some(),
2204 "Failed to find {collective} for type {type_name}"
2205 );
2206 }
2207 }
2208
2209 #[test]
2210 fn test_custom_list_key_case_sensitive() {
2211 let csv_data = "id,value\n1,test\n";
2212 let config = FromCsvConfig {
2213 list_key: Some("MyCustomList".to_string()),
2214 ..Default::default()
2215 };
2216 let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
2217
2218 // Exact case should exist
2219 assert!(doc.get("MyCustomList").is_some());
2220 // Different case should not exist
2221 assert!(doc.get("mycustomlist").is_none());
2222 assert!(doc.get("items").is_none());
2223 }
2224
2225 #[test]
2226 fn test_custom_list_key_empty_string() {
2227 // Empty string is technically allowed as a key
2228 let csv_data = "id,value\n1,test\n";
2229 let config = FromCsvConfig {
2230 list_key: Some(String::new()),
2231 ..Default::default()
2232 };
2233 let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
2234
2235 assert!(doc.get("").is_some());
2236 }
2237
2238 #[test]
2239 fn test_custom_list_key_with_special_chars() {
2240 let csv_data = "id,value\n1,test\n";
2241 let config = FromCsvConfig {
2242 list_key: Some("my-custom_list.v2".to_string()),
2243 ..Default::default()
2244 };
2245 let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
2246
2247 assert!(doc.get("my-custom_list.v2").is_some());
2248 }
2249
2250 #[test]
2251 fn test_custom_list_key_unicode() {
2252 let csv_data = "id,value\n1,test\n";
2253 let config = FromCsvConfig {
2254 list_key: Some("人々".to_string()), // Japanese for "people"
2255 ..Default::default()
2256 };
2257 let doc = from_csv_with_config(csv_data, "Person", &["value"], config).unwrap();
2258
2259 assert!(doc.get("人々").is_some());
2260 }
2261
2262 #[test]
2263 fn test_custom_list_key_with_schema_inference() {
2264 let csv_data = "id,value\n1,42\n2,43\n3,44\n";
2265 let config = FromCsvConfig {
2266 list_key: Some("people".to_string()),
2267 infer_schema: true,
2268 sample_rows: 10,
2269 ..Default::default()
2270 };
2271 let doc = from_csv_with_config(csv_data, "Person", &["value"], config).unwrap();
2272
2273 assert!(doc.get("people").is_some());
2274 let list = doc.get("people").unwrap().as_list().unwrap();
2275 assert_eq!(list.rows.len(), 3);
2276 // Schema inference should still work
2277 assert_eq!(list.rows[0].fields[1], Value::Int(42));
2278 }
2279
2280 #[test]
2281 fn test_custom_list_key_none_uses_default() {
2282 let csv_data = "id,name\n1,Alice\n";
2283 let config = FromCsvConfig {
2284 list_key: None,
2285 ..Default::default()
2286 };
2287 let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
2288
2289 // Should use default pluralization
2290 assert!(doc.get("persons").is_some());
2291 assert!(doc.get("people").is_none());
2292 }
2293
2294 #[test]
2295 fn test_custom_list_key_default_config() {
2296 let csv_data = "id,name\n1,Alice\n";
2297 let doc = from_csv(csv_data, "User", &["name"]).unwrap();
2298
2299 // Default should use simple pluralization
2300 assert!(doc.get("users").is_some());
2301 }
2302
2303 #[test]
2304 fn test_custom_list_key_preserves_type_name() {
2305 let csv_data = "id,name\n1,Alice\n";
2306 let config = FromCsvConfig {
2307 list_key: Some("people".to_string()),
2308 ..Default::default()
2309 };
2310 let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
2311
2312 let list = doc.get("people").unwrap().as_list().unwrap();
2313 // Type name should still be "Person", not "people"
2314 assert_eq!(list.type_name, "Person");
2315 }
2316
2317 #[test]
2318 fn test_custom_list_key_with_multiple_types() {
2319 // This test ensures each call can have its own list_key
2320 let csv1 = "id,name\n1,Alice\n";
2321 let config1 = FromCsvConfig {
2322 list_key: Some("people".to_string()),
2323 ..Default::default()
2324 };
2325 let doc1 = from_csv_with_config(csv1, "Person", &["name"], config1).unwrap();
2326
2327 let csv2 = "id,name\n1,Fluffy\n";
2328 let config2 = FromCsvConfig {
2329 list_key: Some("mice".to_string()),
2330 ..Default::default()
2331 };
2332 let doc2 = from_csv_with_config(csv2, "Mouse", &["name"], config2).unwrap();
2333
2334 assert!(doc1.get("people").is_some());
2335 assert!(doc1.get("persons").is_none());
2336
2337 assert!(doc2.get("mice").is_some());
2338 assert!(doc2.get("mouses").is_none());
2339 }
2340
2341 #[test]
2342 fn test_custom_list_key_numbers_in_name() {
2343 let csv_data = "id,value\n1,test\n";
2344 let config = FromCsvConfig {
2345 list_key: Some("items_v2".to_string()),
2346 ..Default::default()
2347 };
2348 let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
2349
2350 assert!(doc.get("items_v2").is_some());
2351 }
2352
2353 #[test]
2354 fn test_custom_list_key_round_trip_compatibility() {
2355 // Ensure custom list keys work with to_csv_list
2356 let csv_data = "id,name\n1,Alice\n2,Bob\n";
2357 let config = FromCsvConfig {
2358 list_key: Some("people".to_string()),
2359 ..Default::default()
2360 };
2361 let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
2362
2363 // Export the list using the custom key
2364 use crate::to_csv_list;
2365 let exported_csv = to_csv_list(&doc, "people").unwrap();
2366 assert!(exported_csv.contains("Alice"));
2367 assert!(exported_csv.contains("Bob"));
2368
2369 // Should not be accessible via default key
2370 assert!(to_csv_list(&doc, "persons").is_err());
2371 }
2372
2373 #[test]
2374 fn test_from_csv_config_clone_with_list_key() {
2375 let config = FromCsvConfig {
2376 delimiter: b',',
2377 has_headers: true,
2378 trim: true,
2379 max_rows: 1000,
2380 infer_schema: false,
2381 sample_rows: 50,
2382 list_key: Some("people".to_string()),
2383 max_columns: DEFAULT_MAX_COLUMNS,
2384 max_cell_size: DEFAULT_MAX_CELL_SIZE,
2385 max_total_size: DEFAULT_MAX_TOTAL_SIZE,
2386 max_header_size: DEFAULT_MAX_HEADER_SIZE,
2387 };
2388 let cloned = config.clone();
2389 assert_eq!(cloned.list_key, Some("people".to_string()));
2390 }
2391
2392 #[test]
2393 fn test_from_csv_config_debug_with_list_key() {
2394 let config = FromCsvConfig {
2395 list_key: Some("people".to_string()),
2396 ..Default::default()
2397 };
2398 let debug = format!("{config:?}");
2399 assert!(debug.contains("list_key"));
2400 assert!(debug.contains("people"));
2401 }
2402
2403 // ==================== Security Limit Tests ====================
2404
2405 #[test]
2406 fn test_from_csv_config_default_security_limits() {
2407 let config = FromCsvConfig::default();
2408 assert_eq!(config.max_columns, DEFAULT_MAX_COLUMNS);
2409 assert_eq!(config.max_cell_size, DEFAULT_MAX_CELL_SIZE);
2410 assert_eq!(config.max_total_size, DEFAULT_MAX_TOTAL_SIZE);
2411 assert_eq!(config.max_header_size, DEFAULT_MAX_HEADER_SIZE);
2412 }
2413
2414 #[test]
2415 fn test_from_csv_config_clone_with_security_limits() {
2416 let config = FromCsvConfig {
2417 max_columns: 5_000,
2418 max_cell_size: 2_000_000,
2419 max_total_size: 200_000_000,
2420 max_header_size: 2_000_000,
2421 ..Default::default()
2422 };
2423 let cloned = config.clone();
2424 assert_eq!(cloned.max_columns, 5_000);
2425 assert_eq!(cloned.max_cell_size, 2_000_000);
2426 assert_eq!(cloned.max_total_size, 200_000_000);
2427 assert_eq!(cloned.max_header_size, 2_000_000);
2428 }
2429
2430 #[test]
2431 fn test_from_csv_config_unlimited() {
2432 let config = FromCsvConfig::unlimited();
2433 assert_eq!(config.max_rows, usize::MAX);
2434 assert_eq!(config.max_columns, usize::MAX);
2435 assert_eq!(config.max_cell_size, usize::MAX);
2436 assert_eq!(config.max_total_size, usize::MAX);
2437 assert_eq!(config.max_header_size, usize::MAX);
2438 }
2439
2440 #[test]
2441 fn test_from_csv_config_strict() {
2442 let config = FromCsvConfig::strict();
2443 assert_eq!(config.max_rows, 1_000_000);
2444 assert_eq!(config.max_columns, 1_000);
2445 assert_eq!(config.max_cell_size, 65_536);
2446 assert_eq!(config.max_total_size, 10_485_760);
2447 assert_eq!(config.max_header_size, 65_536);
2448 }
2449
2450 #[test]
2451 fn test_column_count_limit_enforcement() {
2452 // Create CSV with 11,000 columns (exceeds default 10,000)
2453 let mut csv = String::from("col0");
2454 for i in 1..11_000 {
2455 csv.push_str(&format!(",col{i}"));
2456 }
2457 csv.push('\n');
2458 csv.push_str("a,");
2459 csv.push_str(&"b,".repeat(10_999));
2460 csv.push('b');
2461
2462 let result = from_csv_with_config(&csv, "Item", &[], FromCsvConfig::default());
2463
2464 assert!(result.is_err());
2465 let err = result.unwrap_err();
2466 assert!(matches!(err, CsvError::Security { .. }));
2467 assert!(err.to_string().contains("exceeds limit"));
2468 }
2469
2470 #[test]
2471 fn test_cell_size_limit_enforcement() {
2472 // Create CSV with 2MB cell (exceeds default 1MB)
2473 let huge_cell = "x".repeat(2_000_000);
2474 let csv = format!("id,data\n1,\"{huge_cell}\"\n");
2475
2476 let result = from_csv_with_config(&csv, "Item", &["data"], FromCsvConfig::default());
2477
2478 assert!(result.is_err());
2479 let err = result.unwrap_err();
2480 assert!(matches!(err, CsvError::Security { .. }));
2481 // Check that the error message contains information about the limit
2482 let err_msg = err.to_string();
2483 assert!(err_msg.contains("exceeds limit") || err_msg.contains("Security"));
2484 }
2485
2486 #[test]
2487 fn test_total_size_limit_enforcement() {
2488 // Create CSV with 110MB total data (exceeds default 100MB)
2489 let mut csv = String::from("id,data\n");
2490 let row_data = "x".repeat(100_000); // 100KB per row
2491
2492 for i in 0..1_100 {
2493 csv.push_str(&format!("{i},\"{row_data}\"\n"));
2494 }
2495
2496 let result = from_csv_with_config(&csv, "Item", &["data"], FromCsvConfig::default());
2497
2498 assert!(result.is_err());
2499 let err = result.unwrap_err();
2500 assert!(matches!(err, CsvError::Security { .. }));
2501 assert!(err.to_string().contains("total size"));
2502 }
2503
2504 #[test]
2505 fn test_header_size_limit_enforcement() {
2506 // Create CSV with 2MB total header size (exceeds default 1MB)
2507 let mut csv = String::new();
2508 for i in 0..20_000 {
2509 if i > 0 {
2510 csv.push(',');
2511 }
2512 csv.push_str(&format!("column_{i}_very_long_name_{i}"));
2513 }
2514 csv.push_str("\n1\n");
2515
2516 let result = from_csv_with_config(&csv, "Item", &[], FromCsvConfig::default());
2517
2518 assert!(result.is_err());
2519 let err = result.unwrap_err();
2520 assert!(matches!(err, CsvError::Security { .. }));
2521 // Check that the error message contains information about the limit
2522 let err_msg = err.to_string();
2523 assert!(err_msg.contains("exceeds limit") || err_msg.contains("Security"));
2524 }
2525
2526 #[test]
2527 fn test_normal_csv_within_limits() {
2528 // Normal CSV should work fine with default limits
2529 let csv_data = "id,name,age\n1,Alice,30\n2,Bob,25\n";
2530
2531 let result = from_csv_with_config(
2532 csv_data,
2533 "Person",
2534 &["name", "age"],
2535 FromCsvConfig::default(),
2536 );
2537
2538 assert!(result.is_ok());
2539 }
2540
2541 #[test]
2542 #[allow(clippy::needless_borrow)]
2543 fn test_unlimited_config_allows_large_csvs() {
2544 // Verify that unlimited() config allows huge CSVs
2545 let huge_cell = "x".repeat(10_000_000);
2546 let csv = format!("id,data\n1,\"{huge_cell}\"\n");
2547
2548 let config = FromCsvConfig::unlimited();
2549 let result = from_csv_with_config(&csv, "Item", &["data"], config);
2550
2551 // Should succeed
2552 assert!(result.is_ok());
2553 }
2554
2555 #[test]
2556 #[allow(clippy::needless_borrow)]
2557 fn test_strict_config_blocks_large_cells() {
2558 // Even a moderately large cell should fail with strict config
2559 let csv = format!("id,data\n1,\"{}\"\n", "x".repeat(100_000));
2560
2561 let config = FromCsvConfig::strict();
2562 let result = from_csv_with_config(&csv, "Item", &["data"], config);
2563
2564 // Should fail - 100KB exceeds strict 64KB limit
2565 assert!(result.is_err());
2566 assert!(matches!(result.unwrap_err(), CsvError::Security { .. }));
2567 }
2568
2569 #[test]
2570 #[allow(clippy::needless_borrow)]
2571 fn test_strict_config_allows_small_csvs() {
2572 // Small CSV should work with strict config
2573 let csv = "id,data\n1,small_data\n";
2574
2575 let config = FromCsvConfig::strict();
2576 let result = from_csv_with_config(&csv, "Item", &["data"], config);
2577
2578 assert!(result.is_ok());
2579 }
2580}