// Dweve HEDL - Hierarchical Entity Data Language
//
// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file at the
// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
18//! Convert CSV files to HEDL documents.
19
20use crate::error::{CsvError, Result};
21use hedl_core::lex::parse_expression_token;
22use hedl_core::lex::parse_tensor;
23use hedl_core::{Document, Item, MatrixList, Node, Value};
24use std::io::Read;
25
/// Default maximum number of rows to prevent memory exhaustion.
///
/// This limit prevents Denial-of-Service attacks from maliciously large CSV files.
/// The default is 1 million rows, which allows processing reasonably large datasets
/// while preventing unbounded memory allocation.
///
/// # Security Considerations
///
/// - **Memory exhaustion**: Without a limit, attackers could provide CSV files with
///   billions of rows, causing the application to allocate excessive memory and crash.
/// - **Configurable**: The limit can be adjusted via `FromCsvConfig::max_rows` based on
///   deployment context and available resources.
/// - **Trade-off**: Higher limits allow larger datasets but increase `DoS` risk.
///
/// # Examples
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// // Use default 1M row limit
/// let config = FromCsvConfig::default();
/// assert_eq!(config.max_rows, 1_000_000);
///
/// // Increase limit for large dataset processing
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // 10 million rows
///     ..Default::default()
/// };
/// ```
pub const DEFAULT_MAX_ROWS: usize = 1_000_000;

/// Default maximum number of columns to prevent column bomb attacks.
///
/// This limit prevents Denial-of-Service attacks from CSV files with excessive columns.
/// The default is 10,000 columns, which is generous but prevents abuse.
///
/// # Security Considerations
///
/// - **Column bomb**: Without a limit, attackers could provide CSV files with
///   hundreds of thousands of columns, causing memory exhaustion and slow processing.
/// - **Industry standards**: Excel limits to 16,384 columns, Google Sheets to 18,278.
/// - **Trade-off**: Higher limits allow wider datasets but increase `DoS` risk.
pub const DEFAULT_MAX_COLUMNS: usize = 10_000;

/// Default maximum cell size in bytes to prevent cell bomb attacks.
///
/// This limit prevents Denial-of-Service attacks from CSV files with enormous cells.
/// The default is 1MB per cell, which is reasonable for most legitimate use cases.
///
/// # Security Considerations
///
/// - **Cell bomb**: Without a limit, attackers could provide CSV files with
///   gigabyte-sized cells, causing memory exhaustion.
/// - **Cumulative effect**: Multiple large cells multiply the impact.
/// - **Trade-off**: Higher limits allow larger text fields but increase `DoS` risk.
pub const DEFAULT_MAX_CELL_SIZE: usize = 1_048_576; // 1MB

/// Default maximum total CSV size in bytes to prevent decompression bombs.
///
/// This limit prevents Denial-of-Service attacks from compressed CSV files that
/// decompress to enormous sizes. The default is 100MB.
///
/// # Security Considerations
///
/// - **Decompression bomb**: A 1MB gzipped file could decompress to 1GB+.
/// - **Memory exhaustion**: Prevents attackers from filling server memory.
/// - **Trade-off**: Higher limits allow larger datasets but increase `DoS` risk.
pub const DEFAULT_MAX_TOTAL_SIZE: usize = 104_857_600; // 100MB

/// Default maximum header size in bytes to prevent header bombs.
///
/// This limit prevents Denial-of-Service attacks from CSV files with enormous headers.
/// The default is 1MB for the total header size.
///
/// # Security Considerations
///
/// - **Header bomb**: Prevents attackers from using huge column names.
/// - **Per-column**: Also enforced per-column via `max_cell_size`.
/// - **Trade-off**: Higher limits allow longer column names but increase `DoS` risk.
pub const DEFAULT_MAX_HEADER_SIZE: usize = 1_048_576; // 1MB
105
/// Configuration for CSV parsing.
///
/// This structure controls all aspects of CSV parsing behavior, including delimiters,
/// headers, whitespace handling, security limits, and custom list naming.
///
/// # Examples
///
/// ## Default Configuration
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig::default();
/// assert_eq!(config.delimiter, b',');
/// assert!(config.has_headers);
/// assert!(config.trim);
/// assert_eq!(config.max_rows, 1_000_000);
/// assert_eq!(config.list_key, None);
/// ```
///
/// ## Tab-Delimited without Headers
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     delimiter: b'\t',
///     has_headers: false,
///     ..Default::default()
/// };
/// ```
///
/// ## Custom Row Limit for Large Datasets
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // Allow up to 10M rows
///     ..Default::default()
/// };
/// ```
///
/// ## Disable Whitespace Trimming
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     trim: false,
///     ..Default::default()
/// };
/// ```
///
/// ## Enable Schema Inference
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     infer_schema: true,
///     sample_rows: 200, // Sample first 200 rows
///     ..Default::default()
/// };
/// ```
///
/// ## Custom List Key for Irregular Plurals
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// // For "Person" type, use "people" instead of default "persons"
/// let config = FromCsvConfig {
///     list_key: Some("people".to_string()),
///     ..Default::default()
/// };
/// ```
#[derive(Debug, Clone)]
pub struct FromCsvConfig {
    /// Field delimiter character (default: `,`).
    ///
    /// Common alternatives:
    /// - `b'\t'` - Tab-separated values (TSV)
    /// - `b';'` - Semicolon-separated (common in European locales)
    /// - `b'|'` - Pipe-separated
    pub delimiter: u8,

    /// Whether the first row contains column headers (default: `true`).
    ///
    /// When `true`, the first row is interpreted as column names and not included
    /// in the data. When `false`, all rows are treated as data.
    pub has_headers: bool,

    /// Whether to trim leading/trailing whitespace from fields (default: `true`).
    ///
    /// When `true`, fields like `" value "` become `"value"`. This is generally
    /// recommended to handle inconsistently formatted CSV files.
    pub trim: bool,

    /// Maximum number of rows to parse (default: 1,000,000).
    ///
    /// This security limit prevents memory exhaustion from maliciously large CSV files.
    /// Processing stops with an error if more rows are encountered.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from causing memory exhaustion
    /// - **Memory Bound**: Limits worst-case memory usage to approximately
    ///   `max_rows × avg_row_size × columns`
    /// - **Recommended Values**:
    ///   - Small deployments: 100,000 - 1,000,000 rows
    ///   - Large deployments: 1,000,000 - 10,000,000 rows
    ///   - Batch processing: Adjust based on available RAM
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing very large datasets on a high-memory server
    /// let config = FromCsvConfig {
    ///     max_rows: 50_000_000,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_rows: usize,

    /// Whether to automatically infer column types from data (default: `false`).
    ///
    /// When `true`, the parser samples the first `sample_rows` to determine the
    /// most specific type for each column. When `false`, uses standard per-value
    /// type inference.
    ///
    /// # Type Inference Hierarchy (most to least specific)
    ///
    /// 1. **Null**: All values are empty/null
    /// 2. **Bool**: All values are "true" or "false"
    /// 3. **Int**: All values parse as integers
    /// 4. **Float**: All values parse as floats
    /// 5. **String**: Fallback for all other cases
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// let config = FromCsvConfig {
    ///     infer_schema: true,
    ///     sample_rows: 100,
    ///     ..Default::default()
    /// };
    /// ```
    pub infer_schema: bool,

    /// Number of rows to sample for schema inference (default: 100).
    ///
    /// Only used when `infer_schema` is `true`. Larger sample sizes provide
    /// more accurate type detection but slower initial processing.
    ///
    /// # Trade-offs
    ///
    /// - **Small (10-50)**: Fast inference, may miss edge cases
    /// - **Medium (100-500)**: Balanced accuracy and performance
    /// - **Large (1000+)**: High accuracy, slower for large datasets
    pub sample_rows: usize,

    /// Custom key name for the matrix list in the document (default: `None`).
    ///
    /// When `None`, the list key is automatically generated by adding 's' to the
    /// lowercased type name (e.g., "Person" → "persons"). When `Some`, uses the
    /// specified custom key instead.
    ///
    /// # Use Cases
    ///
    /// - **Irregular Plurals**: "Person" → "people" instead of "persons"
    /// - **Collective Nouns**: "Data" → "dataset" instead of "datas"
    /// - **Custom Naming**: Any non-standard naming convention
    /// - **Case-Sensitive Keys**: Preserve specific casing requirements
    ///
    /// # Examples
    ///
    /// ## Irregular Plural
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,name\n1,Alice\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("people".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Person", &["name"], config).unwrap();
    /// assert!(doc.get("people").is_some()); // Uses custom plural
    /// assert!(doc.get("persons").is_none()); // Default plural not used
    /// ```
    ///
    /// ## Collective Noun
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,value\n1,42\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("dataset".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Data", &["value"], config).unwrap();
    /// assert!(doc.get("dataset").is_some());
    /// ```
    ///
    /// ## Case-Sensitive Key
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,value\n1,test\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("MyCustomList".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Item", &["value"], config).unwrap();
    /// assert!(doc.get("MyCustomList").is_some());
    /// ```
    pub list_key: Option<String>,

    /// Maximum number of columns allowed (default: 10,000).
    ///
    /// This security limit prevents "column bomb" attacks where malicious CSV files
    /// contain excessive columns that cause memory exhaustion and slow processing.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from creating CSVs with 50,000+ columns
    /// - **Memory Bound**: Limits worst-case memory usage for column metadata
    /// - **Industry Comparison**: Excel (16,384), Google Sheets (18,278), `PostgreSQL` (~1,600)
    /// - **Recommended Values**:
    ///   - Web uploads: 1,000 - 10,000 columns
    ///   - Internal processing: 10,000 - 50,000 columns
    ///   - Scientific data: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing wide scientific datasets
    /// let config = FromCsvConfig {
    ///     max_columns: 50_000,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_columns: usize,

    /// Maximum size of a single cell in bytes (default: 1MB).
    ///
    /// This security limit prevents "cell bomb" attacks where malicious CSV files
    /// contain enormous individual cells that cause memory exhaustion.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from using 10MB+ cells
    /// - **Memory Bound**: Each cell is read into memory as a String
    /// - **Cumulative**: Multiple large cells multiply the impact
    /// - **Recommended Values**:
    ///   - Web uploads: 64KB - 1MB
    ///   - Internal processing: 1MB - 10MB
    ///   - Text-heavy data: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing long text fields (e.g., descriptions, comments)
    /// let config = FromCsvConfig {
    ///     max_cell_size: 5_242_880, // 5MB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_cell_size: usize,

    /// Maximum total CSV size in bytes after decompression (default: 100MB).
    ///
    /// This security limit prevents "decompression bomb" attacks where compressed
    /// CSV files decompress to enormous sizes. A 1MB gzipped file could decompress
    /// to 1GB+, bypassing file size checks.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents decompression bombs
    /// - **Memory Bound**: Tracks total bytes read during parsing
    /// - **Transparent**: Works even if CSV library handles decompression
    /// - **Recommended Values**:
    ///   - Web uploads: 10MB - 100MB
    ///   - Internal processing: 100MB - 1GB
    ///   - Big data: Adjust based on available RAM
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing large datasets on high-memory servers
    /// let config = FromCsvConfig {
    ///     max_total_size: 1_073_741_824, // 1GB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_total_size: usize,

    /// Maximum size of header row in bytes (default: 1MB).
    ///
    /// This security limit prevents "header bomb" attacks where malicious CSV files
    /// have enormous column names or excessive total header size.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents huge column names (e.g., 1MB per column)
    /// - **Memory Bound**: Limits memory for header parsing
    /// - **Combined with `max_columns`**: Total size = `column_count` × `avg_name_length`
    /// - **Recommended Values**:
    ///   - Web uploads: 64KB - 1MB
    ///   - Internal processing: 1MB - 10MB
    ///   - Verbose column naming: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For datasets with very descriptive column names
    /// let config = FromCsvConfig {
    ///     max_header_size: 5_242_880, // 5MB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_header_size: usize,
}
429
430impl Default for FromCsvConfig {
431 fn default() -> Self {
432 Self {
433 delimiter: b',',
434 has_headers: true,
435 trim: true,
436 max_rows: DEFAULT_MAX_ROWS,
437 infer_schema: false,
438 sample_rows: 100,
439 list_key: None,
440 max_columns: DEFAULT_MAX_COLUMNS,
441 max_cell_size: DEFAULT_MAX_CELL_SIZE,
442 max_total_size: DEFAULT_MAX_TOTAL_SIZE,
443 max_header_size: DEFAULT_MAX_HEADER_SIZE,
444 }
445 }
446}
447
448impl FromCsvConfig {
449 /// Creates a config with NO security limits (use for trusted input only).
450 ///
451 /// # Security Warning
452 ///
453 /// This configuration disables ALL security limits. Only use this for:
454 /// - Trusted internal data sources
455 /// - Controlled batch processing environments
456 /// - Known-good CSV files
457 ///
458 /// **DO NOT** use this for:
459 /// - User uploads
460 /// - Web service inputs
461 /// - Untrusted data sources
462 ///
463 /// # Examples
464 ///
465 /// ```
466 /// # use hedl_csv::FromCsvConfig;
467 /// // For internal batch processing with trusted data
468 /// let config = FromCsvConfig::unlimited();
469 /// ```
470 #[must_use]
471 pub fn unlimited() -> Self {
472 Self {
473 max_rows: usize::MAX,
474 max_columns: usize::MAX,
475 max_cell_size: usize::MAX,
476 max_total_size: usize::MAX,
477 max_header_size: usize::MAX,
478 ..Default::default()
479 }
480 }
481
482 /// Creates a config with strict limits for untrusted input.
483 ///
484 /// # Security
485 ///
486 /// This configuration provides stricter limits suitable for:
487 /// - Web service uploads
488 /// - User-submitted CSV files
489 /// - Untrusted data sources
490 /// - Rate-limited APIs
491 ///
492 /// # Limits
493 ///
494 /// - `max_rows`: 1,000,000 (same as default)
495 /// - `max_columns`: 1,000 (stricter than default 10,000)
496 /// - `max_cell_size`: 64KB (stricter than default 1MB)
497 /// - `max_total_size`: 10MB (stricter than default 100MB)
498 /// - `max_header_size`: 64KB (stricter than default 1MB)
499 ///
500 /// # Examples
501 ///
502 /// ```
503 /// # use hedl_csv::FromCsvConfig;
504 /// // For user uploads in a web service
505 /// let config = FromCsvConfig::strict();
506 /// ```
507 #[must_use]
508 pub fn strict() -> Self {
509 Self {
510 max_rows: 1_000_000,
511 max_columns: 1_000,
512 max_cell_size: 65_536,
513 max_total_size: 10_485_760,
514 max_header_size: 65_536,
515 ..Default::default()
516 }
517 }
518}
519
/// Parse CSV string into a HEDL document with default configuration.
///
/// This is the primary entry point for CSV parsing. It uses sensible defaults:
/// - Comma delimiter
/// - Headers expected in first row
/// - Whitespace trimming enabled
/// - 1 million row limit for security
///
/// # Arguments
///
/// * `csv` - The CSV string to parse
/// * `type_name` - The HEDL type name for rows (e.g., "Person")
/// * `schema` - Column names excluding the 'id' column (which is always first)
///
/// # Returns
///
/// A `Document` containing a single matrix list with the parsed data, or an error
/// if parsing fails.
///
/// # Errors
///
/// Returns `HedlError` in the following cases:
///
/// - `Syntax`: Malformed CSV records or invalid UTF-8
/// - `Schema`: Missing ID column or field count mismatch
/// - `Semantic`: Empty ID field
/// - `Security`: Row count exceeds maximum (default 1M rows)
///
/// # Type Inference
///
/// Values are automatically inferred from CSV text:
///
/// - Empty string or `~` → `Value::Null`
/// - `true`/`false` → `Value::Bool`
/// - Integer pattern → `Value::Int` (e.g., "42", "-123")
/// - Float pattern → `Value::Float` (e.g., "3.14", "1.5e10")
/// - Special floats: `NaN`, `Infinity`, `-Infinity`
/// - `@id` or `@Type:id` → `Value::Reference`
/// - `$(expr)` → `Value::Expression`
/// - `[1,2,3]` → `Value::Tensor`
/// - Otherwise → `Value::String`
///
/// # Examples
///
/// ## Basic Usage
///
/// ```
/// use hedl_csv::from_csv;
/// use hedl_core::Value;
///
/// let csv_data = "id,name,age\n1,Alice,30\n2,Bob,25";
/// let doc = from_csv(csv_data, "Person", &["name", "age"]).unwrap();
///
/// // Access the parsed data
/// let list = doc.get("persons").unwrap().as_list().unwrap();
/// assert_eq!(list.rows.len(), 2);
/// assert_eq!(list.rows[0].id, "1");
/// ```
///
/// ## Mixed Type Inference
///
/// ```
/// use hedl_csv::from_csv;
/// use hedl_core::Value;
///
/// let csv_data = "id,value\n1,42\n2,3.14\n3,true\n4,hello";
/// let doc = from_csv(csv_data, "Item", &["value"]).unwrap();
///
/// let list = doc.get("items").unwrap().as_list().unwrap();
/// assert!(matches!(list.rows[0].fields[1], Value::Int(42)));
/// assert!(matches!(list.rows[1].fields[1], Value::Float(f) if (f - 3.14).abs() < 0.001));
/// assert!(matches!(list.rows[2].fields[1], Value::Bool(true)));
/// assert!(matches!(list.rows[3].fields[1], Value::String(_)));
/// ```
///
/// ## References
///
/// ```
/// use hedl_csv::from_csv;
///
/// let csv_data = "id,owner\n1,@user1\n2,@User:alice";
/// let doc = from_csv(csv_data, "Item", &["owner"]).unwrap();
///
/// let list = doc.get("items").unwrap().as_list().unwrap();
/// let ref1 = list.rows[0].fields[1].as_reference().unwrap();
/// assert_eq!(&*ref1.id, "user1");
/// assert_eq!(ref1.type_name, None); // Local reference
///
/// let ref2 = list.rows[1].fields[1].as_reference().unwrap();
/// assert_eq!(&*ref2.id, "alice");
/// assert_eq!(ref2.type_name.as_deref(), Some("User")); // Qualified reference
/// ```
///
/// # Performance
///
/// - **Streaming**: Processes CSV row-by-row to minimize memory usage
/// - **Memory bound**: O(rows × columns) space complexity
/// - **Time complexity**: O(rows × columns) with efficient parsing
///
/// For very large files, consider using `from_csv_reader` for file I/O or
/// increasing `max_rows` via `from_csv_with_config`.
///
/// # See Also
///
/// - `from_csv_with_config` - For custom delimiters, row limits, etc.
/// - `from_csv_reader` - For parsing from files or network streams
pub fn from_csv(csv: &str, type_name: &str, schema: &[&str]) -> Result<Document> {
    // Thin convenience wrapper: delegate with the default configuration.
    from_csv_with_config(csv, type_name, schema, FromCsvConfig::default())
}
629
/// Parse CSV string into a HEDL document with custom configuration.
///
/// This function provides full control over CSV parsing behavior through `FromCsvConfig`.
///
/// # Arguments
///
/// * `csv` - The CSV string to parse
/// * `type_name` - The HEDL type name for rows
/// * `schema` - Column names excluding the 'id' column
/// * `config` - Configuration controlling delimiter, headers, trimming, and row limits
///
/// # Examples
///
/// ## Tab-Separated Values (TSV)
///
/// ```
/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
///
/// let tsv_data = "id\tname\tage\n1\tAlice\t30";
/// let config = FromCsvConfig {
///     delimiter: b'\t',
///     ..Default::default()
/// };
/// let doc = from_csv_with_config(tsv_data, "Person", &["name", "age"], config).unwrap();
/// ```
///
/// ## Custom Row Limit
///
/// ```
/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
///
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // Allow up to 10M rows
///     ..Default::default()
/// };
/// let csv_data = "id,value\n1,test";
/// let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
/// ```
///
/// ## Disable Whitespace Trimming
///
/// ```
/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
/// use hedl_core::Value;
///
/// let csv_data = "id,name\n1, Alice ";
/// let config = FromCsvConfig {
///     trim: false,
///     ..Default::default()
/// };
/// let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
///
/// let list = doc.get("persons").unwrap().as_list().unwrap();
/// assert_eq!(list.rows[0].fields[1], Value::String(" Alice ".to_string().into()));
/// ```
///
/// # See Also
///
/// - `from_csv` - Convenience function with default configuration
/// - `from_csv_reader_with_config` - For streaming from files/network
pub fn from_csv_with_config(
    csv: &str,
    type_name: &str,
    schema: &[&str],
    config: FromCsvConfig,
) -> Result<Document> {
    // All string parsing funnels through the reader-based implementation,
    // treating the string's bytes as the input stream.
    from_csv_reader_with_config(csv.as_bytes(), type_name, schema, config)
}
698
/// Parse CSV from a reader into a HEDL document with default configuration.
///
/// This function is useful for processing CSV files or network streams without
/// loading the entire content into memory first.
///
/// # Arguments
///
/// * `reader` - Any type implementing `Read` (e.g., `File`, `TcpStream`, `&[u8]`)
/// * `type_name` - The HEDL type name for rows
/// * `schema` - Column names excluding the 'id' column
///
/// # Examples
///
/// ## Reading from a File
///
/// ```no_run
/// use hedl_csv::from_csv_reader;
/// use std::fs::File;
///
/// let file = File::open("data.csv").unwrap();
/// let doc = from_csv_reader(file, "Person", &["name", "age"]).unwrap();
/// ```
///
/// ## Reading from a Byte Slice
///
/// ```
/// use hedl_csv::from_csv_reader;
///
/// let csv_bytes = b"id,name\n1,Alice";
/// let doc = from_csv_reader(&csv_bytes[..], "Person", &["name"]).unwrap();
/// ```
///
/// ## Reading from Standard Input
///
/// ```no_run
/// use hedl_csv::from_csv_reader;
/// use std::io;
///
/// let stdin = io::stdin();
/// let doc = from_csv_reader(stdin.lock(), "Record", &["field1", "field2"]).unwrap();
/// ```
///
/// # Performance
///
/// This function uses streaming I/O to minimize memory usage. The CSV data is
/// processed row-by-row without buffering the entire file.
///
/// # See Also
///
/// - `from_csv_reader_with_config` - For custom delimiters and limits
/// - `from_csv` - For parsing CSV strings
pub fn from_csv_reader<R: Read>(reader: R, type_name: &str, schema: &[&str]) -> Result<Document> {
    // Thin convenience wrapper: delegate with the default configuration.
    from_csv_reader_with_config(reader, type_name, schema, FromCsvConfig::default())
}
753
/// Inferred column type from sampling CSV data.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ColumnType {
    /// All sampled values are null/empty
    Null,
    /// All sampled values are "true" or "false"
    Bool,
    /// All sampled values parse as integers
    Int,
    /// All sampled values parse as floats (but not all as integers)
    Float,
    /// Default fallback for mixed or string data
    String,
}

/// Infer the type of a single column from sampled values.
///
/// Null-like entries (empty, `~`, or `null` after trimming) are skipped and do
/// not affect the result. Among the remaining values, the most specific type
/// that fits every one of them wins, in order: `Bool`, then `Int`, then
/// `Float`, with `String` as the fallback. A column containing only null-like
/// values (or nothing at all) is reported as `Null`.
///
/// # Arguments
///
/// * `values` - Iterator over string values from a column
///
/// # Examples
///
/// ```text
/// let values = vec!["1", "2", "3"];
/// let col_type = infer_column_type(values.iter().map(|s| s.as_str()));
/// assert_eq!(col_type, ColumnType::Int);
/// ```
fn infer_column_type<'a, I>(values: I) -> ColumnType
where
    I: Iterator<Item = &'a str>,
{
    // Each flag remains true only while every non-null value seen so far
    // fits the corresponding candidate type.
    let mut saw_value = false;
    let mut bool_ok = true;
    let mut int_ok = true;
    let mut float_ok = true;

    for raw in values {
        let v = raw.trim();

        // Null-like values never influence inference.
        if matches!(v, "" | "~" | "null") {
            continue;
        }
        saw_value = true;

        bool_ok &= matches!(v, "true" | "false");
        int_ok &= v.parse::<i64>().is_ok();
        float_ok &= v.parse::<f64>().is_ok();

        // Once every candidate is ruled out the answer cannot change.
        if !(bool_ok || int_ok || float_ok) {
            return ColumnType::String;
        }
    }

    // Report the most specific surviving candidate.
    if !saw_value {
        ColumnType::Null
    } else if bool_ok {
        ColumnType::Bool
    } else if int_ok {
        ColumnType::Int
    } else if float_ok {
        ColumnType::Float
    } else {
        ColumnType::String
    }
}
845
846/// Infer types for all columns by sampling CSV records.
847///
848/// # Arguments
849///
850/// * `records` - Slice of CSV records (each record is a Vec<String>)
851/// * `sample_size` - Maximum number of records to sample
852///
853/// # Returns
854///
855/// A vector of `ColumnType` for each column in the CSV.
856///
857/// # Examples
858///
859/// ```text
860/// let records = vec![
861/// vec!["1".to_string(), "Alice".to_string(), "30".to_string()],
862/// vec!["2".to_string(), "Bob".to_string(), "25".to_string()],
863/// ];
864/// let types = infer_column_types(&records, 100);
865/// assert_eq!(types, vec![ColumnType::Int, ColumnType::String, ColumnType::Int]);
866/// ```
867fn infer_column_types(records: &[Vec<String>], sample_size: usize) -> Vec<ColumnType> {
868 if records.is_empty() {
869 return Vec::new();
870 }
871
872 let num_columns = records[0].len();
873 let sample_count = sample_size.min(records.len());
874
875 (0..num_columns)
876 .map(|col_idx| {
877 let column_values = records
878 .iter()
879 .take(sample_count)
880 .filter_map(|row| row.get(col_idx).map(std::string::String::as_str));
881
882 infer_column_type(column_values)
883 })
884 .collect()
885}
886
887/// Parse a CSV value using a specific inferred type.
888///
889/// This function forces type conversion based on the inferred schema,
890/// falling back to string on conversion failure.
891///
892/// # Arguments
893///
894/// * `field` - The string value to parse
895/// * `col_type` - The inferred column type
896///
897/// # Returns
898///
899/// A HEDL `Value` of the specified type, or `Value::String` if conversion fails.
900fn parse_csv_value_with_type(field: &str, col_type: ColumnType) -> Result<Value> {
901 let trimmed = field.trim();
902
903 // Always handle null values regardless of inferred type
904 if trimmed.is_empty() || trimmed == "~" {
905 return Ok(Value::Null);
906 }
907
908 match col_type {
909 ColumnType::Null => Ok(Value::Null),
910 ColumnType::Bool => {
911 if trimmed == "true" {
912 Ok(Value::Bool(true))
913 } else if trimmed == "false" {
914 Ok(Value::Bool(false))
915 } else {
916 // Fallback to string if not a valid bool
917 Ok(Value::String(field.to_string().into()))
918 }
919 }
920 ColumnType::Int => {
921 if let Ok(n) = trimmed.parse::<i64>() {
922 Ok(Value::Int(n))
923 } else {
924 // Fallback to string if not a valid int
925 Ok(Value::String(field.to_string().into()))
926 }
927 }
928 ColumnType::Float => {
929 if let Ok(f) = trimmed.parse::<f64>() {
930 Ok(Value::Float(f))
931 } else {
932 // Fallback to string if not a valid float
933 Ok(Value::String(field.to_string().into()))
934 }
935 }
936 ColumnType::String => {
937 // Use the original parse_csv_value for full type detection
938 // (handles references, expressions, tensors, etc.)
939 parse_csv_value(field)
940 }
941 }
942}
943
944/// Validate CSV headers against security limits.
945///
946/// This function checks:
947/// - Column count does not exceed `max_columns`
948/// - Total header size does not exceed `max_header_size`
949/// - Individual column name size does not exceed `max_cell_size`
950///
951/// # Arguments
952///
953/// * `headers` - The CSV header record
954/// * `config` - Configuration containing security limits
955///
956/// # Returns
957///
958/// `Ok(())` if all checks pass, otherwise an error.
959fn validate_headers(headers: &csv::StringRecord, config: &FromCsvConfig) -> Result<()> {
960 // Check column count
961 let column_count = headers.len();
962 if column_count > config.max_columns {
963 return Err(CsvError::Security {
964 limit_type: "column count".to_string(),
965 limit: config.max_columns,
966 actual: column_count,
967 message: format!(
968 "CSV has {} columns, exceeds limit of {}",
969 column_count, config.max_columns
970 ),
971 });
972 }
973
974 // Check total header size
975 let header_size: usize = headers.iter().map(str::len).sum();
976 if header_size > config.max_header_size {
977 return Err(CsvError::Security {
978 limit_type: "header size".to_string(),
979 limit: config.max_header_size,
980 actual: header_size,
981 message: format!(
982 "CSV header size {} bytes, exceeds limit of {} bytes",
983 header_size, config.max_header_size
984 ),
985 });
986 }
987
988 // Check for individual column name size (prevent single huge name)
989 for (i, header) in headers.iter().enumerate() {
990 if header.len() > config.max_cell_size {
991 // Safely create preview by finding the last complete character before byte 100
992 let preview = if header.len() > 100 {
993 // Find the last character boundary before position 100
994 let mut preview_end = 100;
995 while !header.is_char_boundary(preview_end) && preview_end > 0 {
996 preview_end -= 1;
997 }
998 format!("{}...", &header[..preview_end])
999 } else {
1000 header.to_string()
1001 };
1002 return Err(CsvError::Security {
1003 limit_type: "column name size".to_string(),
1004 limit: config.max_cell_size,
1005 actual: header.len(),
1006 message: format!(
1007 "Column name '{}' at index {} is {} bytes, exceeds cell size limit of {} bytes",
1008 preview,
1009 i,
1010 header.len(),
1011 config.max_cell_size
1012 ),
1013 });
1014 }
1015 }
1016
1017 Ok(())
1018}
1019
1020/// Validate a single cell against security limits.
1021///
1022/// This function checks that the cell size does not exceed `max_cell_size`.
1023///
1024/// # Arguments
1025///
1026/// * `cell` - The cell content to validate
1027/// * `row` - Row number (1-based, for error messages)
1028/// * `column` - Column index (0-based, for error messages)
1029/// * `config` - Configuration containing security limits
1030///
1031/// # Returns
1032///
1033/// `Ok(())` if the cell is within limits, otherwise an error.
1034fn validate_cell(cell: &str, row: usize, column: usize, config: &FromCsvConfig) -> Result<()> {
1035 if cell.len() > config.max_cell_size {
1036 // Safely create preview by finding the last complete character before byte 100
1037 let preview = if cell.len() > 100 {
1038 // Find the last character boundary before position 100
1039 let mut preview_end = 100;
1040 while !cell.is_char_boundary(preview_end) && preview_end > 0 {
1041 preview_end -= 1;
1042 }
1043 format!("{}...", &cell[..preview_end])
1044 } else {
1045 cell.to_string()
1046 };
1047 return Err(CsvError::Security {
1048 limit_type: "cell size".to_string(),
1049 limit: config.max_cell_size,
1050 actual: cell.len(),
1051 message: format!(
1052 "Cell at row {}, column {} is {} bytes, exceeds limit of {} bytes. Content preview: '{}'",
1053 row,
1054 column,
1055 cell.len(),
1056 config.max_cell_size,
1057 preview
1058 ),
1059 });
1060 }
1061 Ok(())
1062}
1063
/// Tracker for CSV size during parsing.
///
/// This struct tracks the total bytes read during CSV parsing to prevent
/// decompression bomb attacks.
struct CsvSizeTracker {
    // Running total of field bytes seen so far (header bytes are added
    // directly by the caller; records via `track_record`).
    bytes_read: usize,
    // Inclusive ceiling; `track_record` errors once `bytes_read` exceeds it.
    max_total_size: usize,
}
1072
1073impl CsvSizeTracker {
1074 /// Create a new size tracker with the specified maximum.
1075 fn new(max_total_size: usize) -> Self {
1076 Self {
1077 bytes_read: 0,
1078 max_total_size,
1079 }
1080 }
1081
1082 /// Track a record and check if the total size exceeds the limit.
1083 ///
1084 /// # Arguments
1085 ///
1086 /// * `record` - The CSV record to track
1087 ///
1088 /// # Returns
1089 ///
1090 /// `Ok(())` if within limits, otherwise an error.
1091 fn track_record(&mut self, record: &csv::StringRecord) -> Result<()> {
1092 let record_size: usize = record.iter().map(str::len).sum();
1093 self.bytes_read += record_size;
1094
1095 if self.bytes_read > self.max_total_size {
1096 return Err(CsvError::Security {
1097 limit_type: "total size".to_string(),
1098 limit: self.max_total_size,
1099 actual: self.bytes_read,
1100 message: format!(
1101 "CSV total size {} bytes exceeds limit of {} bytes",
1102 self.bytes_read, self.max_total_size
1103 ),
1104 });
1105 }
1106
1107 Ok(())
1108 }
1109
1110 /// Get the current total bytes read.
1111 #[allow(dead_code)]
1112 fn bytes_read(&self) -> usize {
1113 self.bytes_read
1114 }
1115}
1116
1117/// Parse CSV from a reader into a HEDL document with custom configuration.
1118///
1119/// This is the most flexible CSV parsing function, supporting both custom I/O sources
1120/// and custom parsing configuration.
1121///
1122/// # Arguments
1123///
1124/// * `reader` - Any type implementing `Read`
1125/// * `type_name` - The HEDL type name for rows
1126/// * `schema` - Column names excluding the 'id' column
1127/// * `config` - Configuration controlling all parsing behavior
1128///
1129/// # Examples
1130///
1131/// ## Large File with Custom Limit
1132///
1133/// ```no_run
1134/// use hedl_csv::{from_csv_reader_with_config, FromCsvConfig};
1135/// use std::fs::File;
1136///
1137/// let file = File::open("large_dataset.csv").unwrap();
1138/// let config = FromCsvConfig {
1139/// max_rows: 50_000_000, // 50M rows for high-memory server
1140/// ..Default::default()
1141/// };
1142/// let doc = from_csv_reader_with_config(file, "Record", &["value"], config).unwrap();
1143/// ```
1144///
1145/// ## TSV from Network Stream
1146///
1147/// ```no_run
1148/// use hedl_csv::{from_csv_reader_with_config, FromCsvConfig};
1149/// use std::net::TcpStream;
1150///
1151/// let stream = TcpStream::connect("example.com:8080").unwrap();
1152/// let config = FromCsvConfig {
1153/// delimiter: b'\t',
1154/// ..Default::default()
1155/// };
1156/// let doc = from_csv_reader_with_config(stream, "Data", &["col1", "col2"], config).unwrap();
1157/// ```
1158///
1159/// # Implementation Details
1160///
1161/// The function performs the following steps:
1162///
1163/// 1. Creates a CSV reader with the specified configuration
1164/// 2. Initializes a new HEDL document with version (1, 0)
1165/// 3. Constructs the full schema (ID column + provided columns)
1166/// 4. Registers the struct type in the document
1167/// 5. Iterates through CSV records:
1168/// - Checks row count against `max_rows` security limit
1169/// - Parses each field using type inference
1170/// - Validates field count matches schema
1171/// - Creates `Node` instances and adds to matrix list
1172/// 6. Inserts the completed matrix list into the document
1173///
1174/// # See Also
1175///
1176/// - `from_csv_with_config` - For parsing CSV strings
1177/// - `FromCsvConfig` - Configuration options documentation
pub fn from_csv_reader_with_config<R: Read>(
    reader: R,
    type_name: &str,
    schema: &[&str],
    config: FromCsvConfig,
) -> Result<Document> {
    // Build the CSV reader from the config; trimming is all-or-nothing.
    let mut csv_reader = csv::ReaderBuilder::new()
        .delimiter(config.delimiter)
        .has_headers(config.has_headers)
        .trim(if config.trim {
            csv::Trim::All
        } else {
            csv::Trim::None
        })
        .from_reader(reader);

    let mut doc = Document::new((1, 0));

    // Create schema with 'id' column (prepended; callers pass only the rest)
    let mut full_schema = vec!["id".to_string()];
    full_schema.extend(schema.iter().map(|s| (*s).to_string()));

    // Register the struct type
    doc.structs
        .insert(type_name.to_string(), full_schema.clone());

    // Create matrix list
    let mut matrix_list = MatrixList::new(type_name, full_schema.clone());

    // VALIDATE HEADERS if has_headers is enabled
    // NOTE(review): the csv crate's `headers()` returns the first record even
    // when `has_headers` is false; in that case the first data row is
    // validated as if it were a header, and its bytes are counted both here
    // and again by `track_record` in the loop below — confirm this
    // double-count toward `max_total_size` is intended.
    let headers = csv_reader.headers().map_err(|e| CsvError::ParseError {
        line: 0,
        message: e.to_string(),
    })?;

    validate_headers(headers, &config)?;

    // Initialize size tracker
    let mut size_tracker = CsvSizeTracker::new(config.max_total_size);

    // Track header size
    let header_size: usize = headers.iter().map(str::len).sum();
    size_tracker.bytes_read += header_size;

    // If schema inference is enabled, collect records first.
    // Inference needs a sampling pass before typing, so the whole file is
    // buffered in memory; the `max_rows` check inside the loop bounds that
    // buffer.
    let _inferred_types = if config.infer_schema {
        // Collect records for sampling
        let mut all_records = Vec::new();
        for (record_idx, result) in csv_reader.records().enumerate() {
            // Security: Limit row count to prevent memory exhaustion
            if record_idx >= config.max_rows {
                return Err(CsvError::SecurityLimit {
                    limit: config.max_rows,
                    actual: record_idx + 1,
                });
            }

            let record = result.map_err(|e| CsvError::ParseError {
                line: record_idx + 1,
                message: e.to_string(),
            })?;

            // Skip fully empty records rather than erroring on them.
            if record.is_empty() {
                continue;
            }

            // VALIDATE TOTAL SIZE
            size_tracker.track_record(&record)?;

            // VALIDATE EACH CELL
            for (col_idx, cell) in record.iter().enumerate() {
                validate_cell(cell, record_idx + 1, col_idx, &config)?;
            }

            // Convert StringRecord to Vec<String>
            let row: Vec<String> = record
                .iter()
                .map(std::string::ToString::to_string)
                .collect();
            all_records.push(row);
        }

        // Infer column types from sampled records
        let types = infer_column_types(&all_records, config.sample_rows);

        // Process all records with inferred types
        for (record_idx, row) in all_records.iter().enumerate() {
            // First column is the ID
            let id = row
                .first()
                .ok_or_else(|| CsvError::MissingColumn("id".to_string()))?;

            if id.is_empty() {
                return Err(CsvError::EmptyId {
                    row: record_idx + 1,
                });
            }

            // Parse ALL fields (including ID) with inferred types
            let mut fields = Vec::new();
            for (field_idx, field) in row.iter().enumerate() {
                // Columns beyond the inferred set default to String handling.
                let col_type = types.get(field_idx).copied().unwrap_or(ColumnType::String);
                let value = parse_csv_value_with_type(field, col_type).map_err(|e| {
                    e.with_context(format!(
                        "in column '{}' at line {}",
                        full_schema.get(field_idx).unwrap_or(&"unknown".to_string()),
                        record_idx + 1
                    ))
                })?;
                fields.push(value);
            }

            // Check field count matches full schema (including ID)
            if fields.len() != full_schema.len() {
                return Err(CsvError::WidthMismatch {
                    expected: full_schema.len(),
                    actual: fields.len(),
                    row: record_idx + 1,
                });
            }

            let node = Node::new(type_name, id, fields);
            matrix_list.add_row(node);
        }

        types
    } else {
        // Standard parsing without schema inference (single streaming pass;
        // mirrors the checks above but types each cell independently).
        for (record_idx, result) in csv_reader.records().enumerate() {
            // Security: Limit row count to prevent memory exhaustion
            if record_idx >= config.max_rows {
                return Err(CsvError::SecurityLimit {
                    limit: config.max_rows,
                    actual: record_idx + 1,
                });
            }

            let record = result.map_err(|e| CsvError::ParseError {
                line: record_idx + 1,
                message: e.to_string(),
            })?;

            if record.is_empty() {
                continue;
            }

            // VALIDATE TOTAL SIZE
            size_tracker.track_record(&record)?;

            // VALIDATE EACH CELL
            for (col_idx, cell) in record.iter().enumerate() {
                validate_cell(cell, record_idx + 1, col_idx, &config)?;
            }

            // First column is the ID
            let id = record
                .get(0)
                .ok_or_else(|| CsvError::MissingColumn("id".to_string()))?;

            if id.is_empty() {
                return Err(CsvError::EmptyId {
                    row: record_idx + 1,
                });
            }

            // Parse ALL fields (including ID) per SPEC
            let mut fields = Vec::new();
            for (field_idx, field) in record.iter().enumerate() {
                let value = parse_csv_value(field).map_err(|e| {
                    e.with_context(format!(
                        "in column '{}' at line {}",
                        full_schema.get(field_idx).unwrap_or(&"unknown".to_string()),
                        record_idx + 1
                    ))
                })?;
                fields.push(value);
            }

            // Check field count matches full schema (including ID)
            if fields.len() != full_schema.len() {
                return Err(CsvError::WidthMismatch {
                    expected: full_schema.len(),
                    actual: fields.len(),
                    row: record_idx + 1,
                });
            }

            let node = Node::new(type_name, id, fields);
            matrix_list.add_row(node);
        }

        Vec::new()
    };

    // Add matrix list to document with custom or default key
    // (default key is the lowercased type name naively pluralized with "s").
    let list_key = config
        .list_key
        .unwrap_or_else(|| format!("{}s", type_name.to_lowercase()));

    doc.root.insert(list_key, Item::List(matrix_list));

    Ok(doc)
}
1381
1382/// Parse a CSV field value into a HEDL Value.
1383///
1384/// Type inference rules:
1385/// - Empty string → Null
1386/// - "true" or "false" → Bool
1387/// - Integer pattern → Int
1388/// - Float pattern → Float
1389/// - Reference pattern (@...) → Reference
1390/// - Expression pattern $(...) → Expression
1391/// - Otherwise → String
1392fn parse_csv_value(field: &str) -> Result<Value> {
1393 let trimmed = field.trim();
1394
1395 // Empty or null
1396 if trimmed.is_empty() || trimmed == "~" {
1397 return Ok(Value::Null);
1398 }
1399
1400 // Boolean
1401 if trimmed == "true" {
1402 return Ok(Value::Bool(true));
1403 }
1404 if trimmed == "false" {
1405 return Ok(Value::Bool(false));
1406 }
1407
1408 // Special float values
1409 match trimmed {
1410 "NaN" => return Ok(Value::Float(f64::NAN)),
1411 "Infinity" => return Ok(Value::Float(f64::INFINITY)),
1412 "-Infinity" => return Ok(Value::Float(f64::NEG_INFINITY)),
1413 _ => {}
1414 }
1415
1416 // Reference
1417 if trimmed.starts_with('@') {
1418 return parse_reference(trimmed);
1419 }
1420
1421 // Expression
1422 if trimmed.starts_with("$(") && trimmed.ends_with(')') {
1423 let expr = parse_expression_token(trimmed).map_err(|e| CsvError::ParseError {
1424 line: 0,
1425 message: format!("Invalid expression: {e}"),
1426 })?;
1427 return Ok(Value::Expression(Box::new(expr)));
1428 }
1429
1430 // Try integer
1431 if let Ok(n) = trimmed.parse::<i64>() {
1432 return Ok(Value::Int(n));
1433 }
1434
1435 // Try float
1436 if let Ok(f) = trimmed.parse::<f64>() {
1437 return Ok(Value::Float(f));
1438 }
1439
1440 // Tensor literal (starts with '[' and ends with ']')
1441 if trimmed.starts_with('[') && trimmed.ends_with(']') {
1442 if let Ok(tensor) = parse_tensor(trimmed) {
1443 return Ok(Value::Tensor(Box::new(tensor)));
1444 }
1445 // If parsing fails, fall through to string
1446 }
1447
1448 // Default to string
1449 Ok(Value::String(field.to_string().into()))
1450}
1451
1452/// Parse a reference string (e.g., "@user1" or "@User:user1").
1453fn parse_reference(s: &str) -> Result<Value> {
1454 let without_at = &s[1..];
1455
1456 if let Some(colon_pos) = without_at.find(':') {
1457 // Qualified reference: @Type:id
1458 let type_name = &without_at[..colon_pos];
1459 let id = &without_at[colon_pos + 1..];
1460
1461 if type_name.is_empty() || id.is_empty() {
1462 return Err(CsvError::ParseError {
1463 line: 0,
1464 message: format!("Invalid reference format: {s}"),
1465 });
1466 }
1467
1468 Ok(Value::Reference(hedl_core::Reference::qualified(
1469 type_name, id,
1470 )))
1471 } else {
1472 // Local reference: @id
1473 if without_at.is_empty() {
1474 return Err(CsvError::ParseError {
1475 line: 0,
1476 message: "Empty reference ID".to_string(),
1477 });
1478 }
1479
1480 Ok(Value::Reference(hedl_core::Reference::local(without_at)))
1481 }
1482}
1483
1484#[cfg(test)]
1485mod tests {
1486 use super::*;
1487 use hedl_core::lex::Tensor;
1488 use hedl_test::expr_value;
1489
1490 // ==================== FromCsvConfig tests ====================
1491
1492 #[test]
1493 fn test_from_csv_config_default() {
1494 let config = FromCsvConfig::default();
1495 assert_eq!(config.delimiter, b',');
1496 assert!(config.has_headers);
1497 assert!(config.trim);
1498 assert_eq!(config.max_rows, DEFAULT_MAX_ROWS);
1499 }
1500
1501 #[test]
1502 fn test_from_csv_config_debug() {
1503 let config = FromCsvConfig::default();
1504 let debug = format!("{config:?}");
1505 assert!(debug.contains("FromCsvConfig"));
1506 assert!(debug.contains("delimiter"));
1507 assert!(debug.contains("has_headers"));
1508 assert!(debug.contains("trim"));
1509 }
1510
1511 #[test]
1512 fn test_from_csv_config_clone() {
1513 let config = FromCsvConfig {
1514 delimiter: b'\t',
1515 has_headers: false,
1516 trim: false,
1517 max_rows: 500_000,
1518 infer_schema: false,
1519 sample_rows: 100,
1520 list_key: None,
1521 max_columns: 5_000,
1522 max_cell_size: 2_000_000,
1523 max_total_size: 200_000_000,
1524 max_header_size: 2_000_000,
1525 };
1526 let cloned = config.clone();
1527 assert_eq!(cloned.delimiter, b'\t');
1528 assert!(!cloned.has_headers);
1529 assert!(!cloned.trim);
1530 assert_eq!(cloned.max_rows, 500_000);
1531 assert!(!cloned.infer_schema);
1532 assert_eq!(cloned.sample_rows, 100);
1533 assert_eq!(cloned.list_key, None);
1534 assert_eq!(cloned.max_columns, 5_000);
1535 assert_eq!(cloned.max_cell_size, 2_000_000);
1536 assert_eq!(cloned.max_total_size, 200_000_000);
1537 assert_eq!(cloned.max_header_size, 2_000_000);
1538 }
1539
1540 #[test]
1541 fn test_from_csv_config_all_options() {
1542 let config = FromCsvConfig {
1543 delimiter: b';',
1544 has_headers: true,
1545 trim: true,
1546 max_rows: 2_000_000,
1547 infer_schema: true,
1548 sample_rows: 200,
1549 list_key: Some("custom".to_string()),
1550 max_columns: 15_000,
1551 max_cell_size: 3_000_000,
1552 max_total_size: 300_000_000,
1553 max_header_size: 3_000_000,
1554 };
1555 assert_eq!(config.delimiter, b';');
1556 assert!(config.has_headers);
1557 assert!(config.trim);
1558 assert_eq!(config.max_rows, 2_000_000);
1559 assert!(config.infer_schema);
1560 assert_eq!(config.sample_rows, 200);
1561 assert_eq!(config.list_key, Some("custom".to_string()));
1562 assert_eq!(config.max_columns, 15_000);
1563 assert_eq!(config.max_cell_size, 3_000_000);
1564 assert_eq!(config.max_total_size, 300_000_000);
1565 assert_eq!(config.max_header_size, 3_000_000);
1566 }
1567
1568 #[test]
1569 fn test_max_rows_limit_enforcement() {
1570 // Create CSV with exactly max_rows + 1 rows
1571 let mut csv_data = String::from("id,value\n");
1572 let max_rows = 100;
1573 for i in 0..=max_rows {
1574 csv_data.push_str(&format!("{i},test{i}\n"));
1575 }
1576
1577 let config = FromCsvConfig {
1578 max_rows,
1579 infer_schema: false,
1580 sample_rows: 100,
1581 ..Default::default()
1582 };
1583
1584 let result = from_csv_with_config(&csv_data, "Item", &["value"], config);
1585 assert!(result.is_err());
1586 let err = result.unwrap_err();
1587 assert!(matches!(err, CsvError::SecurityLimit { .. }));
1588 assert!(err.to_string().contains("Security limit"));
1589 assert!(err.to_string().contains(&max_rows.to_string()));
1590 }
1591
1592 #[test]
1593 fn test_max_rows_limit_not_exceeded() {
1594 // Create CSV with exactly max_rows rows
1595 let mut csv_data = String::from("id,value\n");
1596 let max_rows = 100;
1597 for i in 0..(max_rows - 1) {
1598 csv_data.push_str(&format!("{i},test{i}\n"));
1599 }
1600
1601 let config = FromCsvConfig {
1602 max_rows,
1603 infer_schema: false,
1604 sample_rows: 100,
1605 ..Default::default()
1606 };
1607
1608 let result = from_csv_with_config(&csv_data, "Item", &["value"], config);
1609 assert!(result.is_ok());
1610 let doc = result.unwrap();
1611 let list = doc.get("items").unwrap().as_list().unwrap();
1612 assert_eq!(list.rows.len(), max_rows - 1);
1613 }
1614
1615 // ==================== from_csv basic tests ====================
1616
1617 #[test]
1618 fn test_from_csv_basic() {
1619 let csv_data = "id,name,age,active\n1,Alice,30,true\n2,Bob,25,false\n";
1620 let doc = from_csv(csv_data, "Person", &["name", "age", "active"]).unwrap();
1621
1622 // Check document structure
1623 assert_eq!(doc.version, (1, 0));
1624
1625 // Check schema registration
1626 let schema = doc.get_schema("Person").unwrap();
1627 assert_eq!(schema, &["id", "name", "age", "active"]);
1628
1629 // Check matrix list
1630 let item = doc.get("persons").unwrap();
1631 let list = item.as_list().unwrap();
1632 assert_eq!(list.type_name, "Person");
1633 assert_eq!(list.rows.len(), 2);
1634
1635 // Check first row
1636 let row1 = &list.rows[0];
1637 assert_eq!(row1.id, "1");
1638 assert_eq!(row1.fields.len(), schema.len()); // schema includes ID
1639 assert_eq!(row1.fields[0], Value::Int(1)); // ID field
1640 assert_eq!(row1.fields[1], Value::String("Alice".into()));
1641 assert_eq!(row1.fields[2], Value::Int(30));
1642 assert_eq!(row1.fields[3], Value::Bool(true));
1643
1644 // Check second row
1645 let row2 = &list.rows[1];
1646 assert_eq!(row2.id, "2");
1647 assert_eq!(row2.fields.len(), schema.len()); // schema includes ID
1648 assert_eq!(row2.fields[0], Value::Int(2)); // ID field
1649 assert_eq!(row2.fields[1], Value::String("Bob".into()));
1650 assert_eq!(row2.fields[2], Value::Int(25));
1651 assert_eq!(row2.fields[3], Value::Bool(false));
1652 }
1653
1654 #[test]
1655 fn test_from_csv_without_headers() {
1656 let csv_data = "1,Alice,30\n2,Bob,25\n";
1657 let config = FromCsvConfig {
1658 has_headers: false,
1659 ..Default::default()
1660 };
1661 let doc = from_csv_with_config(csv_data, "Person", &["name", "age"], config).unwrap();
1662
1663 let item = doc.get("persons").unwrap();
1664 let list = item.as_list().unwrap();
1665 assert_eq!(list.rows.len(), 2);
1666 }
1667
1668 #[test]
1669 fn test_from_csv_custom_delimiter() {
1670 let csv_data = "id\tname\tage\n1\tAlice\t30\n2\tBob\t25\n";
1671 let config = FromCsvConfig {
1672 delimiter: b'\t',
1673 ..Default::default()
1674 };
1675 let doc = from_csv_with_config(csv_data, "Person", &["name", "age"], config).unwrap();
1676
1677 let item = doc.get("persons").unwrap();
1678 let list = item.as_list().unwrap();
1679 assert_eq!(list.rows.len(), 2);
1680 }
1681
1682 #[test]
1683 fn test_from_csv_semicolon_delimiter() {
1684 let csv_data = "id;name;age\n1;Alice;30\n";
1685 let config = FromCsvConfig {
1686 delimiter: b';',
1687 ..Default::default()
1688 };
1689 let doc = from_csv_with_config(csv_data, "Person", &["name", "age"], config).unwrap();
1690
1691 let item = doc.get("persons").unwrap();
1692 let list = item.as_list().unwrap();
1693 assert_eq!(list.rows.len(), 1);
1694 assert_eq!(list.rows[0].fields[1], Value::String("Alice".into()));
1695 }
1696
1697 #[test]
1698 fn test_from_csv_empty_file() {
1699 let csv_data = "id,name\n";
1700 let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
1701
1702 let item = doc.get("persons").unwrap();
1703 let list = item.as_list().unwrap();
1704 assert!(list.rows.is_empty());
1705 }
1706
1707 #[test]
1708 fn test_from_csv_single_row() {
1709 let csv_data = "id,name\n1,Alice\n";
1710 let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
1711
1712 let item = doc.get("persons").unwrap();
1713 let list = item.as_list().unwrap();
1714 assert_eq!(list.rows.len(), 1);
1715 }
1716
1717 // ==================== parse_csv_value tests ====================
1718
1719 #[test]
1720 fn test_parse_csv_value_null_empty() {
1721 assert_eq!(parse_csv_value("").unwrap(), Value::Null);
1722 }
1723
1724 #[test]
1725 fn test_parse_csv_value_null_tilde() {
1726 assert_eq!(parse_csv_value("~").unwrap(), Value::Null);
1727 }
1728
1729 #[test]
1730 fn test_parse_csv_value_null_whitespace() {
1731 assert_eq!(parse_csv_value(" ").unwrap(), Value::Null);
1732 }
1733
1734 #[test]
1735 fn test_parse_csv_value_bool_true() {
1736 assert_eq!(parse_csv_value("true").unwrap(), Value::Bool(true));
1737 }
1738
1739 #[test]
1740 fn test_parse_csv_value_bool_false() {
1741 assert_eq!(parse_csv_value("false").unwrap(), Value::Bool(false));
1742 }
1743
1744 #[test]
1745 fn test_parse_csv_value_int_positive() {
1746 assert_eq!(parse_csv_value("42").unwrap(), Value::Int(42));
1747 }
1748
1749 #[test]
1750 fn test_parse_csv_value_int_negative() {
1751 assert_eq!(parse_csv_value("-123").unwrap(), Value::Int(-123));
1752 }
1753
1754 #[test]
1755 fn test_parse_csv_value_int_zero() {
1756 assert_eq!(parse_csv_value("0").unwrap(), Value::Int(0));
1757 }
1758
1759 #[test]
1760 fn test_parse_csv_value_int_large() {
1761 assert_eq!(
1762 parse_csv_value("9223372036854775807").unwrap(),
1763 Value::Int(i64::MAX)
1764 );
1765 }
1766
1767 #[test]
1768 fn test_parse_csv_value_float_positive() {
1769 assert_eq!(parse_csv_value("3.25").unwrap(), Value::Float(3.25));
1770 }
1771
1772 #[test]
1773 fn test_parse_csv_value_float_negative() {
1774 assert_eq!(parse_csv_value("-2.5").unwrap(), Value::Float(-2.5));
1775 }
1776
1777 #[test]
1778 fn test_parse_csv_value_float_zero() {
1779 assert_eq!(parse_csv_value("0.0").unwrap(), Value::Float(0.0));
1780 }
1781
1782 #[test]
1783 fn test_parse_csv_value_float_scientific() {
1784 let val = parse_csv_value("1.5e10").unwrap();
1785 if let Value::Float(f) = val {
1786 assert!((f - 1.5e10).abs() < 1e5);
1787 } else {
1788 panic!("Expected float");
1789 }
1790 }
1791
1792 #[test]
1793 fn test_parse_csv_value_string() {
1794 assert_eq!(
1795 parse_csv_value("hello").unwrap(),
1796 Value::String("hello".into())
1797 );
1798 }
1799
1800 #[test]
1801 fn test_parse_csv_value_string_with_spaces() {
1802 assert_eq!(
1803 parse_csv_value(" hello world ").unwrap(),
1804 Value::String(" hello world ".into())
1805 );
1806 }
1807
1808 #[test]
1809 fn test_parse_csv_value_string_numeric_looking() {
1810 // Strings that look like numbers but have leading zeros
1811 assert_eq!(
1812 parse_csv_value("007").unwrap(),
1813 Value::Int(7) // Parsed as int
1814 );
1815 }
1816
1817 // ==================== Special float values ====================
1818
1819 #[test]
1820 fn test_parse_csv_value_nan() {
1821 let nan = parse_csv_value("NaN").unwrap();
1822 assert!(matches!(nan, Value::Float(f) if f.is_nan()));
1823 }
1824
1825 #[test]
1826 fn test_parse_csv_value_infinity() {
1827 let inf = parse_csv_value("Infinity").unwrap();
1828 assert_eq!(inf, Value::Float(f64::INFINITY));
1829 }
1830
1831 #[test]
1832 fn test_parse_csv_value_neg_infinity() {
1833 let neg_inf = parse_csv_value("-Infinity").unwrap();
1834 assert_eq!(neg_inf, Value::Float(f64::NEG_INFINITY));
1835 }
1836
1837 // ==================== Reference tests ====================
1838
1839 #[test]
1840 fn test_parse_csv_value_reference_local() {
1841 let ref_val = parse_csv_value("@user1").unwrap();
1842 if let Value::Reference(r) = ref_val {
1843 assert_eq!(&*r.id, "user1");
1844 assert_eq!(r.type_name, None);
1845 } else {
1846 panic!("Expected reference");
1847 }
1848 }
1849
1850 #[test]
1851 fn test_parse_csv_value_reference_qualified() {
1852 let ref_val = parse_csv_value("@User:user1").unwrap();
1853 if let Value::Reference(r) = ref_val {
1854 assert_eq!(&*r.id, "user1");
1855 assert_eq!(r.type_name.as_deref(), Some("User"));
1856 } else {
1857 panic!("Expected reference");
1858 }
1859 }
1860
1861 #[test]
1862 fn test_parse_csv_value_reference_with_dashes() {
1863 let ref_val = parse_csv_value("@my-item-123").unwrap();
1864 if let Value::Reference(r) = ref_val {
1865 assert_eq!(&*r.id, "my-item-123");
1866 } else {
1867 panic!("Expected reference");
1868 }
1869 }
1870
1871 #[test]
1872 fn test_parse_reference_empty_error() {
1873 let result = parse_reference("@");
1874 assert!(result.is_err());
1875 assert!(result
1876 .unwrap_err()
1877 .to_string()
1878 .contains("Empty reference ID"));
1879 }
1880
1881 #[test]
1882 fn test_parse_reference_empty_type_error() {
1883 let result = parse_reference("@:id");
1884 assert!(result.is_err());
1885 assert!(result
1886 .unwrap_err()
1887 .to_string()
1888 .contains("Invalid reference format"));
1889 }
1890
1891 #[test]
1892 fn test_parse_reference_empty_id_error() {
1893 let result = parse_reference("@Type:");
1894 assert!(result.is_err());
1895 assert!(result
1896 .unwrap_err()
1897 .to_string()
1898 .contains("Invalid reference format"));
1899 }
1900
1901 // ==================== Expression tests ====================
1902
1903 #[test]
1904 fn test_parse_csv_value_expression_identifier() {
1905 let expr = parse_csv_value("$(foo)").unwrap();
1906 assert_eq!(expr, expr_value("foo"));
1907 }
1908
1909 #[test]
1910 fn test_parse_csv_value_expression_call() {
1911 let expr = parse_csv_value("$(add(x, y))").unwrap();
1912 assert_eq!(expr, expr_value("add(x, y)"));
1913 }
1914
1915 #[test]
1916 fn test_parse_csv_value_expression_nested() {
1917 let expr = parse_csv_value("$(outer(inner(x)))").unwrap();
1918 if let Value::Expression(e) = expr {
1919 assert_eq!(e.to_string(), "outer(inner(x))");
1920 } else {
1921 panic!("Expected expression");
1922 }
1923 }
1924
1925 // ==================== Tensor tests ====================
1926
1927 #[test]
1928 fn test_parse_csv_value_tensor_1d() {
1929 let val = parse_csv_value("[1, 2, 3]").unwrap();
1930 if let Value::Tensor(tensor) = val {
1931 if let Tensor::Array(arr) = tensor.as_ref() {
1932 assert_eq!(arr.len(), 3);
1933 } else {
1934 panic!("Expected tensor array");
1935 }
1936 } else {
1937 panic!("Expected tensor");
1938 }
1939 }
1940
1941 #[test]
1942 fn test_parse_csv_value_tensor_2d() {
1943 let val = parse_csv_value("[[1, 2], [3, 4]]").unwrap();
1944 if let Value::Tensor(tensor) = val {
1945 if let Tensor::Array(outer) = tensor.as_ref() {
1946 assert_eq!(outer.len(), 2);
1947 if let Tensor::Array(inner) = &outer[0] {
1948 assert_eq!(inner.len(), 2);
1949 } else {
1950 panic!("Expected nested array");
1951 }
1952 } else {
1953 panic!("Expected tensor array");
1954 }
1955 } else {
1956 panic!("Expected tensor");
1957 }
1958 }
1959
1960 #[test]
1961 fn test_parse_csv_value_tensor_empty_is_string() {
1962 // Empty tensors are not valid in HEDL (must have at least one element)
1963 // So "[]" falls through to being treated as a string
1964 let val = parse_csv_value("[]").unwrap();
1965 assert_eq!(val, Value::String("[]".into()));
1966 }
1967
1968 // ==================== Error cases ====================
1969
1970 #[test]
1971 fn test_empty_id_error() {
1972 let csv_data = "id,name\n,Alice\n";
1973 let result = from_csv(csv_data, "Person", &["name"]);
1974 assert!(result.is_err());
1975 assert!(matches!(result.unwrap_err(), CsvError::EmptyId { .. }));
1976 }
1977
1978 #[test]
1979 fn test_mismatched_field_count() {
1980 let csv_data = "id,name,age\n1,Alice\n";
1981 let result = from_csv(csv_data, "Person", &["name", "age"]);
1982 assert!(result.is_err());
1983 // CSV parser returns Syntax error for malformed records
1984 assert!(matches!(result.unwrap_err(), CsvError::ParseError { .. }));
1985 }
1986
1987 // ==================== Whitespace handling ====================
1988
1989 #[test]
1990 fn test_whitespace_trimming_enabled() {
1991 let csv_data = "id,name,age\n1, Alice , 30 \n";
1992 let doc = from_csv(csv_data, "Person", &["name", "age"]).unwrap();
1993
1994 let item = doc.get("persons").unwrap();
1995 let list = item.as_list().unwrap();
1996 let row = &list.rows[0];
1997
1998 assert_eq!(row.fields[0], Value::Int(1)); // ID field
1999 assert_eq!(row.fields[1], Value::String("Alice".into()));
2000 assert_eq!(row.fields[2], Value::Int(30));
2001 }
2002
2003 #[test]
2004 fn test_whitespace_trimming_disabled() {
2005 let csv_data = "id,name\n1, Alice \n";
2006 let config = FromCsvConfig {
2007 trim: false,
2008 ..Default::default()
2009 };
2010 let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
2011
2012 let item = doc.get("persons").unwrap();
2013 let list = item.as_list().unwrap();
2014 // With trim disabled, whitespace is preserved
2015 assert_eq!(list.rows[0].fields[1], Value::String(" Alice ".into()));
2016 }
2017
2018 // ==================== from_csv_reader tests ====================
2019
2020 #[test]
2021 fn test_from_csv_reader_basic() {
2022 let csv_data = "id,name\n1,Alice\n".as_bytes();
2023 let doc = from_csv_reader(csv_data, "Person", &["name"]).unwrap();
2024
2025 let item = doc.get("persons").unwrap();
2026 let list = item.as_list().unwrap();
2027 assert_eq!(list.rows.len(), 1);
2028 }
2029
2030 #[test]
2031 fn test_from_csv_reader_with_config() {
2032 let csv_data = "1\tAlice\n".as_bytes();
2033 let config = FromCsvConfig {
2034 delimiter: b'\t',
2035 has_headers: false,
2036 trim: true,
2037 ..Default::default()
2038 };
2039 let doc = from_csv_reader_with_config(csv_data, "Person", &["name"], config).unwrap();
2040
2041 let item = doc.get("persons").unwrap();
2042 let list = item.as_list().unwrap();
2043 assert_eq!(list.rows.len(), 1);
2044 }
2045
2046 // ==================== Type naming tests ====================
2047
2048 #[test]
2049 fn test_type_naming_singularization() {
2050 let csv_data = "id,name\n1,Alice\n";
2051 let doc = from_csv(csv_data, "User", &["name"]).unwrap();
2052
2053 // Matrix list should use "users" as key (lowercase + pluralized)
2054 let item = doc.get("users").unwrap();
2055 let list = item.as_list().unwrap();
2056 assert_eq!(list.type_name, "User");
2057 }
2058
2059 // ==================== Quoted fields ====================
2060
2061 #[test]
2062 fn test_quoted_fields() {
2063 let csv_data = "id,name,bio\n1,Alice,\"Hello, World\"\n";
2064 let doc = from_csv(csv_data, "Person", &["name", "bio"]).unwrap();
2065
2066 let item = doc.get("persons").unwrap();
2067 let list = item.as_list().unwrap();
2068 assert_eq!(list.rows[0].fields[2], Value::String("Hello, World".into()));
2069 }
2070
2071 #[test]
2072 fn test_quoted_fields_with_newline() {
2073 let csv_data = "id,name,bio\n1,Alice,\"Line 1\nLine 2\"\n";
2074 let doc = from_csv(csv_data, "Person", &["name", "bio"]).unwrap();
2075
2076 let item = doc.get("persons").unwrap();
2077 let list = item.as_list().unwrap();
2078 assert_eq!(
2079 list.rows[0].fields[2],
2080 Value::String("Line 1\nLine 2".into())
2081 );
2082 }
2083
2084 #[test]
2085 fn test_quoted_fields_with_quotes() {
2086 let csv_data = "id,name\n1,\"Alice \"\"Bob\"\" Smith\"\n";
2087 let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
2088
2089 let item = doc.get("persons").unwrap();
2090 let list = item.as_list().unwrap();
2091 assert_eq!(
2092 list.rows[0].fields[1],
2093 Value::String("Alice \"Bob\" Smith".into())
2094 );
2095 }
2096
2097 // ==================== Edge cases ====================
2098
2099 #[test]
2100 fn test_unicode_values() {
2101 let csv_data = "id,name\n1,héllo 世界\n";
2102 let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
2103
2104 let item = doc.get("persons").unwrap();
2105 let list = item.as_list().unwrap();
2106 assert_eq!(list.rows[0].fields[1], Value::String("héllo 世界".into()));
2107 }
2108
2109 #[test]
2110 fn test_string_id() {
2111 let csv_data = "id,name\nabc,Alice\n";
2112 let doc = from_csv(csv_data, "Person", &["name"]).unwrap();
2113
2114 let item = doc.get("persons").unwrap();
2115 let list = item.as_list().unwrap();
2116 assert_eq!(list.rows[0].id, "abc");
2117 assert_eq!(list.rows[0].fields[0], Value::String("abc".into()));
2118 }
2119
2120 #[test]
2121 fn test_many_columns() {
2122 let csv_data = "id,a,b,c,d,e\n1,2,3,4,5,6\n";
2123 let doc = from_csv(csv_data, "Item", &["a", "b", "c", "d", "e"]).unwrap();
2124
2125 let item = doc.get("items").unwrap();
2126 let list = item.as_list().unwrap();
2127 assert_eq!(list.schema.len(), 6); // id + 5 columns
2128 assert_eq!(list.rows[0].fields.len(), 6);
2129 }
2130
2131 // ==================== Custom list_key tests ====================
2132
2133 #[test]
2134 fn test_custom_list_key_basic() {
2135 let csv_data = "id,name\n1,Alice\n";
2136 let config = FromCsvConfig {
2137 list_key: Some("people".to_string()),
2138 ..Default::default()
2139 };
2140 let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
2141
2142 // Custom plural should exist
2143 assert!(doc.get("people").is_some());
2144 // Default plural should not exist
2145 assert!(doc.get("persons").is_none());
2146
2147 let list = doc.get("people").unwrap().as_list().unwrap();
2148 assert_eq!(list.type_name, "Person");
2149 assert_eq!(list.rows.len(), 1);
2150 }
2151
2152 #[test]
2153 fn test_custom_list_key_irregular_plurals() {
2154 // Test common irregular plurals
2155 let test_cases = vec![
2156 ("Person", "people"),
2157 ("Child", "children"),
2158 ("Tooth", "teeth"),
2159 ("Foot", "feet"),
2160 ("Mouse", "mice"),
2161 ("Goose", "geese"),
2162 ("Man", "men"),
2163 ("Woman", "women"),
2164 ("Ox", "oxen"),
2165 ("Datum", "data"),
2166 ];
2167
2168 for (type_name, plural) in test_cases {
2169 let csv_data = "id,value\n1,test\n".to_string();
2170 let config = FromCsvConfig {
2171 list_key: Some(plural.to_string()),
2172 ..Default::default()
2173 };
2174 let doc = from_csv_with_config(&csv_data, type_name, &["value"], config).unwrap();
2175
2176 assert!(
2177 doc.get(plural).is_some(),
2178 "Failed to find {plural} for type {type_name}"
2179 );
2180 }
2181 }
2182
2183 #[test]
2184 fn test_custom_list_key_collective_nouns() {
2185 let csv_data = "id,value\n1,42\n";
2186
2187 // Test collective nouns
2188 let test_cases = vec![
2189 ("Data", "dataset"),
2190 ("Information", "info_collection"),
2191 ("Equipment", "gear"),
2192 ("Furniture", "furnishings"),
2193 ];
2194
2195 for (type_name, collective) in test_cases {
2196 let config = FromCsvConfig {
2197 list_key: Some(collective.to_string()),
2198 ..Default::default()
2199 };
2200 let doc = from_csv_with_config(csv_data, type_name, &["value"], config).unwrap();
2201
2202 assert!(
2203 doc.get(collective).is_some(),
2204 "Failed to find {collective} for type {type_name}"
2205 );
2206 }
2207 }
2208
2209 #[test]
2210 fn test_custom_list_key_case_sensitive() {
2211 let csv_data = "id,value\n1,test\n";
2212 let config = FromCsvConfig {
2213 list_key: Some("MyCustomList".to_string()),
2214 ..Default::default()
2215 };
2216 let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
2217
2218 // Exact case should exist
2219 assert!(doc.get("MyCustomList").is_some());
2220 // Different case should not exist
2221 assert!(doc.get("mycustomlist").is_none());
2222 assert!(doc.get("items").is_none());
2223 }
2224
2225 #[test]
2226 fn test_custom_list_key_empty_string() {
2227 // Empty string is technically allowed as a key
2228 let csv_data = "id,value\n1,test\n";
2229 let config = FromCsvConfig {
2230 list_key: Some(String::new()),
2231 ..Default::default()
2232 };
2233 let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
2234
2235 assert!(doc.get("").is_some());
2236 }
2237
2238 #[test]
2239 fn test_custom_list_key_with_special_chars() {
2240 let csv_data = "id,value\n1,test\n";
2241 let config = FromCsvConfig {
2242 list_key: Some("my-custom_list.v2".to_string()),
2243 ..Default::default()
2244 };
2245 let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
2246
2247 assert!(doc.get("my-custom_list.v2").is_some());
2248 }
2249
2250 #[test]
2251 fn test_custom_list_key_unicode() {
2252 let csv_data = "id,value\n1,test\n";
2253 let config = FromCsvConfig {
2254 list_key: Some("人々".to_string()), // Japanese for "people"
2255 ..Default::default()
2256 };
2257 let doc = from_csv_with_config(csv_data, "Person", &["value"], config).unwrap();
2258
2259 assert!(doc.get("人々").is_some());
2260 }
2261
2262 #[test]
2263 fn test_custom_list_key_with_schema_inference() {
2264 let csv_data = "id,value\n1,42\n2,43\n3,44\n";
2265 let config = FromCsvConfig {
2266 list_key: Some("people".to_string()),
2267 infer_schema: true,
2268 sample_rows: 10,
2269 ..Default::default()
2270 };
2271 let doc = from_csv_with_config(csv_data, "Person", &["value"], config).unwrap();
2272
2273 assert!(doc.get("people").is_some());
2274 let list = doc.get("people").unwrap().as_list().unwrap();
2275 assert_eq!(list.rows.len(), 3);
2276 // Schema inference should still work
2277 assert_eq!(list.rows[0].fields[1], Value::Int(42));
2278 }
2279
2280 #[test]
2281 fn test_custom_list_key_none_uses_default() {
2282 let csv_data = "id,name\n1,Alice\n";
2283 let config = FromCsvConfig {
2284 list_key: None,
2285 ..Default::default()
2286 };
2287 let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
2288
2289 // Should use default pluralization
2290 assert!(doc.get("persons").is_some());
2291 assert!(doc.get("people").is_none());
2292 }
2293
2294 #[test]
2295 fn test_custom_list_key_default_config() {
2296 let csv_data = "id,name\n1,Alice\n";
2297 let doc = from_csv(csv_data, "User", &["name"]).unwrap();
2298
2299 // Default should use simple pluralization
2300 assert!(doc.get("users").is_some());
2301 }
2302
2303 #[test]
2304 fn test_custom_list_key_preserves_type_name() {
2305 let csv_data = "id,name\n1,Alice\n";
2306 let config = FromCsvConfig {
2307 list_key: Some("people".to_string()),
2308 ..Default::default()
2309 };
2310 let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
2311
2312 let list = doc.get("people").unwrap().as_list().unwrap();
2313 // Type name should still be "Person", not "people"
2314 assert_eq!(list.type_name, "Person");
2315 }
2316
2317 #[test]
2318 fn test_custom_list_key_with_multiple_types() {
2319 // This test ensures each call can have its own list_key
2320 let csv1 = "id,name\n1,Alice\n";
2321 let config1 = FromCsvConfig {
2322 list_key: Some("people".to_string()),
2323 ..Default::default()
2324 };
2325 let doc1 = from_csv_with_config(csv1, "Person", &["name"], config1).unwrap();
2326
2327 let csv2 = "id,name\n1,Fluffy\n";
2328 let config2 = FromCsvConfig {
2329 list_key: Some("mice".to_string()),
2330 ..Default::default()
2331 };
2332 let doc2 = from_csv_with_config(csv2, "Mouse", &["name"], config2).unwrap();
2333
2334 assert!(doc1.get("people").is_some());
2335 assert!(doc1.get("persons").is_none());
2336
2337 assert!(doc2.get("mice").is_some());
2338 assert!(doc2.get("mouses").is_none());
2339 }
2340
2341 #[test]
2342 fn test_custom_list_key_numbers_in_name() {
2343 let csv_data = "id,value\n1,test\n";
2344 let config = FromCsvConfig {
2345 list_key: Some("items_v2".to_string()),
2346 ..Default::default()
2347 };
2348 let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
2349
2350 assert!(doc.get("items_v2").is_some());
2351 }
2352
2353 #[test]
2354 fn test_custom_list_key_round_trip_compatibility() {
2355 // Ensure custom list keys work with to_csv_list
2356 let csv_data = "id,name\n1,Alice\n2,Bob\n";
2357 let config = FromCsvConfig {
2358 list_key: Some("people".to_string()),
2359 ..Default::default()
2360 };
2361 let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
2362
2363 // Export the list using the custom key
2364 use crate::to_csv_list;
2365 let exported_csv = to_csv_list(&doc, "people").unwrap();
2366 assert!(exported_csv.contains("Alice"));
2367 assert!(exported_csv.contains("Bob"));
2368
2369 // Should not be accessible via default key
2370 assert!(to_csv_list(&doc, "persons").is_err());
2371 }
2372
2373 #[test]
2374 fn test_from_csv_config_clone_with_list_key() {
2375 let config = FromCsvConfig {
2376 delimiter: b',',
2377 has_headers: true,
2378 trim: true,
2379 max_rows: 1000,
2380 infer_schema: false,
2381 sample_rows: 50,
2382 list_key: Some("people".to_string()),
2383 max_columns: DEFAULT_MAX_COLUMNS,
2384 max_cell_size: DEFAULT_MAX_CELL_SIZE,
2385 max_total_size: DEFAULT_MAX_TOTAL_SIZE,
2386 max_header_size: DEFAULT_MAX_HEADER_SIZE,
2387 };
2388 let cloned = config.clone();
2389 assert_eq!(cloned.list_key, Some("people".to_string()));
2390 }
2391
2392 #[test]
2393 fn test_from_csv_config_debug_with_list_key() {
2394 let config = FromCsvConfig {
2395 list_key: Some("people".to_string()),
2396 ..Default::default()
2397 };
2398 let debug = format!("{config:?}");
2399 assert!(debug.contains("list_key"));
2400 assert!(debug.contains("people"));
2401 }
2402
2403 // ==================== Security Limit Tests ====================
2404
2405 #[test]
2406 fn test_from_csv_config_default_security_limits() {
2407 let config = FromCsvConfig::default();
2408 assert_eq!(config.max_columns, DEFAULT_MAX_COLUMNS);
2409 assert_eq!(config.max_cell_size, DEFAULT_MAX_CELL_SIZE);
2410 assert_eq!(config.max_total_size, DEFAULT_MAX_TOTAL_SIZE);
2411 assert_eq!(config.max_header_size, DEFAULT_MAX_HEADER_SIZE);
2412 }
2413
2414 #[test]
2415 fn test_from_csv_config_clone_with_security_limits() {
2416 let config = FromCsvConfig {
2417 max_columns: 5_000,
2418 max_cell_size: 2_000_000,
2419 max_total_size: 200_000_000,
2420 max_header_size: 2_000_000,
2421 ..Default::default()
2422 };
2423 let cloned = config.clone();
2424 assert_eq!(cloned.max_columns, 5_000);
2425 assert_eq!(cloned.max_cell_size, 2_000_000);
2426 assert_eq!(cloned.max_total_size, 200_000_000);
2427 assert_eq!(cloned.max_header_size, 2_000_000);
2428 }
2429
2430 #[test]
2431 fn test_from_csv_config_unlimited() {
2432 let config = FromCsvConfig::unlimited();
2433 assert_eq!(config.max_rows, usize::MAX);
2434 assert_eq!(config.max_columns, usize::MAX);
2435 assert_eq!(config.max_cell_size, usize::MAX);
2436 assert_eq!(config.max_total_size, usize::MAX);
2437 assert_eq!(config.max_header_size, usize::MAX);
2438 }
2439
2440 #[test]
2441 fn test_from_csv_config_strict() {
2442 let config = FromCsvConfig::strict();
2443 assert_eq!(config.max_rows, 1_000_000);
2444 assert_eq!(config.max_columns, 1_000);
2445 assert_eq!(config.max_cell_size, 65_536);
2446 assert_eq!(config.max_total_size, 10_485_760);
2447 assert_eq!(config.max_header_size, 65_536);
2448 }
2449
2450 #[test]
2451 fn test_column_count_limit_enforcement() {
2452 // Create CSV with 11,000 columns (exceeds default 10,000)
2453 let mut csv = String::from("col0");
2454 for i in 1..11_000 {
2455 csv.push_str(&format!(",col{i}"));
2456 }
2457 csv.push('\n');
2458 csv.push_str("a,");
2459 csv.push_str(&"b,".repeat(10_999));
2460 csv.push('b');
2461
2462 let result = from_csv_with_config(&csv, "Item", &[], FromCsvConfig::default());
2463
2464 assert!(result.is_err());
2465 let err = result.unwrap_err();
2466 assert!(matches!(err, CsvError::Security { .. }));
2467 assert!(err.to_string().contains("exceeds limit"));
2468 }
2469
2470 #[test]
2471 fn test_cell_size_limit_enforcement() {
2472 // Create CSV with 2MB cell (exceeds default 1MB)
2473 let huge_cell = "x".repeat(2_000_000);
2474 let csv = format!("id,data\n1,\"{huge_cell}\"\n");
2475
2476 let result = from_csv_with_config(&csv, "Item", &["data"], FromCsvConfig::default());
2477
2478 assert!(result.is_err());
2479 let err = result.unwrap_err();
2480 assert!(matches!(err, CsvError::Security { .. }));
2481 // Check that the error message contains information about the limit
2482 let err_msg = err.to_string();
2483 assert!(err_msg.contains("exceeds limit") || err_msg.contains("Security"));
2484 }
2485
2486 #[test]
2487 fn test_total_size_limit_enforcement() {
2488 // Create CSV with 110MB total data (exceeds default 100MB)
2489 let mut csv = String::from("id,data\n");
2490 let row_data = "x".repeat(100_000); // 100KB per row
2491
2492 for i in 0..1_100 {
2493 csv.push_str(&format!("{i},\"{row_data}\"\n"));
2494 }
2495
2496 let result = from_csv_with_config(&csv, "Item", &["data"], FromCsvConfig::default());
2497
2498 assert!(result.is_err());
2499 let err = result.unwrap_err();
2500 assert!(matches!(err, CsvError::Security { .. }));
2501 assert!(err.to_string().contains("total size"));
2502 }
2503
2504 #[test]
2505 fn test_header_size_limit_enforcement() {
2506 // Create CSV with 2MB total header size (exceeds default 1MB)
2507 let mut csv = String::new();
2508 for i in 0..20_000 {
2509 if i > 0 {
2510 csv.push(',');
2511 }
2512 csv.push_str(&format!("column_{i}_very_long_name_{i}"));
2513 }
2514 csv.push_str("\n1\n");
2515
2516 let result = from_csv_with_config(&csv, "Item", &[], FromCsvConfig::default());
2517
2518 assert!(result.is_err());
2519 let err = result.unwrap_err();
2520 assert!(matches!(err, CsvError::Security { .. }));
2521 // Check that the error message contains information about the limit
2522 let err_msg = err.to_string();
2523 assert!(err_msg.contains("exceeds limit") || err_msg.contains("Security"));
2524 }
2525
2526 #[test]
2527 fn test_normal_csv_within_limits() {
2528 // Normal CSV should work fine with default limits
2529 let csv_data = "id,name,age\n1,Alice,30\n2,Bob,25\n";
2530
2531 let result = from_csv_with_config(
2532 csv_data,
2533 "Person",
2534 &["name", "age"],
2535 FromCsvConfig::default(),
2536 );
2537
2538 assert!(result.is_ok());
2539 }
2540
2541 #[test]
2542 #[allow(clippy::needless_borrow)]
2543 fn test_unlimited_config_allows_large_csvs() {
2544 // Verify that unlimited() config allows huge CSVs
2545 let huge_cell = "x".repeat(10_000_000);
2546 let csv = format!("id,data\n1,\"{huge_cell}\"\n");
2547
2548 let config = FromCsvConfig::unlimited();
2549 let result = from_csv_with_config(&csv, "Item", &["data"], config);
2550
2551 // Should succeed
2552 assert!(result.is_ok());
2553 }
2554
2555 #[test]
2556 #[allow(clippy::needless_borrow)]
2557 fn test_strict_config_blocks_large_cells() {
2558 // Even a moderately large cell should fail with strict config
2559 let csv = format!("id,data\n1,\"{}\"\n", "x".repeat(100_000));
2560
2561 let config = FromCsvConfig::strict();
2562 let result = from_csv_with_config(&csv, "Item", &["data"], config);
2563
2564 // Should fail - 100KB exceeds strict 64KB limit
2565 assert!(result.is_err());
2566 assert!(matches!(result.unwrap_err(), CsvError::Security { .. }));
2567 }
2568
2569 #[test]
2570 #[allow(clippy::needless_borrow)]
2571 fn test_strict_config_allows_small_csvs() {
2572 // Small CSV should work with strict config
2573 let csv = "id,data\n1,small_data\n";
2574
2575 let config = FromCsvConfig::strict();
2576 let result = from_csv_with_config(&csv, "Item", &["data"], config);
2577
2578 assert!(result.is_ok());
2579 }
2580}