Skip to main content

hedl_csv/from_csv/
config.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6
7//! Configuration types for CSV import
8
/// Default maximum number of rows to prevent memory exhaustion.
///
/// This limit prevents Denial-of-Service attacks from maliciously large CSV files.
/// The default is 1 million rows, which allows processing reasonably large datasets
/// while preventing unbounded memory allocation.
///
/// # Security Considerations
///
/// - **Memory exhaustion**: Without a limit, attackers could provide CSV files with
///   billions of rows, causing the application to allocate excessive memory and crash.
/// - **Configurable**: The limit can be adjusted via `FromCsvConfig::max_rows` based on
///   deployment context and available resources.
/// - **Trade-off**: Higher limits allow larger datasets but increase `DoS` risk.
///
/// # Examples
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// // Use default 1M row limit
/// let config = FromCsvConfig::default();
/// assert_eq!(config.max_rows, 1_000_000);
///
/// // Increase limit for large dataset processing
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // 10 million rows
///     ..Default::default()
/// };
/// ```
pub const DEFAULT_MAX_ROWS: usize = 1_000_000;
37
/// Default cap on the number of columns, guarding against "column bomb" input.
///
/// CSV files with an excessive number of columns can be used for
/// Denial-of-Service attacks. The default of 10,000 columns is generous for
/// legitimate data while still preventing abuse.
///
/// # Security Considerations
///
/// - **Column bomb**: With no cap, an attacker could supply a CSV file with
///   hundreds of thousands of columns, exhausting memory and slowing processing.
/// - **Industry standards**: Excel allows 16,384 columns and Google Sheets 18,278.
/// - **Trade-off**: Raising the cap admits wider datasets at the cost of higher
///   `DoS` risk.
pub const DEFAULT_MAX_COLUMNS: usize = 10_000;
50
/// Default cap, in bytes, on the size of a single cell ("cell bomb" protection).
///
/// CSV files containing enormous individual cells can be used for
/// Denial-of-Service attacks. The default of 1MB per cell is reasonable for
/// most legitimate use cases.
///
/// # Security Considerations
///
/// - **Cell bomb**: With no cap, an attacker could supply gigabyte-sized cells
///   and exhaust memory.
/// - **Cumulative effect**: Several large cells multiply the impact.
/// - **Trade-off**: Raising the cap admits larger text fields at the cost of
///   higher `DoS` risk.
pub const DEFAULT_MAX_CELL_SIZE: usize = 1024 * 1024; // 1MB
63
/// Default cap, in bytes, on the total CSV size (decompression-bomb protection).
///
/// Compressed CSV files that expand to enormous sizes can be used for
/// Denial-of-Service attacks. The default is 100MB.
///
/// # Security Considerations
///
/// - **Decompression bomb**: A 1MB gzipped file could decompress to 1GB+.
/// - **Memory exhaustion**: Stops attackers from filling server memory.
/// - **Trade-off**: Raising the cap admits larger datasets at the cost of
///   higher `DoS` risk.
pub const DEFAULT_MAX_TOTAL_SIZE: usize = 100 * 1024 * 1024; // 100MB
75
/// Default cap, in bytes, on the total header size ("header bomb" protection).
///
/// CSV files with enormous headers can be used for Denial-of-Service attacks.
/// The default is 1MB for the header as a whole.
///
/// # Security Considerations
///
/// - **Header bomb**: Stops attackers from using huge column names.
/// - **Per-column**: Individual column names are also bounded via `max_cell_size`.
/// - **Trade-off**: Raising the cap admits longer column names at the cost of
///   higher `DoS` risk.
pub const DEFAULT_MAX_HEADER_SIZE: usize = 1024 * 1024; // 1MB
87
/// Configuration for CSV parsing.
///
/// This structure controls all aspects of CSV parsing behavior, including delimiters,
/// headers, whitespace handling, security limits, and custom list naming.
///
/// Configurations compare by value (`PartialEq`/`Eq`), which makes it easy to
/// assert on them in tests or to detect whether a config differs from the default.
///
/// # Examples
///
/// ## Default Configuration
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig::default();
/// assert_eq!(config.delimiter, b',');
/// assert!(config.has_headers);
/// assert!(config.trim);
/// assert_eq!(config.max_rows, 1_000_000);
/// assert_eq!(config.list_key, None);
/// ```
///
/// ## Tab-Delimited without Headers
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     delimiter: b'\t',
///     has_headers: false,
///     ..Default::default()
/// };
/// ```
///
/// ## Custom Row Limit for Large Datasets
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // Allow up to 10M rows
///     ..Default::default()
/// };
/// ```
///
/// ## Disable Whitespace Trimming
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     trim: false,
///     ..Default::default()
/// };
/// ```
///
/// ## Enable Schema Inference
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     infer_schema: true,
///     sample_rows: 200, // Sample first 200 rows
///     ..Default::default()
/// };
/// ```
///
/// ## Custom List Key for Irregular Plurals
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// // For "Person" type, use "people" instead of default "persons"
/// let config = FromCsvConfig {
///     list_key: Some("people".to_string()),
///     ..Default::default()
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FromCsvConfig {
    /// Field delimiter character (default: `,`).
    ///
    /// Common alternatives:
    /// - `b'\t'` - Tab-separated values (TSV)
    /// - `b';'` - Semicolon-separated (common in European locales)
    /// - `b'|'` - Pipe-separated
    pub delimiter: u8,

    /// Whether the first row contains column headers (default: `true`).
    ///
    /// When `true`, the first row is interpreted as column names and not included
    /// in the data. When `false`, all rows are treated as data.
    pub has_headers: bool,

    /// Whether to trim leading/trailing whitespace from fields (default: `true`).
    ///
    /// When `true`, fields like `"  value  "` become `"value"`. This is generally
    /// recommended to handle inconsistently formatted CSV files.
    pub trim: bool,

    /// Maximum number of rows to parse (default: 1,000,000).
    ///
    /// This security limit prevents memory exhaustion from maliciously large CSV files.
    /// Processing stops with an error if more rows are encountered.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from causing memory exhaustion
    /// - **Memory Bound**: Limits worst-case memory usage to approximately
    ///   `max_rows × avg_row_size` (a row's size already accounts for all of its columns)
    /// - **Recommended Values**:
    ///   - Small deployments: 100,000 - 1,000,000 rows
    ///   - Large deployments: 1,000,000 - 10,000,000 rows
    ///   - Batch processing: Adjust based on available RAM
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing very large datasets on a high-memory server
    /// let config = FromCsvConfig {
    ///     max_rows: 50_000_000,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_rows: usize,

    /// Whether to automatically infer column types from data (default: `false`).
    ///
    /// When `true`, the parser samples the first `sample_rows` to determine the
    /// most specific type for each column. When `false`, uses standard per-value
    /// type inference.
    ///
    /// # Type Inference Hierarchy (most to least specific)
    ///
    /// 1. **Null**: All values are empty/null
    /// 2. **Bool**: All values are "true" or "false"
    /// 3. **Int**: All values parse as integers
    /// 4. **Float**: All values parse as floats
    /// 5. **String**: Fallback for all other cases
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// let config = FromCsvConfig {
    ///     infer_schema: true,
    ///     sample_rows: 100,
    ///     ..Default::default()
    /// };
    /// ```
    pub infer_schema: bool,

    /// Number of rows to sample for schema inference (default: 100).
    ///
    /// Only used when `infer_schema` is `true`. Larger sample sizes provide
    /// more accurate type detection but slower initial processing.
    ///
    /// # Trade-offs
    ///
    /// - **Small (10-50)**: Fast inference, may miss edge cases
    /// - **Medium (100-500)**: Balanced accuracy and performance
    /// - **Large (1000+)**: High accuracy, slower for large datasets
    pub sample_rows: usize,

    /// Custom key name for the matrix list in the document (default: `None`).
    ///
    /// When `None`, the list key is automatically generated by adding 's' to the
    /// lowercased type name (e.g., "Person" → "persons"). When `Some`, uses the
    /// specified custom key instead.
    ///
    /// # Use Cases
    ///
    /// - **Irregular Plurals**: "Person" → "people" instead of "persons"
    /// - **Collective Nouns**: "Data" → "dataset" instead of "datas"
    /// - **Custom Naming**: Any non-standard naming convention
    /// - **Case-Sensitive Keys**: Preserve specific casing requirements
    ///
    /// # Examples
    ///
    /// ## Irregular Plural
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,name\n1,Alice\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("people".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Person", &["name"], config).unwrap();
    /// assert!(doc.get("people").is_some()); // Uses custom plural
    /// assert!(doc.get("persons").is_none()); // Default plural not used
    /// ```
    ///
    /// ## Collective Noun
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,value\n1,42\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("dataset".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Data", &["value"], config).unwrap();
    /// assert!(doc.get("dataset").is_some());
    /// ```
    ///
    /// ## Case-Sensitive Key
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,value\n1,test\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("MyCustomList".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Item", &["value"], config).unwrap();
    /// assert!(doc.get("MyCustomList").is_some());
    /// ```
    pub list_key: Option<String>,

    /// Maximum number of columns allowed (default: 10,000).
    ///
    /// This security limit prevents "column bomb" attacks where malicious CSV files
    /// contain excessive columns that cause memory exhaustion and slow processing.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from creating CSVs with 50,000+ columns
    /// - **Memory Bound**: Limits worst-case memory usage for column metadata
    /// - **Industry Comparison**: Excel (16,384), Google Sheets (18,278), `PostgreSQL` (~1,600)
    /// - **Recommended Values**:
    ///   - Web uploads: 1,000 - 10,000 columns
    ///   - Internal processing: 10,000 - 50,000 columns
    ///   - Scientific data: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing wide scientific datasets
    /// let config = FromCsvConfig {
    ///     max_columns: 50_000,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_columns: usize,

    /// Maximum size of a single cell in bytes (default: 1MB).
    ///
    /// This security limit prevents "cell bomb" attacks where malicious CSV files
    /// contain enormous individual cells that cause memory exhaustion.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from using 10MB+ cells
    /// - **Memory Bound**: Each cell is read into memory as a String
    /// - **Cumulative**: Multiple large cells multiply the impact
    /// - **Recommended Values**:
    ///   - Web uploads: 64KB - 1MB
    ///   - Internal processing: 1MB - 10MB
    ///   - Text-heavy data: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing long text fields (e.g., descriptions, comments)
    /// let config = FromCsvConfig {
    ///     max_cell_size: 5_242_880, // 5MB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_cell_size: usize,

    /// Maximum total CSV size in bytes after decompression (default: 100MB).
    ///
    /// This security limit prevents "decompression bomb" attacks where compressed
    /// CSV files decompress to enormous sizes. A 1MB gzipped file could decompress
    /// to 1GB+, bypassing file size checks.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents decompression bombs
    /// - **Memory Bound**: Tracks total bytes read during parsing
    /// - **Transparent**: Works even if CSV library handles decompression
    /// - **Recommended Values**:
    ///   - Web uploads: 10MB - 100MB
    ///   - Internal processing: 100MB - 1GB
    ///   - Big data: Adjust based on available RAM
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing large datasets on high-memory servers
    /// let config = FromCsvConfig {
    ///     max_total_size: 1_073_741_824, // 1GB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_total_size: usize,

    /// Maximum size of header row in bytes (default: 1MB).
    ///
    /// This security limit prevents "header bomb" attacks where malicious CSV files
    /// have enormous column names or excessive total header size.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents huge column names (e.g., 1MB per column)
    /// - **Memory Bound**: Limits memory for header parsing
    /// - **Combined with `max_columns`**: Total size = `column_count` × `avg_name_length`
    /// - **Recommended Values**:
    ///   - Web uploads: 64KB - 1MB
    ///   - Internal processing: 1MB - 10MB
    ///   - Verbose column naming: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For datasets with very descriptive column names
    /// let config = FromCsvConfig {
    ///     max_header_size: 5_242_880, // 5MB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_header_size: usize,
}
411
412impl Default for FromCsvConfig {
413    fn default() -> Self {
414        Self {
415            delimiter: b',',
416            has_headers: true,
417            trim: true,
418            max_rows: DEFAULT_MAX_ROWS,
419            infer_schema: false,
420            sample_rows: 100,
421            list_key: None,
422            max_columns: DEFAULT_MAX_COLUMNS,
423            max_cell_size: DEFAULT_MAX_CELL_SIZE,
424            max_total_size: DEFAULT_MAX_TOTAL_SIZE,
425            max_header_size: DEFAULT_MAX_HEADER_SIZE,
426        }
427    }
428}
429
430impl FromCsvConfig {
431    /// Creates a config with NO security limits (use for trusted input only).
432    ///
433    /// # Security Warning
434    ///
435    /// This configuration disables ALL security limits. Only use this for:
436    /// - Trusted internal data sources
437    /// - Controlled batch processing environments
438    /// - Known-good CSV files
439    ///
440    /// **DO NOT** use this for:
441    /// - User uploads
442    /// - Web service inputs
443    /// - Untrusted data sources
444    ///
445    /// # Examples
446    ///
447    /// ```
448    /// # use hedl_csv::FromCsvConfig;
449    /// // For internal batch processing with trusted data
450    /// let config = FromCsvConfig::unlimited();
451    /// ```
452    #[must_use]
453    pub fn unlimited() -> Self {
454        Self {
455            max_rows: usize::MAX,
456            max_columns: usize::MAX,
457            max_cell_size: usize::MAX,
458            max_total_size: usize::MAX,
459            max_header_size: usize::MAX,
460            ..Default::default()
461        }
462    }
463
464    /// Creates a config with strict limits for untrusted input.
465    ///
466    /// # Security
467    ///
468    /// This configuration provides stricter limits suitable for:
469    /// - Web service uploads
470    /// - User-submitted CSV files
471    /// - Untrusted data sources
472    /// - Rate-limited APIs
473    ///
474    /// # Limits
475    ///
476    /// - `max_rows`: 1,000,000 (same as default)
477    /// - `max_columns`: 1,000 (stricter than default 10,000)
478    /// - `max_cell_size`: 64KB (stricter than default 1MB)
479    /// - `max_total_size`: 10MB (stricter than default 100MB)
480    /// - `max_header_size`: 64KB (stricter than default 1MB)
481    ///
482    /// # Examples
483    ///
484    /// ```
485    /// # use hedl_csv::FromCsvConfig;
486    /// // For user uploads in a web service
487    /// let config = FromCsvConfig::strict();
488    /// ```
489    #[must_use]
490    pub fn strict() -> Self {
491        Self {
492            max_rows: 1_000_000,
493            max_columns: 1_000,
494            max_cell_size: 65_536,
495            max_total_size: 10_485_760,
496            max_header_size: 65_536,
497            ..Default::default()
498        }
499    }
500}