hedl_csv/from_csv/config.rs
1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6
7//! Configuration types for CSV import
8
/// Default maximum number of rows to prevent memory exhaustion.
///
/// This limit prevents Denial-of-Service attacks from maliciously large CSV files.
/// The default is 1 million rows, which allows processing reasonably large datasets
/// while preventing unbounded memory allocation.
///
/// # Security Considerations
///
/// - **Memory exhaustion**: Without a limit, attackers could provide CSV files with
///   billions of rows, causing the application to allocate excessive memory and crash.
/// - **Configurable**: The limit can be adjusted via `FromCsvConfig::max_rows` based on
///   deployment context and available resources.
/// - **Trade-off**: Higher limits allow larger datasets but increase `DoS` risk.
///
/// # Examples
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// // Use default 1M row limit
/// let config = FromCsvConfig::default();
/// assert_eq!(config.max_rows, 1_000_000);
///
/// // Increase limit for large dataset processing
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // 10 million rows
///     ..Default::default()
/// };
/// ```
pub const DEFAULT_MAX_ROWS: usize = 1_000_000;
37
/// Default maximum number of columns to prevent column bomb attacks.
///
/// This limit prevents Denial-of-Service attacks from CSV files with excessive columns.
/// The default is 10,000 columns, which is generous but prevents abuse. It is the value
/// `FromCsvConfig::default()` assigns to `FromCsvConfig::max_columns`.
///
/// # Security Considerations
///
/// - **Column bomb**: Without a limit, attackers could provide CSV files with
///   hundreds of thousands of columns, causing memory exhaustion and slow processing.
/// - **Industry standards**: Excel limits to 16,384 columns, Google Sheets to 18,278.
/// - **Trade-off**: Higher limits allow wider datasets but increase `DoS` risk.
pub const DEFAULT_MAX_COLUMNS: usize = 10_000;
50
/// Default upper bound, in bytes, on the size of a single CSV cell.
///
/// Guards against "cell bomb" Denial-of-Service attacks, in which a CSV file carries
/// enormous individual cells. The default is 1MB (1,048,576 bytes) per cell, which is
/// reasonable for most legitimate use cases.
///
/// # Security Considerations
///
/// - **Cell bomb**: With no limit, an attacker could supply gigabyte-sized cells and
///   exhaust memory.
/// - **Cumulative effect**: Several large cells multiply the impact.
/// - **Trade-off**: Raising the limit permits larger text fields at the cost of a
///   higher `DoS` risk.
pub const DEFAULT_MAX_CELL_SIZE: usize = 1024 * 1024; // 1MB
63
/// Default upper bound, in bytes, on the total size of the CSV input.
///
/// Guards against decompression-bomb Denial-of-Service attacks, in which a small
/// compressed CSV file expands to an enormous size. The default is 100MB
/// (104,857,600 bytes).
///
/// # Security Considerations
///
/// - **Decompression bomb**: A 1MB gzipped file could decompress to 1GB+.
/// - **Memory exhaustion**: Stops attackers from filling server memory.
/// - **Trade-off**: Raising the limit permits larger datasets at the cost of a
///   higher `DoS` risk.
pub const DEFAULT_MAX_TOTAL_SIZE: usize = 100 * 1024 * 1024; // 100MB
75
/// Default upper bound, in bytes, on the total size of the CSV header row.
///
/// Guards against "header bomb" Denial-of-Service attacks, in which a CSV file
/// carries enormous headers. The default is 1MB (1,048,576 bytes) for the whole
/// header.
///
/// # Security Considerations
///
/// - **Header bomb**: Stops attackers from using huge column names.
/// - **Per-column**: Individual column names are also bounded via `max_cell_size`.
/// - **Trade-off**: Raising the limit permits longer column names at the cost of a
///   higher `DoS` risk.
pub const DEFAULT_MAX_HEADER_SIZE: usize = 1024 * 1024; // 1MB
87
/// Configuration for CSV parsing.
///
/// This structure controls all aspects of CSV parsing behavior, including delimiters,
/// headers, whitespace handling, security limits, and custom list naming.
///
/// The type implements `PartialEq` and `Eq`, so two configurations can be compared
/// directly (for example with `assert_eq!` in tests).
///
/// # Examples
///
/// ## Default Configuration
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig::default();
/// assert_eq!(config.delimiter, b',');
/// assert!(config.has_headers);
/// assert!(config.trim);
/// assert_eq!(config.max_rows, 1_000_000);
/// assert_eq!(config.list_key, None);
/// ```
///
/// ## Tab-Delimited without Headers
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     delimiter: b'\t',
///     has_headers: false,
///     ..Default::default()
/// };
/// ```
///
/// ## Custom Row Limit for Large Datasets
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     max_rows: 10_000_000, // Allow up to 10M rows
///     ..Default::default()
/// };
/// ```
///
/// ## Disable Whitespace Trimming
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     trim: false,
///     ..Default::default()
/// };
/// ```
///
/// ## Enable Schema Inference
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// let config = FromCsvConfig {
///     infer_schema: true,
///     sample_rows: 200, // Sample first 200 rows
///     ..Default::default()
/// };
/// ```
///
/// ## Custom List Key for Irregular Plurals
///
/// ```
/// # use hedl_csv::FromCsvConfig;
/// // For "Person" type, use "people" instead of default "persons"
/// let config = FromCsvConfig {
///     list_key: Some("people".to_string()),
///     ..Default::default()
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FromCsvConfig {
    /// Field delimiter character (default: `,`).
    ///
    /// Common alternatives:
    /// - `b'\t'` - Tab-separated values (TSV)
    /// - `b';'` - Semicolon-separated (common in European locales)
    /// - `b'|'` - Pipe-separated
    pub delimiter: u8,

    /// Whether the first row contains column headers (default: `true`).
    ///
    /// When `true`, the first row is interpreted as column names and not included
    /// in the data. When `false`, all rows are treated as data.
    pub has_headers: bool,

    /// Whether to trim leading/trailing whitespace from fields (default: `true`).
    ///
    /// When `true`, fields like `" value "` become `"value"`. This is generally
    /// recommended to handle inconsistently formatted CSV files.
    pub trim: bool,

    /// Maximum number of rows to parse (default: 1,000,000).
    ///
    /// This security limit prevents memory exhaustion from maliciously large CSV files.
    /// Processing stops with an error if more rows are encountered.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from causing memory exhaustion
    /// - **Memory Bound**: Limits worst-case memory usage to approximately
    ///   `max_rows × avg_row_size`
    /// - **Recommended Values**:
    ///   - Small deployments: 100,000 - 1,000,000 rows
    ///   - Large deployments: 1,000,000 - 10,000,000 rows
    ///   - Batch processing: Adjust based on available RAM
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing very large datasets on a high-memory server
    /// let config = FromCsvConfig {
    ///     max_rows: 50_000_000,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_rows: usize,

    /// Whether to automatically infer column types from data (default: `false`).
    ///
    /// When `true`, the parser samples the first `sample_rows` to determine the
    /// most specific type for each column. When `false`, uses standard per-value
    /// type inference.
    ///
    /// # Type Inference Hierarchy (most to least specific)
    ///
    /// 1. **Null**: All values are empty/null
    /// 2. **Bool**: All values are "true" or "false"
    /// 3. **Int**: All values parse as integers
    /// 4. **Float**: All values parse as floats
    /// 5. **String**: Fallback for all other cases
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// let config = FromCsvConfig {
    ///     infer_schema: true,
    ///     sample_rows: 100,
    ///     ..Default::default()
    /// };
    /// ```
    pub infer_schema: bool,

    /// Number of rows to sample for schema inference (default: 100).
    ///
    /// Only used when `infer_schema` is `true`. Larger sample sizes provide
    /// more accurate type detection but slower initial processing.
    ///
    /// # Trade-offs
    ///
    /// - **Small (10-50)**: Fast inference, may miss edge cases
    /// - **Medium (100-500)**: Balanced accuracy and performance
    /// - **Large (1000+)**: High accuracy, slower for large datasets
    pub sample_rows: usize,

    /// Custom key name for the matrix list in the document (default: `None`).
    ///
    /// When `None`, the list key is automatically generated by adding 's' to the
    /// lowercased type name (e.g., "Person" → "persons"). When `Some`, uses the
    /// specified custom key instead.
    ///
    /// # Use Cases
    ///
    /// - **Irregular Plurals**: "Person" → "people" instead of "persons"
    /// - **Collective Nouns**: "Data" → "dataset" instead of "datas"
    /// - **Custom Naming**: Any non-standard naming convention
    /// - **Case-Sensitive Keys**: Preserve specific casing requirements
    ///
    /// # Examples
    ///
    /// ## Irregular Plural
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,name\n1,Alice\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("people".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Person", &["name"], config).unwrap();
    /// assert!(doc.get("people").is_some()); // Uses custom plural
    /// assert!(doc.get("persons").is_none()); // Default plural not used
    /// ```
    ///
    /// ## Collective Noun
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,value\n1,42\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("dataset".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Data", &["value"], config).unwrap();
    /// assert!(doc.get("dataset").is_some());
    /// ```
    ///
    /// ## Case-Sensitive Key
    ///
    /// ```
    /// # use hedl_csv::{from_csv_with_config, FromCsvConfig};
    /// let csv = "id,value\n1,test\n";
    /// let config = FromCsvConfig {
    ///     list_key: Some("MyCustomList".to_string()),
    ///     ..Default::default()
    /// };
    /// let doc = from_csv_with_config(csv, "Item", &["value"], config).unwrap();
    /// assert!(doc.get("MyCustomList").is_some());
    /// ```
    pub list_key: Option<String>,

    /// Maximum number of columns allowed (default: 10,000).
    ///
    /// This security limit prevents "column bomb" attacks where malicious CSV files
    /// contain excessive columns that cause memory exhaustion and slow processing.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from creating CSVs with 50,000+ columns
    /// - **Memory Bound**: Limits worst-case memory usage for column metadata
    /// - **Industry Comparison**: Excel (16,384), Google Sheets (18,278), `PostgreSQL` (~1,600)
    /// - **Recommended Values**:
    ///   - Web uploads: 1,000 - 10,000 columns
    ///   - Internal processing: 10,000 - 50,000 columns
    ///   - Scientific data: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing wide scientific datasets
    /// let config = FromCsvConfig {
    ///     max_columns: 50_000,
    ///     ..Default::default()
    /// };
    /// ```
    pub max_columns: usize,

    /// Maximum size of a single cell in bytes (default: 1MB).
    ///
    /// This security limit prevents "cell bomb" attacks where malicious CSV files
    /// contain enormous individual cells that cause memory exhaustion.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents attackers from using 10MB+ cells
    /// - **Memory Bound**: Each cell is read into memory as a String
    /// - **Cumulative**: Multiple large cells multiply the impact
    /// - **Recommended Values**:
    ///   - Web uploads: 64KB - 1MB
    ///   - Internal processing: 1MB - 10MB
    ///   - Text-heavy data: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing long text fields (e.g., descriptions, comments)
    /// let config = FromCsvConfig {
    ///     max_cell_size: 5_242_880, // 5MB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_cell_size: usize,

    /// Maximum total CSV size in bytes after decompression (default: 100MB).
    ///
    /// This security limit prevents "decompression bomb" attacks where compressed
    /// CSV files decompress to enormous sizes. A 1MB gzipped file could decompress
    /// to 1GB+, bypassing file size checks.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents decompression bombs
    /// - **Memory Bound**: Tracks total bytes read during parsing
    /// - **Transparent**: Works even if CSV library handles decompression
    /// - **Recommended Values**:
    ///   - Web uploads: 10MB - 100MB
    ///   - Internal processing: 100MB - 1GB
    ///   - Big data: Adjust based on available RAM
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For processing large datasets on high-memory servers
    /// let config = FromCsvConfig {
    ///     max_total_size: 1_073_741_824, // 1GB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_total_size: usize,

    /// Maximum size of header row in bytes (default: 1MB).
    ///
    /// This security limit prevents "header bomb" attacks where malicious CSV files
    /// have enormous column names or excessive total header size.
    ///
    /// # Security Impact
    ///
    /// - **`DoS` Protection**: Prevents huge column names (e.g., 1MB per column)
    /// - **Memory Bound**: Limits memory for header parsing
    /// - **Combined with `max_columns`**: Total size = `column_count` × `avg_name_length`
    /// - **Recommended Values**:
    ///   - Web uploads: 64KB - 1MB
    ///   - Internal processing: 1MB - 10MB
    ///   - Verbose column naming: Adjust based on requirements
    ///
    /// # Example
    ///
    /// ```
    /// # use hedl_csv::FromCsvConfig;
    /// // For datasets with very descriptive column names
    /// let config = FromCsvConfig {
    ///     max_header_size: 5_242_880, // 5MB
    ///     ..Default::default()
    /// };
    /// ```
    pub max_header_size: usize,
}
411
412impl Default for FromCsvConfig {
413 fn default() -> Self {
414 Self {
415 delimiter: b',',
416 has_headers: true,
417 trim: true,
418 max_rows: DEFAULT_MAX_ROWS,
419 infer_schema: false,
420 sample_rows: 100,
421 list_key: None,
422 max_columns: DEFAULT_MAX_COLUMNS,
423 max_cell_size: DEFAULT_MAX_CELL_SIZE,
424 max_total_size: DEFAULT_MAX_TOTAL_SIZE,
425 max_header_size: DEFAULT_MAX_HEADER_SIZE,
426 }
427 }
428}
429
430impl FromCsvConfig {
431 /// Creates a config with NO security limits (use for trusted input only).
432 ///
433 /// # Security Warning
434 ///
435 /// This configuration disables ALL security limits. Only use this for:
436 /// - Trusted internal data sources
437 /// - Controlled batch processing environments
438 /// - Known-good CSV files
439 ///
440 /// **DO NOT** use this for:
441 /// - User uploads
442 /// - Web service inputs
443 /// - Untrusted data sources
444 ///
445 /// # Examples
446 ///
447 /// ```
448 /// # use hedl_csv::FromCsvConfig;
449 /// // For internal batch processing with trusted data
450 /// let config = FromCsvConfig::unlimited();
451 /// ```
452 #[must_use]
453 pub fn unlimited() -> Self {
454 Self {
455 max_rows: usize::MAX,
456 max_columns: usize::MAX,
457 max_cell_size: usize::MAX,
458 max_total_size: usize::MAX,
459 max_header_size: usize::MAX,
460 ..Default::default()
461 }
462 }
463
464 /// Creates a config with strict limits for untrusted input.
465 ///
466 /// # Security
467 ///
468 /// This configuration provides stricter limits suitable for:
469 /// - Web service uploads
470 /// - User-submitted CSV files
471 /// - Untrusted data sources
472 /// - Rate-limited APIs
473 ///
474 /// # Limits
475 ///
476 /// - `max_rows`: 1,000,000 (same as default)
477 /// - `max_columns`: 1,000 (stricter than default 10,000)
478 /// - `max_cell_size`: 64KB (stricter than default 1MB)
479 /// - `max_total_size`: 10MB (stricter than default 100MB)
480 /// - `max_header_size`: 64KB (stricter than default 1MB)
481 ///
482 /// # Examples
483 ///
484 /// ```
485 /// # use hedl_csv::FromCsvConfig;
486 /// // For user uploads in a web service
487 /// let config = FromCsvConfig::strict();
488 /// ```
489 #[must_use]
490 pub fn strict() -> Self {
491 Self {
492 max_rows: 1_000_000,
493 max_columns: 1_000,
494 max_cell_size: 65_536,
495 max_total_size: 10_485_760,
496 max_header_size: 65_536,
497 ..Default::default()
498 }
499 }
500}