Skip to main content

hedl_csv/from_csv/
conversion.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6
7//! Main CSV to HEDL conversion logic
8
9use crate::error::{CsvError, Result};
10use crate::from_csv::config::FromCsvConfig;
11use crate::from_csv::parsing::{parse_csv_value, parse_csv_value_with_type};
12use crate::from_csv::schema_inference::{infer_column_types, ColumnType};
13use crate::from_csv::validation::{validate_cell, validate_headers, CsvSizeTracker};
14use hedl_core::{Document, Item, MatrixList, Node};
15use std::io::Read;
16
17/// Parse CSV string into a HEDL document with default configuration.
18///
19/// This is a convenience wrapper around `from_csv_with_config` using default settings.
20pub fn from_csv(csv: &str, type_name: &str, schema: &[&str]) -> Result<Document> {
21    from_csv_with_config(csv, type_name, schema, FromCsvConfig::default())
22}
23
24/// Parse CSV string into a HEDL document with custom configuration.
25///
26/// This function provides full control over CSV parsing behavior through `FromCsvConfig`.
27///
28/// # Arguments
29///
30/// * `csv` - The CSV string to parse
31/// * `type_name` - The HEDL type name for rows
32/// * `schema` - Column names excluding the 'id' column
33/// * `config` - Configuration controlling delimiter, headers, trimming, and row limits
34///
35/// # Examples
36///
37/// ## Tab-Separated Values (TSV)
38///
39/// ```
40/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
41///
42/// let tsv_data = "id\tname\tage\n1\tAlice\t30";
43/// let config = FromCsvConfig {
44///     delimiter: b'\t',
45///     ..Default::default()
46/// };
47/// let doc = from_csv_with_config(tsv_data, "Person", &["name", "age"], config).unwrap();
48/// ```
49///
50/// ## Custom Row Limit
51///
52/// ```
53/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
54///
55/// let config = FromCsvConfig {
56///     max_rows: 10_000_000, // Allow up to 10M rows
57///     ..Default::default()
58/// };
59/// let csv_data = "id,value\n1,test";
60/// let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
61/// ```
62///
63/// ## Disable Whitespace Trimming
64///
65/// ```
66/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
67/// use hedl_core::Value;
68///
69/// let csv_data = "id,name\n1,  Alice  ";
70/// let config = FromCsvConfig {
71///     trim: false,
72///     ..Default::default()
73/// };
74/// let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
75///
76/// let list = doc.get("persons").unwrap().as_list().unwrap();
77/// assert_eq!(list.rows[0].fields[1], Value::String("  Alice  ".to_string().into()));
78/// ```
79///
80/// # See Also
81///
82/// - `from_csv` - Convenience function with default configuration
83/// - `from_csv_reader_with_config` - For streaming from files/network
84pub fn from_csv_with_config(
85    csv: &str,
86    type_name: &str,
87    schema: &[&str],
88    config: FromCsvConfig,
89) -> Result<Document> {
90    from_csv_reader_with_config(csv.as_bytes(), type_name, schema, config)
91}
92
93/// Parse CSV from a reader into a HEDL document with default configuration.
94///
95/// This function is useful for processing CSV files or network streams without
96/// loading the entire content into memory first.
97///
98/// # Arguments
99///
100/// * `reader` - Any type implementing `Read` (e.g., `File`, `TcpStream`, `&[u8]`)
101/// * `type_name` - The HEDL type name for rows
102/// * `schema` - Column names excluding the 'id' column
103///
104/// # Examples
105///
106/// ## Reading from a File
107///
108/// ```no_run
109/// use hedl_csv::from_csv_reader;
110/// use std::fs::File;
111///
112/// let file = File::open("data.csv").unwrap();
113/// let doc = from_csv_reader(file, "Person", &["name", "age"]).unwrap();
114/// ```
115///
116/// ## Reading from a Byte Slice
117///
118/// ```
119/// use hedl_csv::from_csv_reader;
120///
121/// let csv_bytes = b"id,name\n1,Alice";
122/// let doc = from_csv_reader(&csv_bytes[..], "Person", &["name"]).unwrap();
123/// ```
124///
125/// ## Reading from Standard Input
126///
127/// ```no_run
128/// use hedl_csv::from_csv_reader;
129/// use std::io;
130///
131/// let stdin = io::stdin();
132/// let doc = from_csv_reader(stdin.lock(), "Record", &["field1", "field2"]).unwrap();
133/// ```
134///
135/// # Performance
136///
137/// This function uses streaming I/O to minimize memory usage. The CSV data is
138/// processed row-by-row without buffering the entire file.
139///
140/// # See Also
141///
142/// - `from_csv_reader_with_config` - For custom delimiters and limits
143/// - `from_csv` - For parsing CSV strings
144pub fn from_csv_reader<R: Read>(reader: R, type_name: &str, schema: &[&str]) -> Result<Document> {
145    from_csv_reader_with_config(reader, type_name, schema, FromCsvConfig::default())
146}
147
148/// Converts a CSV file to a HEDL Document with custom configuration.
149///
150/// This is the main conversion function that handles CSV parsing with
151/// configurable options for delimiters, headers, trimming, and limits.
152pub fn from_csv_reader_with_config<R: Read>(
153    reader: R,
154    type_name: &str,
155    schema: &[&str],
156    config: FromCsvConfig,
157) -> Result<Document> {
158    let mut csv_reader = csv::ReaderBuilder::new()
159        .delimiter(config.delimiter)
160        .has_headers(config.has_headers)
161        .trim(if config.trim {
162            csv::Trim::All
163        } else {
164            csv::Trim::None
165        })
166        .from_reader(reader);
167
168    let mut doc = Document::new((2, 0));
169
170    // Create schema with 'id' column
171    let mut full_schema = vec!["id".to_string()];
172    full_schema.extend(schema.iter().map(|s| (*s).to_string()));
173
174    // Register the struct type
175    doc.structs
176        .insert(type_name.to_string(), full_schema.clone());
177
178    // Create matrix list
179    let mut matrix_list = MatrixList::new(type_name, full_schema.clone());
180
181    // VALIDATE HEADERS if has_headers is enabled
182    let headers = csv_reader.headers().map_err(|e| CsvError::ParseError {
183        line: 0,
184        message: e.to_string(),
185    })?;
186
187    validate_headers(headers, &config)?;
188
189    // Initialize size tracker
190    let mut size_tracker = CsvSizeTracker::new(config.max_total_size);
191
192    // Track header size
193    let header_size: usize = headers.iter().map(str::len).sum();
194    size_tracker.bytes_read += header_size;
195
196    // If schema inference is enabled, collect records first
197    let _inferred_types = if config.infer_schema {
198        // Collect records for sampling
199        let mut all_records = Vec::new();
200        for (record_idx, result) in csv_reader.records().enumerate() {
201            // Security: Limit row count to prevent memory exhaustion
202            if record_idx >= config.max_rows {
203                return Err(CsvError::SecurityLimit {
204                    limit: config.max_rows,
205                    actual: record_idx + 1,
206                });
207            }
208
209            let record = result.map_err(|e| CsvError::ParseError {
210                line: record_idx + 1,
211                message: e.to_string(),
212            })?;
213
214            if record.is_empty() {
215                continue;
216            }
217
218            // VALIDATE TOTAL SIZE
219            size_tracker.track_record(&record)?;
220
221            // VALIDATE EACH CELL
222            for (col_idx, cell) in record.iter().enumerate() {
223                validate_cell(cell, record_idx + 1, col_idx, &config)?;
224            }
225
226            // Convert StringRecord to Vec<String>
227            let row: Vec<String> = record
228                .iter()
229                .map(std::string::ToString::to_string)
230                .collect();
231            all_records.push(row);
232        }
233
234        // Infer column types from sampled records
235        let types = infer_column_types(&all_records, config.sample_rows);
236
237        // Process all records with inferred types
238        for (record_idx, row) in all_records.iter().enumerate() {
239            // First column is the ID
240            let id = row
241                .first()
242                .ok_or_else(|| CsvError::MissingColumn("id".to_string()))?;
243
244            if id.is_empty() {
245                return Err(CsvError::EmptyId {
246                    row: record_idx + 1,
247                });
248            }
249
250            // Parse ALL fields (including ID) with inferred types
251            let mut fields = Vec::new();
252            for (field_idx, field) in row.iter().enumerate() {
253                let col_type = types.get(field_idx).copied().unwrap_or(ColumnType::String);
254                let value = parse_csv_value_with_type(field, col_type).map_err(|e| {
255                    e.with_context(format!(
256                        "in column '{}' at line {}",
257                        full_schema.get(field_idx).unwrap_or(&"unknown".to_string()),
258                        record_idx + 1
259                    ))
260                })?;
261                fields.push(value);
262            }
263
264            // Check field count matches full schema (including ID)
265            if fields.len() != full_schema.len() {
266                return Err(CsvError::WidthMismatch {
267                    expected: full_schema.len(),
268                    actual: fields.len(),
269                    row: record_idx + 1,
270                });
271            }
272
273            let node = Node::new(type_name, id, fields);
274            matrix_list.add_row(node);
275        }
276
277        types
278    } else {
279        // Standard parsing without schema inference
280        for (record_idx, result) in csv_reader.records().enumerate() {
281            // Security: Limit row count to prevent memory exhaustion
282            if record_idx >= config.max_rows {
283                return Err(CsvError::SecurityLimit {
284                    limit: config.max_rows,
285                    actual: record_idx + 1,
286                });
287            }
288
289            let record = result.map_err(|e| CsvError::ParseError {
290                line: record_idx + 1,
291                message: e.to_string(),
292            })?;
293
294            if record.is_empty() {
295                continue;
296            }
297
298            // VALIDATE TOTAL SIZE
299            size_tracker.track_record(&record)?;
300
301            // VALIDATE EACH CELL
302            for (col_idx, cell) in record.iter().enumerate() {
303                validate_cell(cell, record_idx + 1, col_idx, &config)?;
304            }
305
306            // First column is the ID
307            let id = record
308                .get(0)
309                .ok_or_else(|| CsvError::MissingColumn("id".to_string()))?;
310
311            if id.is_empty() {
312                return Err(CsvError::EmptyId {
313                    row: record_idx + 1,
314                });
315            }
316
317            // Parse ALL fields (including ID) per SPEC
318            let mut fields = Vec::new();
319            for (field_idx, field) in record.iter().enumerate() {
320                let value = parse_csv_value(field).map_err(|e| {
321                    e.with_context(format!(
322                        "in column '{}' at line {}",
323                        full_schema.get(field_idx).unwrap_or(&"unknown".to_string()),
324                        record_idx + 1
325                    ))
326                })?;
327                fields.push(value);
328            }
329
330            // Check field count matches full schema (including ID)
331            if fields.len() != full_schema.len() {
332                return Err(CsvError::WidthMismatch {
333                    expected: full_schema.len(),
334                    actual: fields.len(),
335                    row: record_idx + 1,
336                });
337            }
338
339            let node = Node::new(type_name, id, fields);
340            matrix_list.add_row(node);
341        }
342
343        Vec::new()
344    };
345
346    // Add matrix list to document with custom or default key
347    let list_key = config
348        .list_key
349        .unwrap_or_else(|| format!("{}s", type_name.to_lowercase()));
350
351    doc.root.insert(list_key, Item::List(matrix_list));
352
353    Ok(doc)
354}
355
356// Note: parse_csv_value function is in parsing.rs module