hedl_csv/from_csv/conversion.rs
1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6
7//! Main CSV to HEDL conversion logic
8
9use crate::error::{CsvError, Result};
10use crate::from_csv::config::FromCsvConfig;
11use crate::from_csv::parsing::{parse_csv_value, parse_csv_value_with_type};
12use crate::from_csv::schema_inference::{infer_column_types, ColumnType};
13use crate::from_csv::validation::{validate_cell, validate_headers, CsvSizeTracker};
14use hedl_core::{Document, Item, MatrixList, Node};
15use std::io::Read;
16
17/// Parse CSV string into a HEDL document with default configuration.
18///
19/// This is a convenience wrapper around `from_csv_with_config` using default settings.
20pub fn from_csv(csv: &str, type_name: &str, schema: &[&str]) -> Result<Document> {
21 from_csv_with_config(csv, type_name, schema, FromCsvConfig::default())
22}
23
24/// Parse CSV string into a HEDL document with custom configuration.
25///
26/// This function provides full control over CSV parsing behavior through `FromCsvConfig`.
27///
28/// # Arguments
29///
30/// * `csv` - The CSV string to parse
31/// * `type_name` - The HEDL type name for rows
32/// * `schema` - Column names excluding the 'id' column
33/// * `config` - Configuration controlling delimiter, headers, trimming, and row limits
34///
35/// # Examples
36///
37/// ## Tab-Separated Values (TSV)
38///
39/// ```
40/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
41///
42/// let tsv_data = "id\tname\tage\n1\tAlice\t30";
43/// let config = FromCsvConfig {
44/// delimiter: b'\t',
45/// ..Default::default()
46/// };
47/// let doc = from_csv_with_config(tsv_data, "Person", &["name", "age"], config).unwrap();
48/// ```
49///
50/// ## Custom Row Limit
51///
52/// ```
53/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
54///
55/// let config = FromCsvConfig {
56/// max_rows: 10_000_000, // Allow up to 10M rows
57/// ..Default::default()
58/// };
59/// let csv_data = "id,value\n1,test";
60/// let doc = from_csv_with_config(csv_data, "Item", &["value"], config).unwrap();
61/// ```
62///
63/// ## Disable Whitespace Trimming
64///
65/// ```
66/// use hedl_csv::{from_csv_with_config, FromCsvConfig};
67/// use hedl_core::Value;
68///
69/// let csv_data = "id,name\n1, Alice ";
70/// let config = FromCsvConfig {
71/// trim: false,
72/// ..Default::default()
73/// };
74/// let doc = from_csv_with_config(csv_data, "Person", &["name"], config).unwrap();
75///
76/// let list = doc.get("persons").unwrap().as_list().unwrap();
77/// assert_eq!(list.rows[0].fields[1], Value::String(" Alice ".to_string().into()));
78/// ```
79///
80/// # See Also
81///
82/// - `from_csv` - Convenience function with default configuration
83/// - `from_csv_reader_with_config` - For streaming from files/network
84pub fn from_csv_with_config(
85 csv: &str,
86 type_name: &str,
87 schema: &[&str],
88 config: FromCsvConfig,
89) -> Result<Document> {
90 from_csv_reader_with_config(csv.as_bytes(), type_name, schema, config)
91}
92
93/// Parse CSV from a reader into a HEDL document with default configuration.
94///
95/// This function is useful for processing CSV files or network streams without
96/// loading the entire content into memory first.
97///
98/// # Arguments
99///
100/// * `reader` - Any type implementing `Read` (e.g., `File`, `TcpStream`, `&[u8]`)
101/// * `type_name` - The HEDL type name for rows
102/// * `schema` - Column names excluding the 'id' column
103///
104/// # Examples
105///
106/// ## Reading from a File
107///
108/// ```no_run
109/// use hedl_csv::from_csv_reader;
110/// use std::fs::File;
111///
112/// let file = File::open("data.csv").unwrap();
113/// let doc = from_csv_reader(file, "Person", &["name", "age"]).unwrap();
114/// ```
115///
116/// ## Reading from a Byte Slice
117///
118/// ```
119/// use hedl_csv::from_csv_reader;
120///
121/// let csv_bytes = b"id,name\n1,Alice";
122/// let doc = from_csv_reader(&csv_bytes[..], "Person", &["name"]).unwrap();
123/// ```
124///
125/// ## Reading from Standard Input
126///
127/// ```no_run
128/// use hedl_csv::from_csv_reader;
129/// use std::io;
130///
131/// let stdin = io::stdin();
132/// let doc = from_csv_reader(stdin.lock(), "Record", &["field1", "field2"]).unwrap();
133/// ```
134///
135/// # Performance
136///
137/// This function uses streaming I/O to minimize memory usage. The CSV data is
138/// processed row-by-row without buffering the entire file.
139///
140/// # See Also
141///
142/// - `from_csv_reader_with_config` - For custom delimiters and limits
143/// - `from_csv` - For parsing CSV strings
144pub fn from_csv_reader<R: Read>(reader: R, type_name: &str, schema: &[&str]) -> Result<Document> {
145 from_csv_reader_with_config(reader, type_name, schema, FromCsvConfig::default())
146}
147
148/// Converts a CSV file to a HEDL Document with custom configuration.
149///
150/// This is the main conversion function that handles CSV parsing with
151/// configurable options for delimiters, headers, trimming, and limits.
152pub fn from_csv_reader_with_config<R: Read>(
153 reader: R,
154 type_name: &str,
155 schema: &[&str],
156 config: FromCsvConfig,
157) -> Result<Document> {
158 let mut csv_reader = csv::ReaderBuilder::new()
159 .delimiter(config.delimiter)
160 .has_headers(config.has_headers)
161 .trim(if config.trim {
162 csv::Trim::All
163 } else {
164 csv::Trim::None
165 })
166 .from_reader(reader);
167
168 let mut doc = Document::new((2, 0));
169
170 // Create schema with 'id' column
171 let mut full_schema = vec!["id".to_string()];
172 full_schema.extend(schema.iter().map(|s| (*s).to_string()));
173
174 // Register the struct type
175 doc.structs
176 .insert(type_name.to_string(), full_schema.clone());
177
178 // Create matrix list
179 let mut matrix_list = MatrixList::new(type_name, full_schema.clone());
180
181 // VALIDATE HEADERS if has_headers is enabled
182 let headers = csv_reader.headers().map_err(|e| CsvError::ParseError {
183 line: 0,
184 message: e.to_string(),
185 })?;
186
187 validate_headers(headers, &config)?;
188
189 // Initialize size tracker
190 let mut size_tracker = CsvSizeTracker::new(config.max_total_size);
191
192 // Track header size
193 let header_size: usize = headers.iter().map(str::len).sum();
194 size_tracker.bytes_read += header_size;
195
196 // If schema inference is enabled, collect records first
197 let _inferred_types = if config.infer_schema {
198 // Collect records for sampling
199 let mut all_records = Vec::new();
200 for (record_idx, result) in csv_reader.records().enumerate() {
201 // Security: Limit row count to prevent memory exhaustion
202 if record_idx >= config.max_rows {
203 return Err(CsvError::SecurityLimit {
204 limit: config.max_rows,
205 actual: record_idx + 1,
206 });
207 }
208
209 let record = result.map_err(|e| CsvError::ParseError {
210 line: record_idx + 1,
211 message: e.to_string(),
212 })?;
213
214 if record.is_empty() {
215 continue;
216 }
217
218 // VALIDATE TOTAL SIZE
219 size_tracker.track_record(&record)?;
220
221 // VALIDATE EACH CELL
222 for (col_idx, cell) in record.iter().enumerate() {
223 validate_cell(cell, record_idx + 1, col_idx, &config)?;
224 }
225
226 // Convert StringRecord to Vec<String>
227 let row: Vec<String> = record
228 .iter()
229 .map(std::string::ToString::to_string)
230 .collect();
231 all_records.push(row);
232 }
233
234 // Infer column types from sampled records
235 let types = infer_column_types(&all_records, config.sample_rows);
236
237 // Process all records with inferred types
238 for (record_idx, row) in all_records.iter().enumerate() {
239 // First column is the ID
240 let id = row
241 .first()
242 .ok_or_else(|| CsvError::MissingColumn("id".to_string()))?;
243
244 if id.is_empty() {
245 return Err(CsvError::EmptyId {
246 row: record_idx + 1,
247 });
248 }
249
250 // Parse ALL fields (including ID) with inferred types
251 let mut fields = Vec::new();
252 for (field_idx, field) in row.iter().enumerate() {
253 let col_type = types.get(field_idx).copied().unwrap_or(ColumnType::String);
254 let value = parse_csv_value_with_type(field, col_type).map_err(|e| {
255 e.with_context(format!(
256 "in column '{}' at line {}",
257 full_schema.get(field_idx).unwrap_or(&"unknown".to_string()),
258 record_idx + 1
259 ))
260 })?;
261 fields.push(value);
262 }
263
264 // Check field count matches full schema (including ID)
265 if fields.len() != full_schema.len() {
266 return Err(CsvError::WidthMismatch {
267 expected: full_schema.len(),
268 actual: fields.len(),
269 row: record_idx + 1,
270 });
271 }
272
273 let node = Node::new(type_name, id, fields);
274 matrix_list.add_row(node);
275 }
276
277 types
278 } else {
279 // Standard parsing without schema inference
280 for (record_idx, result) in csv_reader.records().enumerate() {
281 // Security: Limit row count to prevent memory exhaustion
282 if record_idx >= config.max_rows {
283 return Err(CsvError::SecurityLimit {
284 limit: config.max_rows,
285 actual: record_idx + 1,
286 });
287 }
288
289 let record = result.map_err(|e| CsvError::ParseError {
290 line: record_idx + 1,
291 message: e.to_string(),
292 })?;
293
294 if record.is_empty() {
295 continue;
296 }
297
298 // VALIDATE TOTAL SIZE
299 size_tracker.track_record(&record)?;
300
301 // VALIDATE EACH CELL
302 for (col_idx, cell) in record.iter().enumerate() {
303 validate_cell(cell, record_idx + 1, col_idx, &config)?;
304 }
305
306 // First column is the ID
307 let id = record
308 .get(0)
309 .ok_or_else(|| CsvError::MissingColumn("id".to_string()))?;
310
311 if id.is_empty() {
312 return Err(CsvError::EmptyId {
313 row: record_idx + 1,
314 });
315 }
316
317 // Parse ALL fields (including ID) per SPEC
318 let mut fields = Vec::new();
319 for (field_idx, field) in record.iter().enumerate() {
320 let value = parse_csv_value(field).map_err(|e| {
321 e.with_context(format!(
322 "in column '{}' at line {}",
323 full_schema.get(field_idx).unwrap_or(&"unknown".to_string()),
324 record_idx + 1
325 ))
326 })?;
327 fields.push(value);
328 }
329
330 // Check field count matches full schema (including ID)
331 if fields.len() != full_schema.len() {
332 return Err(CsvError::WidthMismatch {
333 expected: full_schema.len(),
334 actual: fields.len(),
335 row: record_idx + 1,
336 });
337 }
338
339 let node = Node::new(type_name, id, fields);
340 matrix_list.add_row(node);
341 }
342
343 Vec::new()
344 };
345
346 // Add matrix list to document with custom or default key
347 let list_key = config
348 .list_key
349 .unwrap_or_else(|| format!("{}s", type_name.to_lowercase()));
350
351 doc.root.insert(list_key, Item::List(matrix_list));
352
353 Ok(doc)
354}
355
356// Note: parse_csv_value function is in parsing.rs module