sql_cli/data/
datatable_loaders.rs

1use crate::datatable::{DataColumn, DataRow, DataTable, DataType, DataValue};
2use anyhow::{Context, Result};
3use csv::ReaderBuilder;
4use serde_json::Value as JsonValue;
5use std::collections::HashSet;
6use std::fs::File;
7use std::io::{BufRead, BufReader, Read};
8use std::path::Path;
9
10/// Helper to detect if a field in the raw CSV line is a null (unquoted empty)
11fn is_null_field(raw_line: &str, field_index: usize) -> bool {
12    let mut comma_count = 0;
13    let mut in_quotes = false;
14    let mut field_start = 0;
15    let mut prev_char = ' ';
16
17    for (i, ch) in raw_line.chars().enumerate() {
18        if ch == '"' && prev_char != '\\' {
19            in_quotes = !in_quotes;
20        }
21
22        if ch == ',' && !in_quotes {
23            if comma_count == field_index {
24                let field_end = i;
25                let field_content = &raw_line[field_start..field_end].trim();
26                // If empty, check if it was quoted (quoted empty = empty string, unquoted empty = NULL)
27                if field_content.is_empty() {
28                    return true; // Unquoted empty field -> NULL
29                }
30                // If it starts and ends with quotes but is empty inside, it's an empty string, not NULL
31                if field_content.starts_with('"')
32                    && field_content.ends_with('"')
33                    && field_content.len() == 2
34                {
35                    return false; // Quoted empty field -> empty string
36                }
37                return false; // Non-empty field -> not NULL
38            }
39            comma_count += 1;
40            field_start = i + 1;
41        }
42        prev_char = ch;
43    }
44
45    // Check last field
46    if comma_count == field_index {
47        let field_content = raw_line[field_start..]
48            .trim()
49            .trim_end_matches('\n')
50            .trim_end_matches('\r');
51        // If empty, check if it was quoted
52        if field_content.is_empty() {
53            return true; // Unquoted empty field -> NULL
54        }
55        // If it starts and ends with quotes but is empty inside, it's an empty string, not NULL
56        if field_content.starts_with('"')
57            && field_content.ends_with('"')
58            && field_content.len() == 2
59        {
60            return false; // Quoted empty field -> empty string
61        }
62        return false; // Non-empty field -> not NULL
63    }
64
65    false // Field not found -> not NULL (shouldn't happen)
66}
67
68/// Load a CSV file into a `DataTable`
69pub fn load_csv_to_datatable<P: AsRef<Path>>(path: P, table_name: &str) -> Result<DataTable> {
70    let file = File::open(&path)
71        .with_context(|| format!("Failed to open CSV file: {:?}", path.as_ref()))?;
72
73    let mut reader = ReaderBuilder::new().has_headers(true).from_reader(file);
74
75    // Get headers and create columns
76    let headers = reader.headers()?.clone();
77    let mut table = DataTable::new(table_name);
78
79    // Add metadata about the source
80    table
81        .metadata
82        .insert("source_type".to_string(), "csv".to_string());
83    table.metadata.insert(
84        "source_path".to_string(),
85        path.as_ref().display().to_string(),
86    );
87
88    // Create columns from headers (types will be inferred later)
89    for header in &headers {
90        table.add_column(DataColumn::new(header));
91    }
92
93    // Open a second file handle for raw line reading
94    let file2 = File::open(&path).with_context(|| {
95        format!(
96            "Failed to open CSV file for raw reading: {:?}",
97            path.as_ref()
98        )
99    })?;
100    let mut line_reader = BufReader::new(file2);
101    let mut raw_line = String::new();
102    // Skip header line
103    line_reader.read_line(&mut raw_line)?;
104
105    // Read all rows first to collect data
106    let mut string_rows = Vec::new();
107    let mut raw_lines = Vec::new();
108
109    for result in reader.records() {
110        let record = result?;
111        let row: Vec<String> = record
112            .iter()
113            .map(std::string::ToString::to_string)
114            .collect();
115
116        // Read the corresponding raw line
117        raw_line.clear();
118        line_reader.read_line(&mut raw_line)?;
119        raw_lines.push(raw_line.clone());
120
121        string_rows.push(row);
122    }
123
124    // Infer column types by sampling the data
125    let mut column_types = vec![DataType::Null; headers.len()];
126    let sample_size = string_rows.len().min(100); // Sample first 100 rows for type inference
127
128    for row in string_rows.iter().take(sample_size) {
129        for (col_idx, value) in row.iter().enumerate() {
130            if !value.is_empty() {
131                let inferred = DataType::infer_from_string(value);
132                column_types[col_idx] = column_types[col_idx].merge(&inferred);
133            }
134        }
135    }
136
137    // Update column types
138    for (col_idx, column) in table.columns.iter_mut().enumerate() {
139        column.data_type = column_types[col_idx].clone();
140    }
141
142    // Convert string data to typed DataValues and add rows
143    for (row_idx, string_row) in string_rows.iter().enumerate() {
144        let mut values = Vec::new();
145        let raw_line = &raw_lines[row_idx];
146
147        for (col_idx, value) in string_row.iter().enumerate() {
148            let data_value = if value.is_empty() {
149                // Distinguish between NULL (,,) and empty string ("")
150                if is_null_field(raw_line, col_idx) {
151                    DataValue::Null
152                } else {
153                    DataValue::String(String::new())
154                }
155            } else {
156                DataValue::from_string(value, &column_types[col_idx])
157            };
158            values.push(data_value);
159        }
160        table
161            .add_row(DataRow::new(values))
162            .map_err(|e| anyhow::anyhow!(e))?;
163    }
164
165    // Update column statistics
166    table.infer_column_types();
167
168    Ok(table)
169}
170
171/// Load a JSON file into a `DataTable`
172pub fn load_json_to_datatable<P: AsRef<Path>>(path: P, table_name: &str) -> Result<DataTable> {
173    // Read file as string first to preserve key order
174    let mut file = File::open(&path)
175        .with_context(|| format!("Failed to open JSON file: {:?}", path.as_ref()))?;
176    let mut json_str = String::new();
177    file.read_to_string(&mut json_str)?;
178
179    // Parse JSON while preserving order using serde_json's preserve_order feature
180    // For now, we'll use a workaround to get keys in original order
181    let json_data: Vec<JsonValue> =
182        serde_json::from_str(&json_str).with_context(|| "Failed to parse JSON file")?;
183
184    if json_data.is_empty() {
185        return Ok(DataTable::new(table_name));
186    }
187
188    // Extract column names from first object, preserving order
189    // Parse the first object manually to get keys in order
190    let first_obj = json_data[0]
191        .as_object()
192        .context("JSON data must be an array of objects")?;
193
194    let mut table = DataTable::new(table_name);
195
196    // Add metadata
197    table
198        .metadata
199        .insert("source_type".to_string(), "json".to_string());
200    table.metadata.insert(
201        "source_path".to_string(),
202        path.as_ref().display().to_string(),
203    );
204
205    // Get all column names from the first object
206    // This preserves all columns from the JSON data
207    let column_names: Vec<String> = first_obj.keys().cloned().collect();
208    for name in &column_names {
209        table.add_column(DataColumn::new(name));
210    }
211
212    // Collect all values as strings first for type inference
213    let mut string_rows = Vec::new();
214    for json_obj in &json_data {
215        if let Some(obj) = json_obj.as_object() {
216            let mut row = Vec::new();
217            for name in &column_names {
218                let value_str = match obj.get(name) {
219                    Some(JsonValue::Null) | None => String::new(),
220                    Some(JsonValue::Bool(b)) => b.to_string(),
221                    Some(JsonValue::Number(n)) => n.to_string(),
222                    Some(JsonValue::String(s)) => s.clone(),
223                    Some(JsonValue::Array(arr)) => format!("{arr:?}"), // Arrays as debug string for now
224                    Some(JsonValue::Object(obj)) => format!("{obj:?}"), // Objects as debug string for now
225                };
226                row.push(value_str);
227            }
228            string_rows.push(row);
229        }
230    }
231
232    // Infer column types
233    let mut column_types = vec![DataType::Null; column_names.len()];
234    let sample_size = string_rows.len().min(100);
235
236    for row in string_rows.iter().take(sample_size) {
237        for (col_idx, value) in row.iter().enumerate() {
238            if !value.is_empty() {
239                let inferred = DataType::infer_from_string(value);
240                column_types[col_idx] = column_types[col_idx].merge(&inferred);
241            }
242        }
243    }
244
245    // Update column types
246    for (col_idx, column) in table.columns.iter_mut().enumerate() {
247        column.data_type = column_types[col_idx].clone();
248    }
249
250    // Convert to DataRows
251    for string_row in string_rows {
252        let mut values = Vec::new();
253        for (col_idx, value) in string_row.iter().enumerate() {
254            let data_value = DataValue::from_string(value, &column_types[col_idx]);
255            values.push(data_value);
256        }
257        table
258            .add_row(DataRow::new(values))
259            .map_err(|e| anyhow::anyhow!(e))?;
260    }
261
262    // Update statistics
263    table.infer_column_types();
264
265    Ok(table)
266}
267
268/// Load JSON data directly (already parsed) into a `DataTable`
269pub fn load_json_data_to_datatable(data: Vec<JsonValue>, table_name: &str) -> Result<DataTable> {
270    if data.is_empty() {
271        return Ok(DataTable::new(table_name));
272    }
273
274    // Extract column names from all objects (union of all keys)
275    let mut all_columns = HashSet::new();
276    for item in &data {
277        if let Some(obj) = item.as_object() {
278            for key in obj.keys() {
279                all_columns.insert(key.clone());
280            }
281        }
282    }
283
284    let column_names: Vec<String> = all_columns.into_iter().collect();
285    let mut table = DataTable::new(table_name);
286
287    // Add metadata
288    table
289        .metadata
290        .insert("source_type".to_string(), "json_data".to_string());
291
292    // Create columns
293    for name in &column_names {
294        table.add_column(DataColumn::new(name));
295    }
296
297    // Process data similar to file loading
298    let mut string_rows = Vec::new();
299    for json_obj in &data {
300        if let Some(obj) = json_obj.as_object() {
301            let mut row = Vec::new();
302            for name in &column_names {
303                let value_str = match obj.get(name) {
304                    Some(JsonValue::Null) | None => String::new(),
305                    Some(JsonValue::Bool(b)) => b.to_string(),
306                    Some(JsonValue::Number(n)) => n.to_string(),
307                    Some(JsonValue::String(s)) => s.clone(),
308                    Some(JsonValue::Array(arr)) => format!("{arr:?}"),
309                    Some(JsonValue::Object(obj)) => format!("{obj:?}"),
310                };
311                row.push(value_str);
312            }
313            string_rows.push(row);
314        }
315    }
316
317    // Infer types and convert to DataRows (same as above)
318    let mut column_types = vec![DataType::Null; column_names.len()];
319    let sample_size = string_rows.len().min(100);
320
321    for row in string_rows.iter().take(sample_size) {
322        for (col_idx, value) in row.iter().enumerate() {
323            if !value.is_empty() {
324                let inferred = DataType::infer_from_string(value);
325                column_types[col_idx] = column_types[col_idx].merge(&inferred);
326            }
327        }
328    }
329
330    for (col_idx, column) in table.columns.iter_mut().enumerate() {
331        column.data_type = column_types[col_idx].clone();
332    }
333
334    for string_row in string_rows {
335        let mut values = Vec::new();
336        for (col_idx, value) in string_row.iter().enumerate() {
337            let data_value = DataValue::from_string(value, &column_types[col_idx]);
338            values.push(data_value);
339        }
340        table
341            .add_row(DataRow::new(values))
342            .map_err(|e| anyhow::anyhow!(e))?;
343    }
344
345    table.infer_column_types();
346
347    Ok(table)
348}
349
350#[cfg(test)]
351mod tests {
352    use super::*;
353    use std::io::Write;
354    use tempfile::NamedTempFile;
355
356    #[test]
357    fn test_load_csv() -> Result<()> {
358        // Create a temporary CSV file
359        let mut temp_file = NamedTempFile::new()?;
360        writeln!(temp_file, "id,name,price,quantity")?;
361        writeln!(temp_file, "1,Widget,9.99,100")?;
362        writeln!(temp_file, "2,Gadget,19.99,50")?;
363        writeln!(temp_file, "3,Doohickey,5.00,200")?;
364        temp_file.flush()?;
365
366        let table = load_csv_to_datatable(temp_file.path(), "products")?;
367
368        assert_eq!(table.name, "products");
369        assert_eq!(table.column_count(), 4);
370        assert_eq!(table.row_count(), 3);
371
372        // Check column types were inferred correctly
373        assert_eq!(table.columns[0].name, "id");
374        assert_eq!(table.columns[0].data_type, DataType::Integer);
375
376        assert_eq!(table.columns[1].name, "name");
377        assert_eq!(table.columns[1].data_type, DataType::String);
378
379        assert_eq!(table.columns[2].name, "price");
380        assert_eq!(table.columns[2].data_type, DataType::Float);
381
382        assert_eq!(table.columns[3].name, "quantity");
383        assert_eq!(table.columns[3].data_type, DataType::Integer);
384
385        // Check data
386        let value = table.get_value_by_name(0, "name").unwrap();
387        assert_eq!(value.to_string(), "Widget");
388
389        Ok(())
390    }
391
392    #[test]
393    fn test_load_json() -> Result<()> {
394        // Create a temporary JSON file
395        let mut temp_file = NamedTempFile::new()?;
396        writeln!(
397            temp_file,
398            r#"[
399            {{"id": 1, "name": "Alice", "active": true, "score": 95.5}},
400            {{"id": 2, "name": "Bob", "active": false, "score": 87.3}},
401            {{"id": 3, "name": "Charlie", "active": true, "score": null}}
402        ]"#
403        )?;
404        temp_file.flush()?;
405
406        let table = load_json_to_datatable(temp_file.path(), "users")?;
407
408        assert_eq!(table.name, "users");
409        assert_eq!(table.column_count(), 4);
410        assert_eq!(table.row_count(), 3);
411
412        // Check that null handling works
413        let score = table.get_value_by_name(2, "score").unwrap();
414        assert!(score.is_null());
415
416        Ok(())
417    }
418}