// sql_cli/data/datatable_loaders.rs

1use crate::datatable::{DataColumn, DataRow, DataTable, DataType, DataValue};
2use anyhow::{Context, Result};
3use csv::ReaderBuilder;
4use serde_json::Value as JsonValue;
5use std::collections::HashSet;
6use std::fs::File;
7use std::io::Read;
8use std::path::Path;
9
10/// Load a CSV file into a DataTable
11pub fn load_csv_to_datatable<P: AsRef<Path>>(path: P, table_name: &str) -> Result<DataTable> {
12    let file = File::open(&path)
13        .with_context(|| format!("Failed to open CSV file: {:?}", path.as_ref()))?;
14
15    let mut reader = ReaderBuilder::new().has_headers(true).from_reader(file);
16
17    // Get headers and create columns
18    let headers = reader.headers()?.clone();
19    let mut table = DataTable::new(table_name);
20
21    // Add metadata about the source
22    table
23        .metadata
24        .insert("source_type".to_string(), "csv".to_string());
25    table.metadata.insert(
26        "source_path".to_string(),
27        path.as_ref().display().to_string(),
28    );
29
30    // Create columns from headers (types will be inferred later)
31    for header in headers.iter() {
32        table.add_column(DataColumn::new(header));
33    }
34
35    // Read all rows first to collect data
36    let mut string_rows = Vec::new();
37    for result in reader.records() {
38        let record = result?;
39        let row: Vec<String> = record.iter().map(|s| s.to_string()).collect();
40        string_rows.push(row);
41    }
42
43    // Infer column types by sampling the data
44    let mut column_types = vec![DataType::Null; headers.len()];
45    let sample_size = string_rows.len().min(100); // Sample first 100 rows for type inference
46
47    for row in string_rows.iter().take(sample_size) {
48        for (col_idx, value) in row.iter().enumerate() {
49            if !value.is_empty() {
50                let inferred = DataType::infer_from_string(value);
51                column_types[col_idx] = column_types[col_idx].merge(&inferred);
52            }
53        }
54    }
55
56    // Update column types
57    for (col_idx, column) in table.columns.iter_mut().enumerate() {
58        column.data_type = column_types[col_idx].clone();
59    }
60
61    // Convert string data to typed DataValues and add rows
62    for string_row in string_rows {
63        let mut values = Vec::new();
64        for (col_idx, value) in string_row.iter().enumerate() {
65            let data_value = DataValue::from_string(value, &column_types[col_idx]);
66            values.push(data_value);
67        }
68        table
69            .add_row(DataRow::new(values))
70            .map_err(|e| anyhow::anyhow!(e))?;
71    }
72
73    // Update column statistics
74    table.infer_column_types();
75
76    Ok(table)
77}
78
79/// Load a JSON file into a DataTable
80pub fn load_json_to_datatable<P: AsRef<Path>>(path: P, table_name: &str) -> Result<DataTable> {
81    // Read file as string first to preserve key order
82    let mut file = File::open(&path)
83        .with_context(|| format!("Failed to open JSON file: {:?}", path.as_ref()))?;
84    let mut json_str = String::new();
85    file.read_to_string(&mut json_str)?;
86
87    // Parse JSON while preserving order using serde_json's preserve_order feature
88    // For now, we'll use a workaround to get keys in original order
89    let json_data: Vec<JsonValue> =
90        serde_json::from_str(&json_str).with_context(|| "Failed to parse JSON file")?;
91
92    if json_data.is_empty() {
93        return Ok(DataTable::new(table_name));
94    }
95
96    // Extract column names from first object, preserving order
97    // Parse the first object manually to get keys in order
98    let first_obj = json_data[0]
99        .as_object()
100        .context("JSON data must be an array of objects")?;
101
102    let mut table = DataTable::new(table_name);
103
104    // Add metadata
105    table
106        .metadata
107        .insert("source_type".to_string(), "json".to_string());
108    table.metadata.insert(
109        "source_path".to_string(),
110        path.as_ref().display().to_string(),
111    );
112
113    // Get all column names from the first object
114    // This preserves all columns from the JSON data
115    let column_names: Vec<String> = first_obj.keys().cloned().collect();
116    for name in &column_names {
117        table.add_column(DataColumn::new(name));
118    }
119
120    // Collect all values as strings first for type inference
121    let mut string_rows = Vec::new();
122    for json_obj in &json_data {
123        if let Some(obj) = json_obj.as_object() {
124            let mut row = Vec::new();
125            for name in &column_names {
126                let value_str = match obj.get(name) {
127                    Some(JsonValue::Null) | None => String::new(),
128                    Some(JsonValue::Bool(b)) => b.to_string(),
129                    Some(JsonValue::Number(n)) => n.to_string(),
130                    Some(JsonValue::String(s)) => s.clone(),
131                    Some(JsonValue::Array(arr)) => format!("{:?}", arr), // Arrays as debug string for now
132                    Some(JsonValue::Object(obj)) => format!("{:?}", obj), // Objects as debug string for now
133                };
134                row.push(value_str);
135            }
136            string_rows.push(row);
137        }
138    }
139
140    // Infer column types
141    let mut column_types = vec![DataType::Null; column_names.len()];
142    let sample_size = string_rows.len().min(100);
143
144    for row in string_rows.iter().take(sample_size) {
145        for (col_idx, value) in row.iter().enumerate() {
146            if !value.is_empty() {
147                let inferred = DataType::infer_from_string(value);
148                column_types[col_idx] = column_types[col_idx].merge(&inferred);
149            }
150        }
151    }
152
153    // Update column types
154    for (col_idx, column) in table.columns.iter_mut().enumerate() {
155        column.data_type = column_types[col_idx].clone();
156    }
157
158    // Convert to DataRows
159    for string_row in string_rows {
160        let mut values = Vec::new();
161        for (col_idx, value) in string_row.iter().enumerate() {
162            let data_value = DataValue::from_string(value, &column_types[col_idx]);
163            values.push(data_value);
164        }
165        table
166            .add_row(DataRow::new(values))
167            .map_err(|e| anyhow::anyhow!(e))?;
168    }
169
170    // Update statistics
171    table.infer_column_types();
172
173    Ok(table)
174}
175
176/// Load JSON data directly (already parsed) into a DataTable
177pub fn load_json_data_to_datatable(data: Vec<JsonValue>, table_name: &str) -> Result<DataTable> {
178    if data.is_empty() {
179        return Ok(DataTable::new(table_name));
180    }
181
182    // Extract column names from all objects (union of all keys)
183    let mut all_columns = HashSet::new();
184    for item in &data {
185        if let Some(obj) = item.as_object() {
186            for key in obj.keys() {
187                all_columns.insert(key.clone());
188            }
189        }
190    }
191
192    let column_names: Vec<String> = all_columns.into_iter().collect();
193    let mut table = DataTable::new(table_name);
194
195    // Add metadata
196    table
197        .metadata
198        .insert("source_type".to_string(), "json_data".to_string());
199
200    // Create columns
201    for name in &column_names {
202        table.add_column(DataColumn::new(name));
203    }
204
205    // Process data similar to file loading
206    let mut string_rows = Vec::new();
207    for json_obj in &data {
208        if let Some(obj) = json_obj.as_object() {
209            let mut row = Vec::new();
210            for name in &column_names {
211                let value_str = match obj.get(name) {
212                    Some(JsonValue::Null) | None => String::new(),
213                    Some(JsonValue::Bool(b)) => b.to_string(),
214                    Some(JsonValue::Number(n)) => n.to_string(),
215                    Some(JsonValue::String(s)) => s.clone(),
216                    Some(JsonValue::Array(arr)) => format!("{:?}", arr),
217                    Some(JsonValue::Object(obj)) => format!("{:?}", obj),
218                };
219                row.push(value_str);
220            }
221            string_rows.push(row);
222        }
223    }
224
225    // Infer types and convert to DataRows (same as above)
226    let mut column_types = vec![DataType::Null; column_names.len()];
227    let sample_size = string_rows.len().min(100);
228
229    for row in string_rows.iter().take(sample_size) {
230        for (col_idx, value) in row.iter().enumerate() {
231            if !value.is_empty() {
232                let inferred = DataType::infer_from_string(value);
233                column_types[col_idx] = column_types[col_idx].merge(&inferred);
234            }
235        }
236    }
237
238    for (col_idx, column) in table.columns.iter_mut().enumerate() {
239        column.data_type = column_types[col_idx].clone();
240    }
241
242    for string_row in string_rows {
243        let mut values = Vec::new();
244        for (col_idx, value) in string_row.iter().enumerate() {
245            let data_value = DataValue::from_string(value, &column_types[col_idx]);
246            values.push(data_value);
247        }
248        table
249            .add_row(DataRow::new(values))
250            .map_err(|e| anyhow::anyhow!(e))?;
251    }
252
253    table.infer_column_types();
254
255    Ok(table)
256}
257
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    #[test]
    fn test_load_csv() -> Result<()> {
        // Write a small CSV fixture to a temporary file.
        let mut csv_file = NamedTempFile::new()?;
        for line in [
            "id,name,price,quantity",
            "1,Widget,9.99,100",
            "2,Gadget,19.99,50",
            "3,Doohickey,5.00,200",
        ]
        .iter()
        {
            writeln!(csv_file, "{}", line)?;
        }
        csv_file.flush()?;

        let table = load_csv_to_datatable(csv_file.path(), "products")?;

        // Table shape.
        assert_eq!(table.name, "products");
        assert_eq!(table.column_count(), 4);
        assert_eq!(table.row_count(), 3);

        // Column names and inferred types, checked pairwise.
        let expected = [
            ("id", DataType::Integer),
            ("name", DataType::String),
            ("price", DataType::Float),
            ("quantity", DataType::Integer),
        ];
        for (column, (name, data_type)) in table.columns.iter().zip(expected.iter()) {
            assert_eq!(&column.name, name);
            assert_eq!(&column.data_type, data_type);
        }

        // Spot-check a cell value.
        assert_eq!(
            table.get_value_by_name(0, "name").unwrap().to_string(),
            "Widget"
        );

        Ok(())
    }

    #[test]
    fn test_load_json() -> Result<()> {
        // Write a small JSON fixture to a temporary file.
        let mut json_file = NamedTempFile::new()?;
        writeln!(
            json_file,
            r#"[
            {{"id": 1, "name": "Alice", "active": true, "score": 95.5}},
            {{"id": 2, "name": "Bob", "active": false, "score": 87.3}},
            {{"id": 3, "name": "Charlie", "active": true, "score": null}}
        ]"#
        )?;
        json_file.flush()?;

        let table = load_json_to_datatable(json_file.path(), "users")?;

        // Table shape.
        assert_eq!(table.name, "users");
        assert_eq!(table.column_count(), 4);
        assert_eq!(table.row_count(), 3);

        // A JSON null must come through as a null DataValue.
        assert!(table.get_value_by_name(2, "score").unwrap().is_null());

        Ok(())
    }
}