sql_cli/data/datatable_loaders.rs

use crate::datatable::{DataColumn, DataRow, DataTable, DataType, DataValue};
use anyhow::{Context, Result};
use csv::ReaderBuilder;
use serde_json::Value as JsonValue;
use std::collections::HashSet;
use std::fs::File;
use std::io::Read;
use std::path::Path;

/// Load a CSV file into a `DataTable`
pub fn load_csv_to_datatable<P: AsRef<Path>>(path: P, table_name: &str) -> Result<DataTable> {
    let file = File::open(&path)
        .with_context(|| format!("Failed to open CSV file: {:?}", path.as_ref()))?;

    let mut reader = ReaderBuilder::new().has_headers(true).from_reader(file);

    // Get headers and create columns
    let headers = reader.headers()?.clone();
    let mut table = DataTable::new(table_name);

    // Add metadata about the source
    table
        .metadata
        .insert("source_type".to_string(), "csv".to_string());
    table.metadata.insert(
        "source_path".to_string(),
        path.as_ref().display().to_string(),
    );

    // Create columns from headers (types will be inferred later)
    for header in &headers {
        table.add_column(DataColumn::new(header));
    }

    // Read all rows first to collect data
    let mut string_rows = Vec::new();
    for result in reader.records() {
        let record = result?;
        let row: Vec<String> = record
            .iter()
            .map(std::string::ToString::to_string)
            .collect();
        string_rows.push(row);
    }

    // Infer column types by sampling the data
    let mut column_types = vec![DataType::Null; headers.len()];
    let sample_size = string_rows.len().min(100); // Sample first 100 rows for type inference

    for row in string_rows.iter().take(sample_size) {
        for (col_idx, value) in row.iter().enumerate() {
            if !value.is_empty() {
                let inferred = DataType::infer_from_string(value);
                column_types[col_idx] = column_types[col_idx].merge(&inferred);
            }
        }
    }

    // Update column types
    for (col_idx, column) in table.columns.iter_mut().enumerate() {
        column.data_type = column_types[col_idx].clone();
    }

    // Convert string data to typed DataValues and add rows
    for string_row in string_rows {
        let mut values = Vec::new();
        for (col_idx, value) in string_row.iter().enumerate() {
            let data_value = DataValue::from_string(value, &column_types[col_idx]);
            values.push(data_value);
        }
        table
            .add_row(DataRow::new(values))
            .map_err(|e| anyhow::anyhow!(e))?;
    }

    // Update column statistics
    table.infer_column_types();

    Ok(table)
}

/// Load a JSON file into a `DataTable`
pub fn load_json_to_datatable<P: AsRef<Path>>(path: P, table_name: &str) -> Result<DataTable> {
    // Read file as string first to preserve key order
    let mut file = File::open(&path)
        .with_context(|| format!("Failed to open JSON file: {:?}", path.as_ref()))?;
    let mut json_str = String::new();
    file.read_to_string(&mut json_str)?;

    // Parse the JSON array; object key order is only preserved when serde_json's
    // `preserve_order` feature is enabled
    let json_data: Vec<JsonValue> =
        serde_json::from_str(&json_str).with_context(|| "Failed to parse JSON file")?;

    if json_data.is_empty() {
        return Ok(DataTable::new(table_name));
    }

    // Column names are taken from the first object's keys, in their iteration order
    let first_obj = json_data[0]
        .as_object()
        .context("JSON data must be an array of objects")?;

    let mut table = DataTable::new(table_name);

    // Add metadata
    table
        .metadata
        .insert("source_type".to_string(), "json".to_string());
    table.metadata.insert(
        "source_path".to_string(),
        path.as_ref().display().to_string(),
    );

    // Use the first object's keys as the column set; keys that only appear in
    // later objects are not added as columns
    let column_names: Vec<String> = first_obj.keys().cloned().collect();
    for name in &column_names {
        table.add_column(DataColumn::new(name));
    }

    // Collect all values as strings first for type inference
    let mut string_rows = Vec::new();
    for json_obj in &json_data {
        if let Some(obj) = json_obj.as_object() {
            let mut row = Vec::new();
            for name in &column_names {
                let value_str = match obj.get(name) {
                    Some(JsonValue::Null) | None => String::new(),
                    Some(JsonValue::Bool(b)) => b.to_string(),
                    Some(JsonValue::Number(n)) => n.to_string(),
                    Some(JsonValue::String(s)) => s.clone(),
                    Some(JsonValue::Array(arr)) => format!("{arr:?}"), // Arrays as debug string for now
                    Some(JsonValue::Object(obj)) => format!("{obj:?}"), // Objects as debug string for now
                };
                row.push(value_str);
            }
            string_rows.push(row);
        }
    }

    // Infer column types
    let mut column_types = vec![DataType::Null; column_names.len()];
    let sample_size = string_rows.len().min(100);

    for row in string_rows.iter().take(sample_size) {
        for (col_idx, value) in row.iter().enumerate() {
            if !value.is_empty() {
                let inferred = DataType::infer_from_string(value);
                column_types[col_idx] = column_types[col_idx].merge(&inferred);
            }
        }
    }

    // Update column types
    for (col_idx, column) in table.columns.iter_mut().enumerate() {
        column.data_type = column_types[col_idx].clone();
    }

    // Convert to DataRows
    for string_row in string_rows {
        let mut values = Vec::new();
        for (col_idx, value) in string_row.iter().enumerate() {
            let data_value = DataValue::from_string(value, &column_types[col_idx]);
            values.push(data_value);
        }
        table
            .add_row(DataRow::new(values))
            .map_err(|e| anyhow::anyhow!(e))?;
    }

    // Update statistics
    table.infer_column_types();

    Ok(table)
}

/// Load JSON data directly (already parsed) into a `DataTable`
pub fn load_json_data_to_datatable(data: Vec<JsonValue>, table_name: &str) -> Result<DataTable> {
    if data.is_empty() {
        return Ok(DataTable::new(table_name));
    }

    // Extract column names from all objects (union of all keys)
    let mut all_columns = HashSet::new();
    for item in &data {
        if let Some(obj) = item.as_object() {
            for key in obj.keys() {
                all_columns.insert(key.clone());
            }
        }
    }

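    // Note: `HashSet` iteration order is unspecified, so the resulting column
    // order is not deterministic across runs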
    let column_names: Vec<String> = all_columns.into_iter().collect();
    let mut table = DataTable::new(table_name);

    // Add metadata
    table
        .metadata
        .insert("source_type".to_string(), "json_data".to_string());

    // Create columns
    for name in &column_names {
        table.add_column(DataColumn::new(name));
    }

    // Process data similar to file loading
    let mut string_rows = Vec::new();
    for json_obj in &data {
        if let Some(obj) = json_obj.as_object() {
            let mut row = Vec::new();
            for name in &column_names {
                let value_str = match obj.get(name) {
                    Some(JsonValue::Null) | None => String::new(),
                    Some(JsonValue::Bool(b)) => b.to_string(),
                    Some(JsonValue::Number(n)) => n.to_string(),
                    Some(JsonValue::String(s)) => s.clone(),
                    Some(JsonValue::Array(arr)) => format!("{arr:?}"),
                    Some(JsonValue::Object(obj)) => format!("{obj:?}"),
                };
                row.push(value_str);
            }
            string_rows.push(row);
        }
    }

    // Infer types and convert to DataRows (same as above)
    let mut column_types = vec![DataType::Null; column_names.len()];
    let sample_size = string_rows.len().min(100);

    for row in string_rows.iter().take(sample_size) {
        for (col_idx, value) in row.iter().enumerate() {
            if !value.is_empty() {
                let inferred = DataType::infer_from_string(value);
                column_types[col_idx] = column_types[col_idx].merge(&inferred);
            }
        }
    }

    for (col_idx, column) in table.columns.iter_mut().enumerate() {
        column.data_type = column_types[col_idx].clone();
    }

    for string_row in string_rows {
        let mut values = Vec::new();
        for (col_idx, value) in string_row.iter().enumerate() {
            let data_value = DataValue::from_string(value, &column_types[col_idx]);
            values.push(data_value);
        }
        table
            .add_row(DataRow::new(values))
            .map_err(|e| anyhow::anyhow!(e))?;
    }

    table.infer_column_types();

    Ok(table)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    #[test]
    fn test_load_csv() -> Result<()> {
        // Create a temporary CSV file
        let mut temp_file = NamedTempFile::new()?;
        writeln!(temp_file, "id,name,price,quantity")?;
        writeln!(temp_file, "1,Widget,9.99,100")?;
        writeln!(temp_file, "2,Gadget,19.99,50")?;
        writeln!(temp_file, "3,Doohickey,5.00,200")?;
        temp_file.flush()?;

        let table = load_csv_to_datatable(temp_file.path(), "products")?;

        assert_eq!(table.name, "products");
        assert_eq!(table.column_count(), 4);
        assert_eq!(table.row_count(), 3);

        // Check column types were inferred correctly
        assert_eq!(table.columns[0].name, "id");
        assert_eq!(table.columns[0].data_type, DataType::Integer);

        assert_eq!(table.columns[1].name, "name");
        assert_eq!(table.columns[1].data_type, DataType::String);

        assert_eq!(table.columns[2].name, "price");
        assert_eq!(table.columns[2].data_type, DataType::Float);

        assert_eq!(table.columns[3].name, "quantity");
        assert_eq!(table.columns[3].data_type, DataType::Integer);

        // Check data
        let value = table.get_value_by_name(0, "name").unwrap();
        assert_eq!(value.to_string(), "Widget");

        Ok(())
    }

    #[test]
    fn test_load_json() -> Result<()> {
        // Create a temporary JSON file
        let mut temp_file = NamedTempFile::new()?;
        writeln!(
            temp_file,
            r#"[
            {{"id": 1, "name": "Alice", "active": true, "score": 95.5}},
            {{"id": 2, "name": "Bob", "active": false, "score": 87.3}},
            {{"id": 3, "name": "Charlie", "active": true, "score": null}}
        ]"#
        )?;
        temp_file.flush()?;

        let table = load_json_to_datatable(temp_file.path(), "users")?;

        assert_eq!(table.name, "users");
        assert_eq!(table.column_count(), 4);
        assert_eq!(table.row_count(), 3);

        // Check that null handling works
        let score = table.get_value_by_name(2, "score").unwrap();
        assert!(score.is_null());

        Ok(())
    }
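
    // An additional sketch of a test for `load_json_data_to_datatable`, which has no
    // coverage above. It uses only accessors already exercised in the other tests
    // (name, column_count, row_count, get_value_by_name); since this loader collects
    // columns through a HashSet, no assertions are made about column order.
    #[test]
    fn test_load_json_data() -> Result<()> {
        // Already-parsed JSON values, so no temp file is needed
        let data = vec![
            serde_json::json!({"id": 1, "name": "Alice", "score": 95.5}),
            serde_json::json!({"id": 2, "name": "Bob", "score": 87.3}),
        ];

        let table = load_json_data_to_datatable(data, "users")?;

        assert_eq!(table.name, "users");
        assert_eq!(table.column_count(), 3);
        assert_eq!(table.row_count(), 2);

        // Rows keep their input order, so the first row should be Alice
        let value = table.get_value_by_name(0, "name").unwrap();
        assert_eq!(value.to_string(), "Alice");

        Ok(())
    }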
}