sql_cli/data/
datatable_converter.rs

1use crate::datatable::{DataColumn, DataRow, DataTable, DataType, DataValue};
2use anyhow::Result;
3use serde_json::Value;
4use std::collections::HashMap;
5use tracing::{debug, info, trace};
6
7/// Convert JSON values (from CSV or JSON sources) to `DataTable`
8pub struct DataTableConverter;
9
10impl DataTableConverter {
11    /// Convert a vector of JSON values to a `DataTable`
12    pub fn from_json_values(values: &[Value], table_name: impl Into<String>) -> Result<DataTable> {
13        let table_name = table_name.into();
14        info!(target: "datatable", "Converting {} JSON values to DataTable '{}'", 
15            values.len(), table_name);
16
17        if values.is_empty() {
18            return Ok(DataTable::new(table_name));
19        }
20
21        // Extract column names from first row
22        let column_names = Self::extract_column_names(values)?;
23        debug!(target: "datatable", "Found {} columns: {:?}", 
24            column_names.len(), column_names);
25
26        // Infer column types by sampling rows
27        let column_types = Self::infer_column_types(values, &column_names);
28
29        // Log inferred types
30        for (name, dtype) in column_names.iter().zip(column_types.iter()) {
31            debug!(target: "datatable", "Column '{}' inferred as {:?}", name, dtype);
32        }
33
34        // Create DataTable with columns
35        let mut table = DataTable::new(table_name);
36        for (name, data_type) in column_names.iter().zip(column_types.iter()) {
37            table.add_column(
38                DataColumn::new(name.clone())
39                    .with_type(data_type.clone())
40                    .with_nullable(true), // For now, all columns are nullable
41            );
42        }
43
44        // Convert rows
45        for json_row in values {
46            let data_row = Self::convert_json_row(json_row, &column_names, &column_types)?;
47            table
48                .add_row(data_row)
49                .map_err(|e| anyhow::anyhow!("Failed to add row: {}", e))?;
50        }
51
52        // Update column statistics
53        Self::update_column_stats(&mut table);
54
55        info!(target: "datatable", "DataTable '{}' created: {} rows, {} columns", 
56            table.name, table.row_count(), table.column_count());
57
58        Ok(table)
59    }
60
61    /// Convert from CSV lines (headers + rows of string values)
62    pub fn from_csv_data(
63        headers: Vec<String>,
64        rows: Vec<Vec<String>>,
65        table_name: impl Into<String>,
66    ) -> Result<DataTable> {
67        // First pass: infer column types
68        let column_types = Self::infer_types_from_strings(&headers, &rows);
69
70        // Create DataTable
71        let mut table = DataTable::new(table_name);
72        for (name, data_type) in headers.iter().zip(column_types.iter()) {
73            table.add_column(
74                DataColumn::new(name.clone())
75                    .with_type(data_type.clone())
76                    .with_nullable(true),
77            );
78        }
79
80        // Convert rows
81        for row_values in rows {
82            let mut data_values = Vec::new();
83            for (value, data_type) in row_values.iter().zip(column_types.iter()) {
84                data_values.push(DataValue::from_string(value, data_type));
85            }
86            table
87                .add_row(DataRow::new(data_values))
88                .map_err(|e| anyhow::anyhow!("Failed to add row: {}", e))?;
89        }
90
91        // Update column statistics
92        Self::update_column_stats(&mut table);
93
94        info!(target: "datatable", "DataTable '{}' created: {} rows, {} columns", 
95            table.name, table.row_count(), table.column_count());
96
97        Ok(table)
98    }
99
100    /// Extract column names from JSON values
101    fn extract_column_names(values: &[Value]) -> Result<Vec<String>> {
102        if let Some(first) = values.first() {
103            if let Some(obj) = first.as_object() {
104                Ok(obj.keys().cloned().collect())
105            } else {
106                Err(anyhow::anyhow!("JSON values must be objects"))
107            }
108        } else {
109            Ok(vec![])
110        }
111    }
112
113    /// Infer column types by sampling JSON values
114    fn infer_column_types(values: &[Value], column_names: &[String]) -> Vec<DataType> {
115        let mut column_types = vec![DataType::Null; column_names.len()];
116
117        // Sample up to 100 rows for type inference
118        let sample_size = values.len().min(100);
119        trace!(target: "datatable", "Sampling {} rows for type inference", sample_size);
120
121        for row in values.iter().take(sample_size) {
122            if let Some(obj) = row.as_object() {
123                for (idx, col_name) in column_names.iter().enumerate() {
124                    if let Some(value) = obj.get(col_name) {
125                        let value_type = Self::infer_json_type(value);
126                        column_types[idx] = column_types[idx].merge(&value_type);
127                    }
128                }
129            }
130        }
131
132        // Default any remaining Null types to String
133        for col_type in &mut column_types {
134            if *col_type == DataType::Null {
135                *col_type = DataType::String;
136            }
137        }
138
139        column_types
140    }
141
142    /// Infer `DataType` from a JSON value
143    fn infer_json_type(value: &Value) -> DataType {
144        match value {
145            Value::Null => DataType::Null,
146            Value::Bool(_) => DataType::Boolean,
147            Value::Number(n) => {
148                if n.is_i64() || n.is_u64() {
149                    DataType::Integer
150                } else {
151                    DataType::Float
152                }
153            }
154            Value::String(s) => DataType::infer_from_string(s),
155            Value::Array(_) | Value::Object(_) => DataType::String, // Serialize as JSON string
156        }
157    }
158
159    /// Convert a JSON row to `DataRow`
160    fn convert_json_row(
161        json_row: &Value,
162        column_names: &[String],
163        column_types: &[DataType],
164    ) -> Result<DataRow> {
165        let mut values = Vec::new();
166
167        if let Some(obj) = json_row.as_object() {
168            for (col_name, col_type) in column_names.iter().zip(column_types.iter()) {
169                let value = obj
170                    .get(col_name)
171                    .map_or(DataValue::Null, |v| Self::json_to_datavalue(v, col_type));
172                values.push(value);
173            }
174        } else {
175            return Err(anyhow::anyhow!("JSON row must be an object"));
176        }
177
178        Ok(DataRow::new(values))
179    }
180
181    /// Convert JSON value to `DataValue`
182    fn json_to_datavalue(json_val: &Value, expected_type: &DataType) -> DataValue {
183        match json_val {
184            Value::Null => DataValue::Null,
185            Value::Bool(b) => DataValue::Boolean(*b),
186            Value::Number(n) => match expected_type {
187                DataType::Integer => n.as_i64().map_or(DataValue::Null, DataValue::Integer),
188                DataType::Float => n.as_f64().map_or(DataValue::Null, DataValue::Float),
189                _ => DataValue::String(n.to_string()),
190            },
191            Value::String(s) => DataValue::from_string(s, expected_type),
192            Value::Array(_) | Value::Object(_) => {
193                // Serialize complex types as JSON strings
194                DataValue::String(json_val.to_string())
195            }
196        }
197    }
198
199    /// Infer types from string data (for CSV)
200    fn infer_types_from_strings(headers: &[String], rows: &[Vec<String>]) -> Vec<DataType> {
201        let mut column_types = vec![DataType::Null; headers.len()];
202
203        // Sample up to 100 rows
204        let sample_size = rows.len().min(100);
205
206        for row in rows.iter().take(sample_size) {
207            for (idx, value) in row.iter().enumerate() {
208                if idx < column_types.len() {
209                    let value_type = DataType::infer_from_string(value);
210                    column_types[idx] = column_types[idx].merge(&value_type);
211                }
212            }
213        }
214
215        // Default any remaining Null types to String
216        for col_type in &mut column_types {
217            if *col_type == DataType::Null {
218                *col_type = DataType::String;
219            }
220        }
221
222        column_types
223    }
224
225    /// Update column statistics (null counts, unique values, etc.)
226    fn update_column_stats(table: &mut DataTable) {
227        for (col_idx, column) in table.columns.iter_mut().enumerate() {
228            let mut null_count = 0;
229            let mut unique_values = HashMap::new();
230
231            for row in &table.rows {
232                if let Some(value) = row.get(col_idx) {
233                    if value.is_null() {
234                        null_count += 1;
235                    } else {
236                        unique_values.insert(value.to_string(), ());
237                    }
238                }
239            }
240
241            column.null_count = null_count;
242            column.unique_values = Some(unique_values.len());
243        }
244    }
245
246    /// Debug print a `DataTable` (for testing)
247    pub fn debug_print(table: &DataTable) {
248        println!("DataTable: {}", table.name);
249        println!("Columns: {}", table.column_count());
250        println!("Rows: {}", table.row_count());
251
252        // Print column info
253        for col in &table.columns {
254            println!(
255                "  - {} ({:?}, nulls: {}, unique: {:?})",
256                col.name, col.data_type, col.null_count, col.unique_values
257            );
258        }
259
260        // Print first 5 rows
261        println!("\nFirst 5 rows:");
262        for (idx, row) in table.rows.iter().take(5).enumerate() {
263            print!("  Row {idx}: ");
264            for value in &row.values {
265                print!("{value}, ");
266            }
267            println!();
268        }
269    }
270}