sql_cli/data/
datatable_converter.rs

1use crate::datatable::{DataColumn, DataRow, DataTable, DataType, DataValue};
2use anyhow::Result;
3use serde_json::Value;
4use std::collections::HashMap;
5use tracing::{debug, info, trace};
6
7/// Convert JSON values (from CSV or JSON sources) to DataTable
8pub struct DataTableConverter;
9
10impl DataTableConverter {
11    /// Convert a vector of JSON values to a DataTable
12    pub fn from_json_values(values: &[Value], table_name: impl Into<String>) -> Result<DataTable> {
13        let table_name = table_name.into();
14        info!(target: "datatable", "Converting {} JSON values to DataTable '{}'", 
15            values.len(), table_name);
16
17        if values.is_empty() {
18            return Ok(DataTable::new(table_name));
19        }
20
21        // Extract column names from first row
22        let column_names = Self::extract_column_names(values)?;
23        debug!(target: "datatable", "Found {} columns: {:?}", 
24            column_names.len(), column_names);
25
26        // Infer column types by sampling rows
27        let column_types = Self::infer_column_types(values, &column_names);
28
29        // Log inferred types
30        for (name, dtype) in column_names.iter().zip(column_types.iter()) {
31            debug!(target: "datatable", "Column '{}' inferred as {:?}", name, dtype);
32        }
33
34        // Create DataTable with columns
35        let mut table = DataTable::new(table_name);
36        for (name, data_type) in column_names.iter().zip(column_types.iter()) {
37            table.add_column(
38                DataColumn::new(name.clone())
39                    .with_type(data_type.clone())
40                    .with_nullable(true), // For now, all columns are nullable
41            );
42        }
43
44        // Convert rows
45        for json_row in values {
46            let data_row = Self::convert_json_row(json_row, &column_names, &column_types)?;
47            table
48                .add_row(data_row)
49                .map_err(|e| anyhow::anyhow!("Failed to add row: {}", e))?;
50        }
51
52        // Update column statistics
53        Self::update_column_stats(&mut table);
54
55        info!(target: "datatable", "DataTable '{}' created: {} rows, {} columns", 
56            table.name, table.row_count(), table.column_count());
57
58        Ok(table)
59    }
60
61    /// Convert from CSV lines (headers + rows of string values)
62    pub fn from_csv_data(
63        headers: Vec<String>,
64        rows: Vec<Vec<String>>,
65        table_name: impl Into<String>,
66    ) -> Result<DataTable> {
67        // First pass: infer column types
68        let column_types = Self::infer_types_from_strings(&headers, &rows);
69
70        // Create DataTable
71        let mut table = DataTable::new(table_name);
72        for (name, data_type) in headers.iter().zip(column_types.iter()) {
73            table.add_column(
74                DataColumn::new(name.clone())
75                    .with_type(data_type.clone())
76                    .with_nullable(true),
77            );
78        }
79
80        // Convert rows
81        for row_values in rows {
82            let mut data_values = Vec::new();
83            for (value, data_type) in row_values.iter().zip(column_types.iter()) {
84                data_values.push(DataValue::from_string(value, data_type));
85            }
86            table
87                .add_row(DataRow::new(data_values))
88                .map_err(|e| anyhow::anyhow!("Failed to add row: {}", e))?;
89        }
90
91        // Update column statistics
92        Self::update_column_stats(&mut table);
93
94        info!(target: "datatable", "DataTable '{}' created: {} rows, {} columns", 
95            table.name, table.row_count(), table.column_count());
96
97        Ok(table)
98    }
99
100    /// Extract column names from JSON values
101    fn extract_column_names(values: &[Value]) -> Result<Vec<String>> {
102        if let Some(first) = values.first() {
103            if let Some(obj) = first.as_object() {
104                Ok(obj.keys().cloned().collect())
105            } else {
106                Err(anyhow::anyhow!("JSON values must be objects"))
107            }
108        } else {
109            Ok(vec![])
110        }
111    }
112
113    /// Infer column types by sampling JSON values
114    fn infer_column_types(values: &[Value], column_names: &[String]) -> Vec<DataType> {
115        let mut column_types = vec![DataType::Null; column_names.len()];
116
117        // Sample up to 100 rows for type inference
118        let sample_size = values.len().min(100);
119        trace!(target: "datatable", "Sampling {} rows for type inference", sample_size);
120
121        for row in values.iter().take(sample_size) {
122            if let Some(obj) = row.as_object() {
123                for (idx, col_name) in column_names.iter().enumerate() {
124                    if let Some(value) = obj.get(col_name) {
125                        let value_type = Self::infer_json_type(value);
126                        column_types[idx] = column_types[idx].merge(&value_type);
127                    }
128                }
129            }
130        }
131
132        // Default any remaining Null types to String
133        for col_type in &mut column_types {
134            if *col_type == DataType::Null {
135                *col_type = DataType::String;
136            }
137        }
138
139        column_types
140    }
141
142    /// Infer DataType from a JSON value
143    fn infer_json_type(value: &Value) -> DataType {
144        match value {
145            Value::Null => DataType::Null,
146            Value::Bool(_) => DataType::Boolean,
147            Value::Number(n) => {
148                if n.is_i64() || n.is_u64() {
149                    DataType::Integer
150                } else {
151                    DataType::Float
152                }
153            }
154            Value::String(s) => DataType::infer_from_string(s),
155            Value::Array(_) | Value::Object(_) => DataType::String, // Serialize as JSON string
156        }
157    }
158
159    /// Convert a JSON row to DataRow
160    fn convert_json_row(
161        json_row: &Value,
162        column_names: &[String],
163        column_types: &[DataType],
164    ) -> Result<DataRow> {
165        let mut values = Vec::new();
166
167        if let Some(obj) = json_row.as_object() {
168            for (col_name, col_type) in column_names.iter().zip(column_types.iter()) {
169                let value = obj
170                    .get(col_name)
171                    .map(|v| Self::json_to_datavalue(v, col_type))
172                    .unwrap_or(DataValue::Null);
173                values.push(value);
174            }
175        } else {
176            return Err(anyhow::anyhow!("JSON row must be an object"));
177        }
178
179        Ok(DataRow::new(values))
180    }
181
182    /// Convert JSON value to DataValue
183    fn json_to_datavalue(json_val: &Value, expected_type: &DataType) -> DataValue {
184        match json_val {
185            Value::Null => DataValue::Null,
186            Value::Bool(b) => DataValue::Boolean(*b),
187            Value::Number(n) => match expected_type {
188                DataType::Integer => n
189                    .as_i64()
190                    .map(DataValue::Integer)
191                    .unwrap_or(DataValue::Null),
192                DataType::Float => n.as_f64().map(DataValue::Float).unwrap_or(DataValue::Null),
193                _ => DataValue::String(n.to_string()),
194            },
195            Value::String(s) => DataValue::from_string(s, expected_type),
196            Value::Array(_) | Value::Object(_) => {
197                // Serialize complex types as JSON strings
198                DataValue::String(json_val.to_string())
199            }
200        }
201    }
202
203    /// Infer types from string data (for CSV)
204    fn infer_types_from_strings(headers: &[String], rows: &[Vec<String>]) -> Vec<DataType> {
205        let mut column_types = vec![DataType::Null; headers.len()];
206
207        // Sample up to 100 rows
208        let sample_size = rows.len().min(100);
209
210        for row in rows.iter().take(sample_size) {
211            for (idx, value) in row.iter().enumerate() {
212                if idx < column_types.len() {
213                    let value_type = DataType::infer_from_string(value);
214                    column_types[idx] = column_types[idx].merge(&value_type);
215                }
216            }
217        }
218
219        // Default any remaining Null types to String
220        for col_type in &mut column_types {
221            if *col_type == DataType::Null {
222                *col_type = DataType::String;
223            }
224        }
225
226        column_types
227    }
228
229    /// Update column statistics (null counts, unique values, etc.)
230    fn update_column_stats(table: &mut DataTable) {
231        for (col_idx, column) in table.columns.iter_mut().enumerate() {
232            let mut null_count = 0;
233            let mut unique_values = HashMap::new();
234
235            for row in &table.rows {
236                if let Some(value) = row.get(col_idx) {
237                    if value.is_null() {
238                        null_count += 1;
239                    } else {
240                        unique_values.insert(value.to_string(), ());
241                    }
242                }
243            }
244
245            column.null_count = null_count;
246            column.unique_values = Some(unique_values.len());
247        }
248    }
249
250    /// Debug print a DataTable (for testing)
251    pub fn debug_print(table: &DataTable) {
252        println!("DataTable: {}", table.name);
253        println!("Columns: {}", table.column_count());
254        println!("Rows: {}", table.row_count());
255
256        // Print column info
257        for col in &table.columns {
258            println!(
259                "  - {} ({:?}, nulls: {}, unique: {:?})",
260                col.name, col.data_type, col.null_count, col.unique_values
261            );
262        }
263
264        // Print first 5 rows
265        println!("\nFirst 5 rows:");
266        for (idx, row) in table.rows.iter().take(5).enumerate() {
267            print!("  Row {}: ", idx);
268            for value in &row.values {
269                print!("{}, ", value);
270            }
271            println!();
272        }
273    }
274}