Skip to main content

nexcore_dataframe/io/
json.rs

1//! JSON I/O for DataFrames.
2//!
3//! Row-oriented JSON: each row is a JSON object with column names as keys.
4//! This matches the format used by polars `JsonWriter` / `JsonReader`.
5
6use std::io::{Read, Write};
7
8use serde_json::Value;
9
10use crate::column::Column;
11use crate::dataframe::DataFrame;
12use crate::error::DataFrameError;
13use crate::scalar::Scalar;
14
15impl DataFrame {
16    /// Serialize the DataFrame to a JSON string (array of row objects).
17    pub fn to_json(&self) -> Result<String, DataFrameError> {
18        let rows = self.to_json_rows();
19        let val = Value::Array(rows);
20        serde_json::to_string_pretty(&val).map_err(DataFrameError::from)
21    }
22
23    /// Serialize the DataFrame to a writer.
24    pub fn to_json_writer<W: Write>(&self, writer: W) -> Result<(), DataFrameError> {
25        let rows = self.to_json_rows();
26        let val = Value::Array(rows);
27        serde_json::to_writer_pretty(writer, &val).map_err(DataFrameError::from)
28    }
29
30    /// Write to a JSON file at the given path.
31    pub fn to_json_file(&self, path: &std::path::Path) -> Result<(), DataFrameError> {
32        let file = std::fs::File::create(path)?;
33        let writer = std::io::BufWriter::new(file);
34        self.to_json_writer(writer)
35    }
36
37    /// Deserialize a DataFrame from a JSON string (array of row objects).
38    pub fn from_json(json: &str) -> Result<Self, DataFrameError> {
39        let val: Value = serde_json::from_str(json)?;
40        Self::from_json_value(&val)
41    }
42
43    /// Deserialize a DataFrame from a reader.
44    pub fn from_json_reader<R: Read>(reader: R) -> Result<Self, DataFrameError> {
45        let val: Value = serde_json::from_reader(reader)?;
46        Self::from_json_value(&val)
47    }
48
49    /// Convert DataFrame rows to JSON Value objects.
50    fn to_json_rows(&self) -> Vec<Value> {
51        let mut rows = Vec::with_capacity(self.height());
52        let names = self.column_names();
53
54        for i in 0..self.height() {
55            let mut map = serde_json::Map::new();
56            for (col_idx, name) in names.iter().enumerate() {
57                let val = self.columns().get(col_idx).and_then(|c| c.get(i));
58                map.insert((*name).to_string(), scalar_to_json(val));
59            }
60            rows.push(Value::Object(map));
61        }
62        rows
63    }
64
65    /// Parse a JSON Value (expected array of objects) into a DataFrame.
66    fn from_json_value(val: &Value) -> Result<Self, DataFrameError> {
67        let arr = match val {
68            Value::Array(a) => a,
69            Value::Null
70            | Value::Bool(_)
71            | Value::Number(_)
72            | Value::String(_)
73            | Value::Object(_) => {
74                return Err(DataFrameError::Other(
75                    "expected JSON array of objects".to_string(),
76                ));
77            }
78        };
79
80        if arr.is_empty() {
81            return Ok(Self::empty());
82        }
83
84        // Discover column names from the first object; arr is non-empty (checked above)
85        #[allow(
86            clippy::indexing_slicing,
87            reason = "arr is non-empty (is_empty() guard above); index 0 is always valid"
88        )]
89        let first = match &arr[0] {
90            Value::Object(m) => m,
91            Value::Null
92            | Value::Bool(_)
93            | Value::Number(_)
94            | Value::String(_)
95            | Value::Array(_) => {
96                return Err(DataFrameError::Other(
97                    "expected JSON object as array element".to_string(),
98                ));
99            }
100        };
101
102        let col_names: Vec<String> = first.keys().cloned().collect();
103        let n_rows = arr.len();
104
105        // Collect raw values per column
106        let mut raw_cols: Vec<Vec<Option<&Value>>> = col_names
107            .iter()
108            .map(|_| Vec::with_capacity(n_rows))
109            .collect();
110
111        for row_val in arr {
112            let obj = match row_val {
113                Value::Object(m) => m,
114                Value::Null
115                | Value::Bool(_)
116                | Value::Number(_)
117                | Value::String(_)
118                | Value::Array(_) => {
119                    return Err(DataFrameError::Other(
120                        "expected JSON object as array element".to_string(),
121                    ));
122                }
123            };
124            for (col_idx, name) in col_names.iter().enumerate() {
125                // col_idx < col_names.len() == raw_cols.len(); always valid
126                #[allow(
127                    clippy::indexing_slicing,
128                    reason = "col_idx iterates 0..col_names.len() which equals raw_cols.len(); index is always in bounds"
129                )]
130                raw_cols[col_idx].push(obj.get(name));
131            }
132        }
133
134        // Infer types and build columns
135        let columns: Vec<Column> = col_names
136            .into_iter()
137            .zip(raw_cols)
138            .map(|(name, vals)| infer_column(&name, &vals))
139            .collect();
140
141        DataFrame::new(columns)
142    }
143}
144
145/// Convert a Scalar to a JSON Value.
146fn scalar_to_json(val: Option<Scalar>) -> Value {
147    match val {
148        None | Some(Scalar::Null) => Value::Null,
149        Some(Scalar::Bool(b)) => Value::Bool(b),
150        Some(Scalar::Int64(n)) => Value::Number(n.into()),
151        Some(Scalar::UInt64(n)) => Value::Number(n.into()),
152        Some(Scalar::Float64(f)) => {
153            serde_json::Number::from_f64(f).map_or(Value::Null, Value::Number)
154        }
155        Some(Scalar::String(s)) => Value::String(s),
156    }
157}
158
159/// Infer a Column's type from JSON values.
160fn infer_column(name: &str, vals: &[Option<&Value>]) -> Column {
161    // Find first non-null value to determine type
162    let first_non_null = vals.iter().find_map(|v| match v {
163        Some(Value::Null) | None => None,
164        Some(inner) => Some(*inner),
165    });
166
167    match first_non_null {
168        Some(Value::Bool(_)) => {
169            let data: Vec<Option<bool>> = vals
170                .iter()
171                .map(|v| match v {
172                    Some(Value::Bool(b)) => Some(*b),
173                    Some(
174                        Value::Null
175                        | Value::Number(_)
176                        | Value::String(_)
177                        | Value::Array(_)
178                        | Value::Object(_),
179                    )
180                    | None => None,
181                })
182                .collect();
183            Column::new_bool(name, data)
184        }
185        Some(Value::Number(first_num)) => {
186            // If any value is fractional, use f64
187            let has_float = vals.iter().any(
188                |v| matches!(v, Some(Value::Number(num)) if num.is_f64() && num.as_i64().is_none()),
189            );
190            if has_float {
191                let data: Vec<Option<f64>> = vals
192                    .iter()
193                    .map(|v| match v {
194                        Some(Value::Number(num)) => num.as_f64(),
195                        Some(
196                            Value::Null
197                            | Value::Bool(_)
198                            | Value::String(_)
199                            | Value::Array(_)
200                            | Value::Object(_),
201                        )
202                        | None => None,
203                    })
204                    .collect();
205                Column::new_f64(name, data)
206            } else if first_num.is_u64() && first_num.as_i64().is_none() {
207                // Pure u64 (exceeds i64 range)
208                let data: Vec<Option<u64>> = vals
209                    .iter()
210                    .map(|v| match v {
211                        Some(Value::Number(num)) => num.as_u64(),
212                        Some(
213                            Value::Null
214                            | Value::Bool(_)
215                            | Value::String(_)
216                            | Value::Array(_)
217                            | Value::Object(_),
218                        )
219                        | None => None,
220                    })
221                    .collect();
222                Column::new_u64(name, data)
223            } else {
224                let data: Vec<Option<i64>> = vals
225                    .iter()
226                    .map(|v| match v {
227                        Some(Value::Number(num)) => num.as_i64(),
228                        Some(
229                            Value::Null
230                            | Value::Bool(_)
231                            | Value::String(_)
232                            | Value::Array(_)
233                            | Value::Object(_),
234                        )
235                        | None => None,
236                    })
237                    .collect();
238                Column::new_i64(name, data)
239            }
240        }
241        // String, Array, Object, or no non-null values → fall back to string column
242        Some(Value::String(_) | Value::Array(_) | Value::Object(_) | Value::Null) | None => {
243            let data: Vec<Option<String>> = vals
244                .iter()
245                .map(|v| match v {
246                    Some(Value::String(s)) => Some(s.clone()),
247                    Some(Value::Null) | None => None,
248                    Some(other) => Some(other.to_string()),
249                })
250                .collect();
251            Column::new_string(name, data)
252        }
253    }
254}
255
#[cfg(test)]
mod tests {
    use super::*;

    /// A frame serialized with `to_json` and parsed back keeps its shape.
    #[test]
    fn roundtrip_json_string() {
        let columns = vec![
            Column::from_strs("name", &["alice", "bob"]),
            Column::from_i64s("age", vec![30, 25]),
        ];
        let original = DataFrame::new(columns).unwrap_or_else(|_| unreachable!());

        let encoded = original.to_json().unwrap_or_else(|_| unreachable!());
        let decoded = DataFrame::from_json(&encoded).unwrap_or_else(|_| unreachable!());

        assert_eq!(decoded.height(), 2);
        assert_eq!(decoded.width(), 2);
    }

    /// Integer, string and bool columns can coexist in one document.
    #[test]
    fn from_json_mixed_types() {
        let input = r#"[
            {"x": 1, "y": "hello", "z": true},
            {"x": 2, "y": "world", "z": false}
        ]"#;
        let parsed = DataFrame::from_json(input).unwrap_or_else(|_| unreachable!());
        assert_eq!(parsed.height(), 2);
        assert_eq!(parsed.width(), 3);
    }

    /// JSON `null` cells surface as `Scalar::Null` without changing the column type.
    #[test]
    fn from_json_with_nulls() {
        let input = r#"[
            {"x": 1, "y": "a"},
            {"x": null, "y": "b"},
            {"x": 3, "y": null}
        ]"#;
        let parsed = DataFrame::from_json(input).unwrap_or_else(|_| unreachable!());
        assert_eq!(parsed.height(), 3);

        let x_col = parsed.column("x").unwrap_or_else(|_| unreachable!());
        assert_eq!(x_col.get(0), Some(Scalar::Int64(1)));
        assert_eq!(x_col.get(1), Some(Scalar::Null));
        assert_eq!(x_col.get(2), Some(Scalar::Int64(3)));
    }

    /// An empty JSON array parses to an empty frame.
    #[test]
    fn from_json_empty_array() {
        let parsed = DataFrame::from_json("[]").unwrap_or_else(|_| unreachable!());
        assert!(parsed.is_empty());
    }

    /// Malformed JSON and non-array documents are both rejected.
    #[test]
    fn from_json_invalid() {
        for input in ["not json", "42"] {
            assert!(DataFrame::from_json(input).is_err());
        }
    }

    /// Fractional numbers produce a float column.
    #[test]
    fn from_json_floats() {
        let input = r#"[{"val": 1.5}, {"val": 2.5}]"#;
        let parsed = DataFrame::from_json(input).unwrap_or_else(|_| unreachable!());
        let val_col = parsed.column("val").unwrap_or_else(|_| unreachable!());
        assert_eq!(val_col.get(0), Some(Scalar::Float64(1.5)));
    }

    /// A frame written to disk with `to_json_file` can be read back through a reader.
    #[test]
    fn roundtrip_json_file() {
        let tmp = tempfile::tempdir().unwrap_or_else(|_| unreachable!());
        let target = tmp.path().join("test.json");

        let columns = vec![
            Column::from_strs("drug", &["asp", "met"]),
            Column::from_i64s("n", vec![100, 200]),
        ];
        let original = DataFrame::new(columns).unwrap_or_else(|_| unreachable!());

        original
            .to_json_file(&target)
            .unwrap_or_else(|_| unreachable!());

        let handle = std::fs::File::open(&target).unwrap_or_else(|_| unreachable!());
        let decoded = DataFrame::from_json_reader(std::io::BufReader::new(handle))
            .unwrap_or_else(|_| unreachable!());

        assert_eq!(decoded.height(), 2);
    }
}