base_d/encoders/algorithms/schema/
stele_analyzer.rs

1use serde_json::Value;
2
/// Stele mode chosen by [`detect_stele_mode`] from the structure of a JSON document.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectedMode {
    /// The default: returned when the input does not parse as JSON and when
    /// none of the path-mode heuristics fire.
    Full,
    /// Returned when the analysis finds more than 50 unique paths, nesting
    /// deeper than 3 levels together with an array of objects, or an array
    /// whose elements vary in structure.
    Path,
}
9
10/// Auto-detect the best stele mode for the given JSON structure
11pub fn detect_stele_mode(json: &str) -> DetectedMode {
12    let value: Value = match serde_json::from_str(json) {
13        Ok(v) => v,
14        Err(_) => return DetectedMode::Full, // Default on parse failure
15    };
16
17    let analysis = analyze_structure(&value, 0);
18
19    // Decision heuristics:
20    // 1. If schema explosion detected (>50 unique paths) → Path (most reliable signal)
21    // 2. If deep nesting (>3 levels) + has indexed arrays → Path
22    // 3. If varying structure in arrays → Path (schema explosion risk)
23    // 4. If root is homogeneous array of objects → Full (tabular data)
24    // 5. Default → Full
25
26    // Check schema explosion FIRST - this is the most reliable signal
27    if analysis.unique_paths > 50 {
28        return DetectedMode::Path;
29    }
30
31    // Deep nesting with indexed arrays
32    if analysis.max_depth > 3 && analysis.has_indexed_arrays {
33        return DetectedMode::Path;
34    }
35
36    // Varying structure is a strong signal for path mode
37    if analysis.has_varying_array_structure {
38        return DetectedMode::Path;
39    }
40
41    // Homogeneous arrays work well with full mode
42    if is_homogeneous_array(&value) {
43        return DetectedMode::Full;
44    }
45
46    // Default to full for typical structured data
47    DetectedMode::Full
48}
49
/// Structural measurements collected by `analyze_structure` for one JSON value.
#[derive(Debug, Default)]
struct StructureAnalysis {
    /// Deepest nesting level reached, counted from the depth given to the root call.
    max_depth: usize,
    /// Estimated number of distinct paths; for arrays of objects the largest
    /// per-element count is multiplied by the array length.
    unique_paths: usize,
    /// Set when a non-empty array whose first element is an object was seen.
    has_indexed_arrays: bool,
    /// Set when some array mixes element types or holds objects with differing keys.
    has_varying_array_structure: bool,
}
57
58fn analyze_structure(value: &Value, depth: usize) -> StructureAnalysis {
59    let mut analysis = StructureAnalysis {
60        max_depth: depth,
61        ..Default::default()
62    };
63
64    match value {
65        Value::Object(map) => {
66            let mut paths = 0;
67            for (_, v) in map {
68                let child = analyze_structure(v, depth + 1);
69                analysis.max_depth = analysis.max_depth.max(child.max_depth);
70                paths += child.unique_paths.max(1);
71                analysis.has_indexed_arrays |= child.has_indexed_arrays;
72                analysis.has_varying_array_structure |= child.has_varying_array_structure;
73            }
74            analysis.unique_paths = paths;
75        }
76        Value::Array(arr) => {
77            if arr.is_empty() {
78                analysis.unique_paths = 1;
79                return analysis;
80            }
81
82            // Check for homogeneity in arrays
83            let first_type = type_signature(&arr[0]);
84            let mut max_child_analysis = StructureAnalysis::default();
85
86            // For arrays of objects, check key consistency
87            if matches!(arr[0], Value::Object(_)) {
88                let first_keys = if let Value::Object(map) = &arr[0] {
89                    map.keys().collect::<Vec<_>>()
90                } else {
91                    vec![]
92                };
93
94                for item in arr.iter().skip(1) {
95                    if type_signature(item) != first_type {
96                        analysis.has_varying_array_structure = true;
97                    }
98
99                    // Check if keys match
100                    if let Value::Object(map) = item {
101                        let keys = map.keys().collect::<Vec<_>>();
102                        if keys != first_keys {
103                            analysis.has_varying_array_structure = true;
104                        }
105                    }
106                }
107            } else {
108                // For non-object arrays, just check type
109                for item in arr.iter().skip(1) {
110                    if type_signature(item) != first_type {
111                        analysis.has_varying_array_structure = true;
112                    }
113                }
114            }
115
116            // Analyze all children
117            for item in arr {
118                let child = analyze_structure(item, depth + 1);
119                max_child_analysis.max_depth = max_child_analysis.max_depth.max(child.max_depth);
120                max_child_analysis.unique_paths =
121                    max_child_analysis.unique_paths.max(child.unique_paths);
122                max_child_analysis.has_indexed_arrays |= child.has_indexed_arrays;
123                max_child_analysis.has_varying_array_structure |= child.has_varying_array_structure;
124            }
125
126            // Any array of objects creates indexed paths
127            if matches!(arr[0], Value::Object(_)) {
128                analysis.has_indexed_arrays = true;
129            }
130
131            // If array of objects → paths multiply by array length
132            if matches!(arr[0], Value::Object(_)) {
133                analysis.unique_paths = max_child_analysis.unique_paths * arr.len();
134            } else {
135                analysis.unique_paths = 1;
136            }
137
138            analysis.max_depth = max_child_analysis.max_depth;
139            analysis.has_indexed_arrays |= max_child_analysis.has_indexed_arrays;
140            analysis.has_varying_array_structure |= max_child_analysis.has_varying_array_structure;
141        }
142        _ => {
143            analysis.unique_paths = 1;
144        }
145    }
146
147    analysis
148}
149
150fn type_signature(value: &Value) -> &str {
151    match value {
152        Value::Null => "null",
153        Value::Bool(_) => "bool",
154        Value::Number(_) => "number",
155        Value::String(_) => "string",
156        Value::Array(_) => "array",
157        Value::Object(_) => "object",
158    }
159}
160
161fn is_homogeneous_array(value: &Value) -> bool {
162    match value {
163        Value::Array(arr) => {
164            if arr.is_empty() {
165                return false;
166            }
167
168            // Check if all elements are objects with same keys
169            let first = match &arr[0] {
170                Value::Object(map) => map,
171                _ => return false,
172            };
173
174            let first_keys: Vec<_> = first.keys().collect();
175
176            for item in arr.iter().skip(1) {
177                match item {
178                    Value::Object(map) => {
179                        let keys: Vec<_> = map.keys().collect();
180                        if keys.len() != first_keys.len() {
181                            return false;
182                        }
183                        for key in &first_keys {
184                            if !keys.contains(key) {
185                                return false;
186                            }
187                        }
188                    }
189                    _ => return false,
190                }
191            }
192            true
193        }
194        Value::Object(map) => {
195            // Check for wrapper keys like "results", "data", etc.
196            if map.len() == 1 {
197                for (key, value) in map {
198                    if matches!(
199                        key.as_str(),
200                        "results" | "data" | "items" | "records" | "rows"
201                    ) {
202                        return is_homogeneous_array(value);
203                    }
204                }
205            }
206            false
207        }
208        _ => false,
209    }
210}
211
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_homogeneous_array() {
        let json = r#"[{"name":"alice"},{"name":"bob"}]"#;
        let mode = detect_stele_mode(json);
        assert!(matches!(mode, DetectedMode::Full));
    }

    #[test]
    fn test_detect_deep_nested() {
        // Value depths: a(1) → b(2) → c(3) → d(4) → e(5) → array items(6) → f(7)
        // Triggers path mode: depth > 3 combined with an array of objects.
        let json = r#"{"a":{"b":{"c":{"d":{"e":[{"f":1},{"f":2}]}}}}}"#;
        let mode = detect_stele_mode(json);
        assert!(matches!(mode, DetectedMode::Path));
    }

    #[test]
    fn test_detect_varying_structure() {
        // The two array elements have different key sets ("x" vs "y").
        let json = r#"{"items":[{"type":"a","x":1},{"type":"b","y":2}]}"#;
        let mode = detect_stele_mode(json);
        assert!(matches!(mode, DetectedMode::Path));
    }

    #[test]
    fn test_detect_simple_object() {
        let json = r#"{"id":1,"name":"alice"}"#;
        let mode = detect_stele_mode(json);
        assert!(matches!(mode, DetectedMode::Full));
    }

    #[test]
    fn test_detect_wrapper_key() {
        let json = r#"{"results":[{"id":1},{"id":2}]}"#;
        let mode = detect_stele_mode(json);
        assert!(matches!(mode, DetectedMode::Full));
    }

    #[test]
    fn test_detect_invalid_json_defaults_to_full() {
        // Parse failures fall back to the default mode instead of erroring.
        let mode = detect_stele_mode("{not valid json");
        assert!(matches!(mode, DetectedMode::Full));
    }
}