base_d/encoders/algorithms/schema/
fiche_analyzer.rs

1use serde_json::Value;
2
/// Detected fiche mode based on JSON structure.
///
/// Produced by the auto-detection heuristics in this module. The type is
/// `Copy` and comparable, so callers can use `==` / `assert_eq!` directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectedMode {
    /// Chosen for homogeneous (tabular) data and as the default fallback.
    Full,
    /// Chosen for varying, deeply nested, or path-heavy structures.
    Path,
}
9
10/// Auto-detect the best fiche mode for the given JSON structure
11pub fn detect_fiche_mode(json: &str) -> DetectedMode {
12    let value: Value = match serde_json::from_str(json) {
13        Ok(v) => v,
14        Err(_) => return DetectedMode::Full, // Default on parse failure
15    };
16
17    let analysis = analyze_structure(&value, 0);
18
19    // Decision heuristics:
20    // 1. If root is homogeneous array of objects → Full (tabular data)
21    // 2. If varying structure in arrays → Path (schema explosion risk)
22    // 3. If deep nesting (>4 levels) + has indexed arrays → Path
23    // 4. If schema explosion detected (>50 unique paths) → Path
24    // 5. Default → Full
25
26    if is_homogeneous_array(&value) {
27        return DetectedMode::Full;
28    }
29
30    // Varying structure is a strong signal for path mode
31    if analysis.has_varying_array_structure {
32        return DetectedMode::Path;
33    }
34
35    // Deep nesting (>3 levels) suggests complex structure better suited for path mode
36    if analysis.max_depth > 3 && analysis.has_indexed_arrays {
37        return DetectedMode::Path;
38    }
39
40    // Lower threshold for path explosion
41    if analysis.unique_paths > 50 {
42        return DetectedMode::Path;
43    }
44
45    // Default to full for typical structured data
46    DetectedMode::Full
47}
48
/// Summary of a JSON value's shape, gathered by a recursive walk.
///
/// The default value has zero depth, zero paths, and both flags cleared.
#[derive(Debug, Default)]
struct StructureAnalysis {
    /// Deepest nesting level reached; the level of the analysed value itself
    /// is the starting point.
    max_depth: usize,
    /// Estimated number of distinct paths; arrays of objects multiply their
    /// widest element's count by the array length.
    unique_paths: usize,
    /// Set when some array's first element is an object.
    has_indexed_arrays: bool,
    /// Set when some array's elements differ in type or in object keys.
    has_varying_array_structure: bool,
}
56
57fn analyze_structure(value: &Value, depth: usize) -> StructureAnalysis {
58    let mut analysis = StructureAnalysis {
59        max_depth: depth,
60        ..Default::default()
61    };
62
63    match value {
64        Value::Object(map) => {
65            let mut paths = 0;
66            for (_, v) in map {
67                let child = analyze_structure(v, depth + 1);
68                analysis.max_depth = analysis.max_depth.max(child.max_depth);
69                paths += child.unique_paths.max(1);
70                analysis.has_indexed_arrays |= child.has_indexed_arrays;
71                analysis.has_varying_array_structure |= child.has_varying_array_structure;
72            }
73            analysis.unique_paths = paths;
74        }
75        Value::Array(arr) => {
76            if arr.is_empty() {
77                analysis.unique_paths = 1;
78                return analysis;
79            }
80
81            // Check for homogeneity in arrays
82            let first_type = type_signature(&arr[0]);
83            let mut max_child_analysis = StructureAnalysis::default();
84
85            // For arrays of objects, check key consistency
86            if matches!(arr[0], Value::Object(_)) {
87                let first_keys = if let Value::Object(map) = &arr[0] {
88                    map.keys().collect::<Vec<_>>()
89                } else {
90                    vec![]
91                };
92
93                for item in arr.iter().skip(1) {
94                    if type_signature(item) != first_type {
95                        analysis.has_varying_array_structure = true;
96                    }
97
98                    // Check if keys match
99                    if let Value::Object(map) = item {
100                        let keys = map.keys().collect::<Vec<_>>();
101                        if keys != first_keys {
102                            analysis.has_varying_array_structure = true;
103                        }
104                    }
105                }
106            } else {
107                // For non-object arrays, just check type
108                for item in arr.iter().skip(1) {
109                    if type_signature(item) != first_type {
110                        analysis.has_varying_array_structure = true;
111                    }
112                }
113            }
114
115            // Analyze all children
116            for item in arr {
117                let child = analyze_structure(item, depth + 1);
118                max_child_analysis.max_depth = max_child_analysis.max_depth.max(child.max_depth);
119                max_child_analysis.unique_paths =
120                    max_child_analysis.unique_paths.max(child.unique_paths);
121                max_child_analysis.has_indexed_arrays |= child.has_indexed_arrays;
122                max_child_analysis.has_varying_array_structure |= child.has_varying_array_structure;
123            }
124
125            // Any array of objects creates indexed paths
126            if matches!(arr[0], Value::Object(_)) {
127                analysis.has_indexed_arrays = true;
128            }
129
130            // If array of objects → paths multiply by array length
131            if matches!(arr[0], Value::Object(_)) {
132                analysis.unique_paths = max_child_analysis.unique_paths * arr.len();
133            } else {
134                analysis.unique_paths = 1;
135            }
136
137            analysis.max_depth = max_child_analysis.max_depth;
138            analysis.has_indexed_arrays |= max_child_analysis.has_indexed_arrays;
139            analysis.has_varying_array_structure |= max_child_analysis.has_varying_array_structure;
140        }
141        _ => {
142            analysis.unique_paths = 1;
143        }
144    }
145
146    analysis
147}
148
149fn type_signature(value: &Value) -> &str {
150    match value {
151        Value::Null => "null",
152        Value::Bool(_) => "bool",
153        Value::Number(_) => "number",
154        Value::String(_) => "string",
155        Value::Array(_) => "array",
156        Value::Object(_) => "object",
157    }
158}
159
160fn is_homogeneous_array(value: &Value) -> bool {
161    match value {
162        Value::Array(arr) => {
163            if arr.is_empty() {
164                return false;
165            }
166
167            // Check if all elements are objects with same keys
168            let first = match &arr[0] {
169                Value::Object(map) => map,
170                _ => return false,
171            };
172
173            let first_keys: Vec<_> = first.keys().collect();
174
175            for item in arr.iter().skip(1) {
176                match item {
177                    Value::Object(map) => {
178                        let keys: Vec<_> = map.keys().collect();
179                        if keys.len() != first_keys.len() {
180                            return false;
181                        }
182                        for key in &first_keys {
183                            if !keys.contains(key) {
184                                return false;
185                            }
186                        }
187                    }
188                    _ => return false,
189                }
190            }
191            true
192        }
193        Value::Object(map) => {
194            // Check for wrapper keys like "results", "data", etc.
195            if map.len() == 1 {
196                for (key, value) in map {
197                    if matches!(
198                        key.as_str(),
199                        "results" | "data" | "items" | "records" | "rows"
200                    ) {
201                        return is_homogeneous_array(value);
202                    }
203                }
204            }
205            false
206        }
207        _ => false,
208    }
209}
210
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_homogeneous_array() {
        let json = r#"[{"name":"alice"},{"name":"bob"}]"#;
        let mode = detect_fiche_mode(json);
        assert!(matches!(mode, DetectedMode::Full));
    }

    #[test]
    fn test_detect_deep_nested() {
        // Depth: root(0) → a(1) → b(2) → c(3) → d(4) → e array(5) → items(6) → f(7)
        // Should trigger path mode due to depth > 3 + an array of objects
        let json = r#"{"a":{"b":{"c":{"d":{"e":[{"f":1},{"f":2}]}}}}}"#;
        let mode = detect_fiche_mode(json);
        assert!(matches!(mode, DetectedMode::Path));
    }

    #[test]
    fn test_detect_varying_structure() {
        let json = r#"{"items":[{"type":"a","x":1},{"type":"b","y":2}]}"#;
        let mode = detect_fiche_mode(json);
        // Should detect varying structure
        assert!(matches!(mode, DetectedMode::Path));
    }

    #[test]
    fn test_detect_simple_object() {
        let json = r#"{"id":1,"name":"alice"}"#;
        let mode = detect_fiche_mode(json);
        assert!(matches!(mode, DetectedMode::Full));
    }

    #[test]
    fn test_detect_wrapper_key() {
        let json = r#"{"results":[{"id":1},{"id":2}]}"#;
        let mode = detect_fiche_mode(json);
        assert!(matches!(mode, DetectedMode::Full));
    }

    #[test]
    fn test_detect_invalid_json_defaults_to_full() {
        // Unparseable input falls back to the default mode
        let mode = detect_fiche_mode("not json");
        assert!(matches!(mode, DetectedMode::Full));
    }

    #[test]
    fn test_detect_empty_array() {
        // An empty array is neither tabular nor complex, so the default applies
        let mode = detect_fiche_mode("[]");
        assert!(matches!(mode, DetectedMode::Full));
    }
}