zoon_format/
encode.rs

1use crate::{Result, ZoonError};
2use serde::Serialize;
3use std::collections::{BTreeMap, HashMap, HashSet};
4
5pub fn encode<T: Serialize>(value: &T) -> Result<String> {
6    let json_value = serde_json::to_value(value).map_err(|e| ZoonError::UnsupportedType(e.to_string()))?;
7    encode_value(&json_value)
8}
9
10fn encode_value(value: &serde_json::Value) -> Result<String> {
11    match value {
12        serde_json::Value::Array(arr) => encode_tabular(arr),
13        serde_json::Value::Object(obj) => encode_inline(obj),
14        _ => Err(ZoonError::InvalidFormat("top level must be object or array".into())),
15    }
16}
17
18fn flatten_object(
19    prefix: &str,
20    value: &serde_json::Value,
21    result: &mut BTreeMap<String, serde_json::Value>,
22) {
23    if let serde_json::Value::Object(obj) = value {
24        for (k, v) in obj {
25            let new_key = if prefix.is_empty() {
26                k.clone()
27            } else {
28                format!("{}.{}", prefix, k)
29            };
30            flatten_object(&new_key, v, result);
31        }
32    } else {
33        result.insert(prefix.to_string(), value.clone());
34    }
35}
36
/// Picks short aliases for dotted key prefixes that repeat often enough to pay off.
///
/// Every proper dotted prefix of every key is counted (`a.b.c` contributes `a`
/// and `a.b`). A prefix is a candidate when
/// `(len - 2) * count - (len + 4) > 0`, where `len` is its byte length.
/// Candidates are processed from the highest score down; ties are broken by the
/// prefix text so the result does not depend on `HashMap` iteration order.
///
/// The preferred alias is the lowercased first character of the prefix's last
/// segment. If that is taken and is a single byte, letters `a`..`z` are tried
/// in turn (the position in that sequence is shared across all prefixes). A
/// prefix that ends up without a free alias is left unaliased, as is a prefix
/// whose last segment is empty (e.g. produced by a key such as `abc..d`).
///
/// Returns a `prefix -> alias` map holding at most ten entries.
fn detect_aliases(keys: &[String]) -> HashMap<String, String> {
    // Upper bound on the number of aliases emitted for one table.
    const MAX_ALIASES: usize = 10;
    // Number of fallback letters available (`a`..=`z`).
    const FALLBACK_LETTERS: u8 = 26;

    let mut prefix_counts: HashMap<String, usize> = HashMap::new();
    for key in keys {
        let parts: Vec<&str> = key.split('.').collect();
        // `1..parts.len()` is empty for keys without a dot.
        for end in 1..parts.len() {
            *prefix_counts.entry(parts[..end].join(".")).or_insert(0) += 1;
        }
    }

    let mut savings: Vec<(String, isize)> = Vec::new();
    for (prefix, count) in prefix_counts {
        let prefix_len = prefix.len() as isize;
        // Formula: (prefix - 2) * count - (prefix + 4)
        let score = (prefix_len - 2) * count as isize - (prefix_len + 4);
        if score > 0 {
            savings.push((prefix, score));
        }
    }
    // Best score first; equal scores fall back to the prefix text so that the
    // alias assignment is reproducible from run to run.
    savings.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));

    let mut aliases = HashMap::new();
    let mut used_aliases: HashSet<String> = HashSet::new();
    let mut alias_idx: u8 = 0;

    for (prefix, _) in savings {
        // `rsplit` always yields at least one item; the segment itself may be
        // empty, in which case there is no initial to derive an alias from.
        let initial = match prefix.rsplit('.').next().and_then(|seg| seg.chars().next()) {
            Some(c) => c,
            None => continue,
        };
        let mut candidate = initial.to_lowercase().to_string();

        // Walk the shared a..z sequence while the candidate is taken. Only
        // single-byte candidates take part in the fallback, and the index is
        // bounded so it can never run past `z`.
        while used_aliases.contains(&candidate)
            && candidate.len() == 1
            && alias_idx < FALLBACK_LETTERS
        {
            candidate = char::from(b'a' + alias_idx).to_string();
            alias_idx += 1;
        }

        if !used_aliases.contains(&candidate) {
            used_aliases.insert(candidate.clone());
            aliases.insert(prefix, candidate);
        }
        if aliases.len() >= MAX_ALIASES {
            break;
        }
    }

    aliases
}
100
/// Rewrites `name` using the alias of its longest aliased dotted prefix.
///
/// A prefix matches when `name` equals it or continues with a `.` right after
/// it. The match is rendered as `%alias` followed by the remainder of `name`
/// (including the leading `.`, if any). When several aliased prefixes match
/// (for example `user` and `user.profile`), the longest one is used, so the
/// result is independent of `HashMap` iteration order. Names without a
/// matching prefix are returned unchanged.
fn apply_alias(name: &str, aliases: &HashMap<String, String>) -> String {
    let best = aliases
        .iter()
        .filter(|(prefix, _)| match name.strip_prefix(prefix.as_str()) {
            // Require a segment boundary so `user` does not match `username`.
            Some(rest) => rest.is_empty() || rest.starts_with('.'),
            None => false,
        })
        // Distinct matching prefixes always differ in length, so there are no ties.
        .max_by_key(|(prefix, _)| prefix.len());

    match best {
        Some((prefix, alias)) => format!("%{}{}", alias, &name[prefix.len()..]),
        None => name.to_string(),
    }
}
112
113fn encode_tabular(arr: &[serde_json::Value]) -> Result<String> {
114    if arr.is_empty() {
115        return Ok(String::new());
116    }
117
118    // 1. Flatten
119    let mut flattened_rows = Vec::new();
120    let mut all_keys_set = HashSet::new();
121
122    for item in arr {
123        let mut flat_map = BTreeMap::new();
124        flatten_object("", item, &mut flat_map);
125        for k in flat_map.keys() {
126            all_keys_set.insert(k.clone());
127        }
128        flattened_rows.push(flat_map);
129    }
130    
131    let mut all_keys: Vec<String> = all_keys_set.into_iter().collect();
132    all_keys.sort();
133    
134    // 2. Constants
135    let mut constants = BTreeMap::new();
136    let mut active_keys = Vec::new();
137    
138    if arr.len() > 1 {
139        for key in &all_keys {
140            let first_val = flattened_rows[0].get(key).unwrap_or(&serde_json::Value::Null);
141            let mut is_const = true;
142            for row in &flattened_rows {
143                let val = row.get(key).unwrap_or(&serde_json::Value::Null);
144                if val != first_val {
145                    is_const = false;
146                    break;
147                }
148            }
149            if is_const && !first_val.is_null() {
150                constants.insert(key.clone(), first_val.clone());
151            } else {
152                active_keys.push(key.clone());
153            }
154        }
155    } else {
156        active_keys = all_keys;
157    }
158    
159    // 3. Aliases
160    let aliases = detect_aliases(&active_keys);
161
162    // 4. Stats
163    let mut stats: BTreeMap<String, ColumnStats> = BTreeMap::new();
164    for key in &active_keys {
165        stats.insert(key.clone(), ColumnStats::default());
166    }
167    
168    for row in &flattened_rows {
169        for key in &active_keys {
170            let val = row.get(key).unwrap_or(&serde_json::Value::Null);
171            let s = serialize_value(val);
172            let stat = stats.get_mut(key).unwrap();
173            
174            stat.values.push(s.clone());
175            stat.unique_vals.insert(s);
176            
177            // Guess is likely ID (simplified logic)
178            if key.to_lowercase() == "id" {
179                if let serde_json::Value::Number(_) = val {
180                    stat.is_seq = true;
181                }
182            }
183        }
184    }
185    
186    // Build Header
187    let mut header_parts = vec!["#".to_string()];
188    
189    // Alias definitions
190    let mut alias_defs: Vec<String> = Vec::new();
191    for (prefix, alias) in &aliases {
192        alias_defs.push(format!("%{}={}", alias, prefix));
193    }
194    alias_defs.sort(); // Deterministic
195    
196    // Combine alias defs into lines before header?
197    // Spec: "%a=... # ..." or separate lines.
198    // ZOON 1.1 spec usually puts alias defs on new line or same line.
199    // My Go/Python impl use separate lines if aliases exist.
200    // But function returns one String.
201    
202    let mut lines = Vec::new();
203    if !alias_defs.is_empty() {
204        lines.push(alias_defs.join(" "));
205    }
206
207    // Constants
208    for (k, v) in &constants {
209        let aliased = apply_alias(k, &aliases).replace(" ", "_");
210        let s_val = serialize_value(v);
211        let mut type_code = ":";
212        
213        if let serde_json::Value::String(_) = v {
214            type_code = "=";
215        } else if let serde_json::Value::Bool(b) = v {
216             if *b {
217                 // s_val is "1", but for constant display "y" is nicer or standard?
218                 // Spec: @active:y or @active:n
219                 // serialize_value returns "1" or "0".
220                 // We should fix this.
221             }
222        }
223        
224        let display_val = if let serde_json::Value::Bool(b) = v {
225            if *b { "y".to_string() } else { "n".to_string() }
226        } else {
227            s_val
228        };
229        
230        let sep = if let serde_json::Value::String(_) = v { "=" } else { ":" };
231        header_parts.push(format!("@{}{}{}", aliased, sep, display_val));
232    }
233
234    let mut skip_indices = HashSet::new();
235    
236    for (i, key) in active_keys.iter().enumerate() {
237        let stat = stats.get(key).unwrap();
238        let aliased = apply_alias(key, &aliases).replace(" ", "_");
239        let type_code = infer_type(stat, arr.len(), key);
240        
241        if type_code == "i+" {
242            skip_indices.insert(i);
243        }
244        
245        if type_code.starts_with('=') || type_code.starts_with('!') {
246            header_parts.push(format!("{}{}", aliased, type_code));
247        } else {
248            header_parts.push(format!("{}:{}", aliased, type_code));
249        }
250    }
251
252    // +N Check
253    // Check if all active columns are skipped (i+)
254    let mut all_skipped = true;
255    if active_keys.is_empty() {
256        // all implicit
257    } else {
258        for i in 0..active_keys.len() {
259            if !skip_indices.contains(&i) {
260                all_skipped = false;
261                break;
262            }
263        }
264    }
265    
266    if all_skipped && !arr.is_empty() {
267        header_parts.push(format!("+{}", arr.len()));
268    }
269
270    lines.push(header_parts.join(" "));
271    
272    if all_skipped {
273        return Ok(lines.join("\n"));
274    }
275    
276    // Rows
277    for row in &flattened_rows {
278        let mut out_row = Vec::new();
279        for (i, key) in active_keys.iter().enumerate() {
280            if skip_indices.contains(&i) { continue; }
281            
282            let val = row.get(key).unwrap_or(&serde_json::Value::Null);
283            let mut s = serialize_value(val);
284            
285            let stat = stats.get(key).unwrap();
286            let type_code = infer_type(stat, arr.len(), key);
287            
288            if type_code == "b" {
289                 if s == "true" { s = "1".into(); }
290                 else if s == "false" { s = "0".into(); }
291            } else if type_code == "t" {
292                if let serde_json::Value::String(raw) = val {
293                    s = format!("\"{}\"", raw.replace('"', "\\\""));
294                }
295            }
296            out_row.push(s);
297        }
298        lines.push(out_row.join(" "));
299    }
300
301    Ok(lines.join("\n"))
302}
303
304fn encode_inline(obj: &serde_json::Map<String, serde_json::Value>) -> Result<String> {
305    let parts: Vec<String> = obj.iter().map(|(k, v)| format_inline_pair(k, v)).collect();
306    Ok(parts.join(" "))
307}
308
309fn format_inline_pair(key: &str, value: &serde_json::Value) -> String {
310    match value {
311        serde_json::Value::String(s) => format!("{}={}", key, s.replace(' ', "_")),
312        serde_json::Value::Bool(b) => format!("{}:{}", key, if *b { "y" } else { "n" }),
313        serde_json::Value::Number(n) => format!("{}:{}", key, n),
314        serde_json::Value::Null => format!("{}:~", key),
315        serde_json::Value::Object(obj) => {
316            let inner = encode_inline(obj).unwrap_or_default();
317            format!("{}:{{{}}}", key, inner)
318        }
319        serde_json::Value::Array(_) => format!("{}:[...]", key),
320    }
321}
322
323fn serialize_value(value: &serde_json::Value) -> String {
324    match value {
325        serde_json::Value::String(s) => s.replace(' ', "_"),
326        serde_json::Value::Number(n) => n.to_string(),
327        serde_json::Value::Bool(b) => if *b { "1".into() } else { "0".into() },
328        serde_json::Value::Null => "~".into(),
329        serde_json::Value::Object(obj) => {
330            let inner = encode_inline(obj).unwrap_or_default();
331            format!("{{{}}}", inner)
332        }
333        serde_json::Value::Array(_) => "[...]".into(),
334    }
335}
336
/// Per-column data gathered while scanning the rows of a table.
#[derive(Default)]
struct ColumnStats {
    /// Serialized cell text of the column, one entry per row, in row order.
    values: Vec<String>,
    /// The distinct serialized cell texts seen in the column.
    unique_vals: HashSet<String>,
    /// Set when the column key is `id` (case-insensitive) and a cell is a JSON number.
    is_seq: bool,
    /// Not read or written anywhere in the visible code.
    indexed: bool,
    /// Not read or written anywhere in the visible code.
    enum_keys: Vec<String>,
    /// Not read or written anywhere in the visible code.
    is_text: bool,
}
346
347fn infer_type(stat: &ColumnStats, arr_len: usize, key: &str) -> String {
348    if key.to_lowercase() == "id" && stat.is_seq && check_sequence(&stat.values) {
349        return "i+".into();
350    }
351
352    let all_nums = stat.values.iter().all(|v| v.parse::<i64>().is_ok() || v == "~");
353    if all_nums && !stat.values.iter().all(|v| v == "~") {
354        return "i".into();
355    }
356
357    let all_bools = stat.values.iter().all(|v| v == "0" || v == "1" || v == "~");
358    if all_bools {
359        return "b".into();
360    }
361
362    if stat.unique_vals.len() <= 10 && stat.unique_vals.len() < arr_len {
363        let mut vals: Vec<_> = stat.unique_vals.iter().filter(|v| *v != "~").cloned().collect();
364        vals.sort();
365        if !vals.is_empty() {
366            if vals.len() >= 3 {
367                let avg_len: usize = vals.iter().map(|v| v.len()).sum::<usize>() / vals.len();
368                let literal_cost = avg_len * arr_len;
369                let index_cost = vals.join("|").len() + arr_len * 2;
370                if literal_cost > index_cost {
371                    return format!("!{}", vals.join("|"));
372                }
373            }
374            return format!("={}", vals.join("|"));
375        }
376    }
377
378    let total_len: usize = stat.values.iter().map(|v| v.len()).sum();
379    if !stat.values.is_empty() && total_len / stat.values.len() > 30 {
380        return "t".into();
381    }
382
383    "s".into()
384}
385
/// Returns `true` when `values` is exactly `"1", "2", "3", ...` in order.
///
/// The comparison is textual, so `"01"` does not count as `1`. An empty slice
/// is considered a valid sequence.
fn check_sequence(values: &[String]) -> bool {
    values
        .iter()
        .zip(1usize..)
        .all(|(cell, expected)| *cell == expected.to_string())
}
393}