json_packer/
pool.rs

1use std::collections::HashMap;
2
3use serde_json::Value;
4
5use crate::{bitstream::BitWriter, varint, types::tag};
6
/// A table of repeated strings together with a reverse lookup.
///
/// The two fields are expected to mirror each other: for every `id`,
/// `index[&entries[id]] == id as u64`. [`collect_string_pool`] builds pools
/// that satisfy this; code that mutates the public fields directly is
/// responsible for keeping them in sync.
///
/// `Default` yields an empty pool (no entries, empty index).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct StringPool {
    /// Pool id -> string value; the position in the vector is the id.
    pub entries: Vec<String>,
    /// String value -> pool id (the inverse of `entries`).
    pub index: HashMap<String, u64>,
}
14
/// Thresholds that decide which strings are worth placing in the string pool.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PoolConfig {
    /// Minimum number of occurrences a string needs to be pooled.
    pub min_repeats: u32,
    /// Minimum length of a string, in UTF-8 bytes, for it to be pooled.
    pub min_string_len: usize,
}
20
21impl Default for PoolConfig {
22    fn default() -> Self { Self { min_repeats: 3, min_string_len: 8 } }
23}
24
25pub fn collect_string_pool(root: &Value, cfg: PoolConfig) -> StringPool {
26    let mut counter: HashMap<String, u32> = HashMap::new();
27    fn walk(v: &Value, counter: &mut HashMap<String, u32>) {
28        match v {
29            Value::String(s) => {
30                *counter.entry(s.clone()).or_insert(0) += 1;
31            }
32            Value::Array(a) => for x in a { walk(x, counter); },
33            Value::Object(m) => for (_k, x) in m { walk(x, counter); },
34            _ => {}
35        }
36    }
37    walk(root, &mut counter);
38
39    // 过滤并排序:频次降序,其次字节序升序,确保确定性
40    let mut candidates: Vec<(String, u32)> = counter
41        .into_iter()
42        .filter(|(s, c)| *c >= cfg.min_repeats && s.len() >= cfg.min_string_len)
43        .collect();
44    candidates.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
45
46    let mut entries: Vec<String> = Vec::with_capacity(candidates.len());
47    let mut index: HashMap<String, u64> = HashMap::with_capacity(candidates.len());
48    for (i, (s, _)) in candidates.into_iter().enumerate() {
49        index.insert(s.clone(), i as u64);
50        entries.push(s);
51    }
52    StringPool { entries, index }
53}
54
55pub fn write_string_pool(writer: &mut BitWriter, pool: &StringPool) {
56    for s in &pool.entries {
57        // 在池里写入原始值:tag::STRING + len + bytes(不写 is_pool_ref)
58        writer.write_bits(tag::STRING as u64, 3);
59        let bytes = s.as_bytes();
60        varint::write_uleb128(writer, bytes.len() as u64);
61        for &b in bytes { writer.write_byte(b); }
62    }
63}
64
65