use std::collections::HashMap;
use crate::error::Result;
#[derive(Debug, Clone)]
struct DictEntry {
pattern: String,
code: String,
#[allow(dead_code)]
frequency: u32,
}
#[derive(Debug, Clone)]
pub struct DictConfig {
pub max_entries: usize,
pub min_pattern_length: usize,
pub min_frequency: u32,
}
impl Default for DictConfig {
fn default() -> Self {
Self {
max_entries: 256,
min_pattern_length: 4,
min_frequency: 2,
}
}
}
#[derive(Debug, Clone)]
pub struct DictCompressResult {
pub data: String,
pub substitutions: usize,
pub bytes_saved: usize,
pub has_dict_header: bool,
}
pub struct DictCompressor {
config: DictConfig,
entries: HashMap<String, DictEntry>,
candidates: HashMap<String, u32>,
next_code: u16,
builtin_entries: Vec<DictEntry>,
}
impl DictCompressor {
pub fn new() -> Self {
Self::with_config(DictConfig::default())
}
pub fn with_config(config: DictConfig) -> Self {
let builtin = build_builtin_dictionary();
let mut compressor = Self {
config,
entries: HashMap::new(),
candidates: HashMap::new(),
next_code: 0,
builtin_entries: builtin,
};
compressor.load_builtins();
compressor
}
fn load_builtins(&mut self) {
for entry in &self.builtin_entries.clone() {
self.entries.insert(entry.pattern.clone(), entry.clone());
}
self.next_code = self.entries.len() as u16;
}
pub fn observe(&mut self, json_str: &str) {
let fields = extract_json_fields(json_str);
for field in fields {
if field.len() >= self.config.min_pattern_length {
let count = self.candidates.entry(field.clone()).or_insert(0);
*count += 1;
if *count >= self.config.min_frequency
&& !self.entries.contains_key(&field)
&& self.entries.len() < self.config.max_entries
{
let code = format!("~{:X}", self.next_code);
self.next_code += 1;
self.entries.insert(
field.clone(),
DictEntry {
pattern: field,
code,
frequency: *count,
},
);
}
}
}
}
pub fn compress(&self, json_str: &str) -> Result<DictCompressResult> {
if self.entries.is_empty() {
return Ok(DictCompressResult {
data: json_str.to_string(),
substitutions: 0,
bytes_saved: 0,
has_dict_header: false,
});
}
let mut result = json_str.to_string();
let mut substitutions = 0usize;
let mut bytes_saved = 0usize;
let mut used_entries: Vec<&DictEntry> = Vec::new();
let mut sorted_entries: Vec<&DictEntry> = self.entries.values().collect();
sorted_entries.sort_by(|a, b| b.pattern.len().cmp(&a.pattern.len()));
for entry in &sorted_entries {
let search = format!("\"{}\":", entry.pattern);
let replace = format!("\"{}\":", entry.code);
if result.contains(&search) {
let count = result.matches(&search).count();
result = result.replace(&search, &replace);
substitutions += count;
bytes_saved += count * (search.len() - replace.len());
used_entries.push(entry);
}
}
let has_dict_header = !used_entries.is_empty();
if has_dict_header {
let header = format_dict_header(&used_entries);
result = format!("{header}\n{result}");
}
Ok(DictCompressResult {
data: result,
substitutions,
bytes_saved,
has_dict_header,
})
}
pub fn dict_size(&self) -> usize {
self.entries.len()
}
pub fn reset(&mut self) {
self.entries.clear();
self.candidates.clear();
self.next_code = 0;
self.load_builtins();
}
}
impl Default for DictCompressor {
fn default() -> Self {
Self::new()
}
}
fn build_builtin_dictionary() -> Vec<DictEntry> {
let common_fields = [
("status_code", "~s"),
("message", "~m"),
("timestamp", "~ts"),
("created_at", "~ca"),
("updated_at", "~ua"),
("deleted_at", "~da"),
("description", "~dc"),
("metadata", "~md"),
("properties", "~pp"),
("attributes", "~at"),
("parameters", "~pm"),
("configuration", "~cf"),
("environment", "~ev"),
("dependencies", "~dp"),
("permissions", "~pr"),
("resources", "~rs"),
("namespace", "~ns"),
("annotations", "~an"),
("error_code", "~ec"),
("error_message", "~em"),
("stack_trace", "~st"),
("exception", "~ex"),
("page_size", "~ps"),
("next_token", "~nt"),
("total_count", "~tc"),
("resource_type", "~rt"),
("account_id", "~ai"),
("region", "~rg"),
];
common_fields
.iter()
.enumerate()
.map(|(_, &(pattern, code))| DictEntry {
pattern: pattern.to_string(),
code: code.to_string(),
frequency: 100, })
.collect()
}
fn extract_json_fields(json_str: &str) -> Vec<String> {
let mut fields = Vec::new();
let bytes = json_str.as_bytes();
let len = bytes.len();
let mut i = 0;
while i < len {
if bytes[i] == b'"' {
let start = i + 1;
i += 1;
while i < len && bytes[i] != b'"' {
if bytes[i] == b'\\' {
i += 1; }
i += 1;
}
if i < len {
let end = i;
i += 1;
while i < len && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\n' || bytes[i] == b'\r') {
i += 1;
}
if i < len && bytes[i] == b':' {
if let Ok(field) = std::str::from_utf8(&bytes[start..end]) {
fields.push(field.to_string());
}
}
}
} else {
i += 1;
}
}
fields
}
fn format_dict_header(entries: &[&DictEntry]) -> String {
let mut lines = vec!["§dict§".to_string()];
for entry in entries {
lines.push(format!("{}={}", entry.code, entry.pattern));
}
lines.push("§/dict§".to_string());
lines.join("\n")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_new_has_builtin_entries() {
let comp = DictCompressor::new();
assert!(comp.dict_size() > 0);
}
#[test]
fn test_compress_with_known_fields() {
let comp = DictCompressor::new();
let json = r#"{"status_code":200,"message":"ok","timestamp":"2024-01-01"}"#;
let result = comp.compress(json).unwrap();
assert!(result.substitutions > 0);
assert!(result.bytes_saved > 0);
assert!(result.has_dict_header);
assert!(result.data.contains("§dict§"));
}
#[test]
fn test_compress_no_known_fields() {
let mut comp = DictCompressor::new();
comp.entries.clear(); let json = r#"{"x":1,"y":2}"#;
let result = comp.compress(json).unwrap();
assert_eq!(result.substitutions, 0);
assert_eq!(result.bytes_saved, 0);
assert!(!result.has_dict_header);
assert_eq!(result.data, json);
}
#[test]
fn test_observe_learns_patterns() {
let mut comp = DictCompressor::new();
let initial_size = comp.dict_size();
let json = r#"{"custom_field_name":1,"another_long_field":2}"#;
comp.observe(json);
comp.observe(json);
assert!(comp.dict_size() > initial_size);
}
#[test]
fn test_observe_ignores_short_fields() {
let mut comp = DictCompressor::new();
let initial_size = comp.dict_size();
let json = r#"{"id":1,"x":2}"#;
comp.observe(json);
comp.observe(json);
assert_eq!(comp.dict_size(), initial_size);
}
#[test]
fn test_extract_json_fields() {
let json = r#"{"name":"Alice","age":30,"nested":{"key":"val"}}"#;
let fields = extract_json_fields(json);
assert!(fields.contains(&"name".to_string()));
assert!(fields.contains(&"age".to_string()));
assert!(fields.contains(&"nested".to_string()));
assert!(fields.contains(&"key".to_string()));
}
#[test]
fn test_extract_json_fields_with_escaped_quotes() {
let json = r#"{"field_with_\"quotes\"":"value","normal":1}"#;
let fields = extract_json_fields(json);
assert!(fields.contains(&"normal".to_string()));
}
#[test]
fn test_reset_clears_learned_entries() {
let mut comp = DictCompressor::new();
let json = r#"{"custom_field_name":1}"#;
comp.observe(json);
comp.observe(json);
let size_after_learn = comp.dict_size();
comp.reset();
assert!(comp.dict_size() <= size_after_learn);
assert!(comp.candidates.is_empty());
}
#[test]
fn test_dict_header_format() {
let entry = DictEntry {
pattern: "status_code".to_string(),
code: "~s".to_string(),
frequency: 10,
};
let entries = vec![&entry];
let header = format_dict_header(&entries);
assert!(header.starts_with("§dict§"));
assert!(header.ends_with("§/dict§"));
assert!(header.contains("~s=status_code"));
}
use proptest::prelude::*;
proptest! {
#[test]
fn prop_compression_does_not_increase_content(
field_name in "[a-z_]{5,15}",
value in "[a-z0-9]{1,20}",
repeat in 2usize..=5usize,
) {
let mut comp = DictCompressor::new();
comp.entries.clear();
let entry = format!("\"{}\":\"{}\"", field_name, value);
let json = format!("{{{}}}", std::iter::repeat(entry.as_str())
.take(repeat)
.collect::<Vec<_>>()
.join(","));
for _ in 0..3 {
comp.observe(&json);
}
let result = comp.compress(&json).unwrap();
if result.substitutions > 0 {
prop_assert!(
result.bytes_saved > 0,
"substitutions made but no bytes saved"
);
}
}
}
}