use std::collections::HashMap;
use crate::vwi::encode_vwi_inv;
/// Maximum number of bytes of a string that `CncxBuilder::add` stores; longer
/// inputs are cut back to the nearest UTF-8 character boundary at or below
/// this length.
pub const MAX_STRING_LENGTH: usize = 500;
/// Maximum unpadded size, in bytes, that a record may reach before
/// `CncxBuilder::add` starts a new record.
// NOTE(review): 0x10000 equals the per-record stride `add` uses when composing
// offsets; the 1024-byte headroom is presumably a format/tooling convention —
// confirm against the format reference.
pub const RECORD_LIMIT: usize = 0x10000 - 1024;
/// Accumulates length-prefixed strings into a sequence of byte records and
/// remembers the offset assigned to each distinct input string.
#[derive(Debug)]
pub struct CncxBuilder {
    /// Raw bytes of every record, in order; entries are appended to the last one.
    records: Vec<Vec<u8>>,
    /// Maps each input string to the offset that was assigned to it.
    dedup: HashMap<String, u32>,
}

impl Default for CncxBuilder {
    /// Builds the same initial state as `CncxBuilder::new`: one empty record
    /// and an empty dedup table.
    ///
    /// A derived `Default` would leave `records` empty, which breaks the
    /// "there is always a current record" invariant that appending relies on.
    fn default() -> Self {
        Self {
            records: vec![Vec::new()],
            dedup: HashMap::new(),
        }
    }
}
impl CncxBuilder {
    /// Creates a builder holding one empty record and no interned strings.
    pub fn new() -> Self {
        Self {
            records: vec![Vec::new()],
            dedup: HashMap::new(),
        }
    }

    /// Returns the number of distinct input strings added so far.
    // NOTE(review): presumably only used by tests/diagnostics, hence the
    // dead-code allowance — confirm before removing it.
    #[allow(dead_code)]
    pub fn len(&self) -> usize {
        self.dedup.len()
    }

    /// Returns `true` when no string has been added yet.
    // Companion to `len`; same dead-code allowance as `len`.
    #[allow(dead_code)]
    pub fn is_empty(&self) -> bool {
        self.dedup.is_empty()
    }

    /// Stores `s` and returns the offset assigned to it.
    ///
    /// If exactly the same `s` was added before, the previously assigned
    /// offset is returned and nothing new is stored. Otherwise the string is
    /// truncated to at most [`MAX_STRING_LENGTH`] bytes (never splitting a
    /// UTF-8 character), prefixed with `encode_vwi_inv(byte_length)` and
    /// appended to the current record. A new record is started when the entry
    /// would push the current record past [`RECORD_LIMIT`], or when the
    /// builder holds no record yet.
    ///
    /// The returned offset is `record_index * 0x10000 + byte_offset_in_record`.
    /// The multiplication only fits in `u32` while the record index stays
    /// below `0x10000`.
    ///
    /// Deduplication is keyed on the untruncated input, so two different long
    /// strings sharing the same truncated prefix are stored separately.
    pub fn add(&mut self, s: &str) -> u32 {
        /// Distance between the offset ranges of two consecutive records.
        const RECORD_STRIDE: u32 = 0x10000;

        if let Some(&offset) = self.dedup.get(s) {
            return offset;
        }

        // Cap the stored text by byte length, backing up to a char boundary.
        // Index 0 is always a boundary, so the loop cannot underflow.
        let text = if s.len() > MAX_STRING_LENGTH {
            let mut cut = MAX_STRING_LENGTH;
            while !s.is_char_boundary(cut) {
                cut -= 1;
            }
            &s[..cut]
        } else {
            s
        };
        let bytes = text.as_bytes();
        // Lossless cast: `bytes.len()` is at most MAX_STRING_LENGTH here.
        let len_prefix = encode_vwi_inv(bytes.len() as u32);
        let entry_size = len_prefix.len() + bytes.len();

        // Start a record when none exists yet (instead of panicking) or when
        // this entry would overflow the current one.
        let needs_new_record = self
            .records
            .last()
            .map_or(true, |rec| rec.len() + entry_size > RECORD_LIMIT);
        if needs_new_record {
            self.records.push(Vec::new());
        }

        // `records` is guaranteed non-empty at this point.
        let record_index = self.records.len() - 1;
        let current = &mut self.records[record_index];
        let byte_offset = current.len() as u32;
        current.extend_from_slice(&len_prefix);
        current.extend_from_slice(bytes);

        let cncx_offset = (record_index as u32) * RECORD_STRIDE + byte_offset;
        self.dedup.insert(s.to_string(), cncx_offset);
        cncx_offset
    }

    /// Consumes the builder and returns its non-empty records, each padded
    /// with zero bytes up to a multiple of 4 bytes.
    pub fn into_records(self) -> Vec<Vec<u8>> {
        self.records
            .into_iter()
            .filter(|rec| !rec.is_empty())
            .map(|mut rec| {
                let padding = (4 - rec.len() % 4) % 4;
                rec.resize(rec.len() + padding, 0);
                rec
            })
            .collect()
    }

    /// Returns how many records currently contain at least one byte.
    // NOTE(review): same dead-code allowance as `len` — confirm it is still needed.
    #[allow(dead_code)]
    pub fn record_count(&self) -> usize {
        self.records.iter().filter(|r| !r.is_empty()).count()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A builder that never received a string yields no records at all.
    #[test]
    fn empty_builder_emits_no_records() {
        let b = CncxBuilder::new();
        assert_eq!(b.into_records().len(), 0);
    }

    /// The first string lands at record 0, byte 0.
    #[test]
    fn single_string_offset_is_zero() {
        let mut b = CncxBuilder::new();
        let off = b.add("P-//*[@aid='0']");
        assert_eq!(off, 0);
    }

    /// Re-adding a string returns its original offset; a different string
    /// gets a different one.
    #[test]
    fn dedup_returns_same_offset() {
        let mut b = CncxBuilder::new();
        let a = b.add("foo");
        let c = b.add("bar");
        let a2 = b.add("foo");
        assert_eq!(a, a2);
        assert_ne!(a, c);
    }

    /// Enough data to spill past the first record must show up in the upper
    /// 16 bits of the returned offset.
    #[test]
    fn offset_format_includes_record_index_shift() {
        let mut b = CncxBuilder::new();
        let long = "x".repeat(400);
        for i in 0..200 {
            b.add(&format!("{}{}", long, i));
        }
        let late = b.add("last_one");
        assert!(late >> 16 >= 1, "late offset {} should include record index >=1", late);
    }

    /// An over-long string is stored truncated, not in full.
    #[test]
    fn string_length_truncated_at_500() {
        let long = "a".repeat(800);
        let mut b = CncxBuilder::new();
        b.add(&long);
        let recs = b.into_records();
        assert_eq!(recs.len(), 1);
        // Lower bound: the 500 payload bytes plus a non-empty length prefix.
        assert!(recs[0].len() >= 500);
        // Upper bound: without this the test would also pass if nothing were
        // truncated. The exact prefix size depends on the VWI encoder, so the
        // bound is the untruncated input length.
        assert!(
            recs[0].len() < long.len(),
            "record holds {} bytes, expected fewer than the {}-byte input",
            recs[0].len(),
            long.len()
        );
        // Emitted records are padded to a multiple of 4 bytes.
        assert_eq!(recs[0].len() % 4, 0);
    }
}