use crate::error::CodecError;
const MAX_SYMBOLS: usize = 255;
const MAX_SYMBOL_LEN: usize = 8;
const ESCAPE: u8 = 255;
const TRAINING_ROUNDS: usize = 5;
#[derive(Debug, Clone)]
struct SymbolTable {
symbols: Vec<Vec<u8>>,
}
impl SymbolTable {
fn train(strings: &[&[u8]]) -> Self {
if strings.is_empty() {
return Self {
symbols: Vec::new(),
};
}
let mut symbols: Vec<Vec<u8>> = Vec::new();
let mut symbol_set: std::collections::HashSet<Vec<u8>> = std::collections::HashSet::new();
let mut candidates: std::collections::HashMap<Vec<u8>, usize> =
std::collections::HashMap::new();
for _round in 0..TRAINING_ROUNDS {
candidates.clear();
for s in strings {
let mut pos = 0;
while pos < s.len() {
let existing_match = longest_symbol_match(&symbols, s, pos);
if existing_match > 0 {
pos += existing_match;
continue;
}
for len in 1..=MAX_SYMBOL_LEN.min(s.len() - pos) {
let ngram = &s[pos..pos + len];
*candidates.entry(ngram.to_vec()).or_insert(0) += 1;
}
pos += 1;
}
}
if candidates.is_empty() {
break;
}
let mut scored: Vec<(Vec<u8>, usize)> = candidates
.drain()
.map(|(ngram, freq)| {
let gain = freq * (ngram.len().saturating_sub(1));
(ngram, gain)
})
.filter(|(_, gain)| *gain > 0)
.collect();
scored.sort_by_key(|a| std::cmp::Reverse(a.1));
for (ngram, _) in scored {
if symbols.len() >= MAX_SYMBOLS {
break;
}
if symbol_set.insert(ngram.clone()) {
symbols.push(ngram);
}
}
}
symbols.sort_by_key(|a| std::cmp::Reverse(a.len()));
Self { symbols }
}
fn symbol_count(&self) -> usize {
self.symbols.len()
}
}
fn longest_symbol_match(symbols: &[Vec<u8>], data: &[u8], pos: usize) -> usize {
let remaining = &data[pos..];
for sym in symbols {
if remaining.starts_with(sym) {
return sym.len();
}
}
0
}
pub fn encode(strings: &[&[u8]]) -> Vec<u8> {
let table = SymbolTable::train(strings);
let mut encoded_strings: Vec<Vec<u8>> = Vec::with_capacity(strings.len());
for s in strings {
encoded_strings.push(encode_string(&table, s));
}
let mut out = Vec::new();
out.extend_from_slice(&(table.symbol_count() as u16).to_le_bytes());
for sym in &table.symbols {
out.push(sym.len() as u8);
out.extend_from_slice(sym);
}
let total_encoded: usize = encoded_strings.iter().map(|s| s.len()).sum();
out.extend_from_slice(&(total_encoded as u32).to_le_bytes());
out.extend_from_slice(&(strings.len() as u32).to_le_bytes());
let mut offset = 0u32;
for es in &encoded_strings {
offset += es.len() as u32;
out.extend_from_slice(&offset.to_le_bytes());
}
for es in &encoded_strings {
out.extend_from_slice(es);
}
out
}
pub fn decode(data: &[u8]) -> Result<Vec<Vec<u8>>, CodecError> {
if data.len() < 2 {
return Err(CodecError::Truncated {
expected: 2,
actual: data.len(),
});
}
let sym_count = u16::from_le_bytes([data[0], data[1]]) as usize;
let mut pos = 2;
let mut symbols: Vec<Vec<u8>> = Vec::with_capacity(sym_count);
for _ in 0..sym_count {
if pos >= data.len() {
return Err(CodecError::Truncated {
expected: pos + 1,
actual: data.len(),
});
}
let len = data[pos] as usize;
pos += 1;
if pos + len > data.len() {
return Err(CodecError::Truncated {
expected: pos + len,
actual: data.len(),
});
}
symbols.push(data[pos..pos + len].to_vec());
pos += len;
}
if pos + 8 > data.len() {
return Err(CodecError::Truncated {
expected: pos + 8,
actual: data.len(),
});
}
let _total_encoded =
u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
pos += 4;
let string_count =
u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]]) as usize;
pos += 4;
let offsets_size = string_count * 4;
if pos + offsets_size > data.len() {
return Err(CodecError::Truncated {
expected: pos + offsets_size,
actual: data.len(),
});
}
let mut offsets = Vec::with_capacity(string_count);
for i in 0..string_count {
let off_pos = pos + i * 4;
offsets.push(u32::from_le_bytes([
data[off_pos],
data[off_pos + 1],
data[off_pos + 2],
data[off_pos + 3],
]) as usize);
}
pos += offsets_size;
let encoded_data = &data[pos..];
let mut result = Vec::with_capacity(string_count);
let mut prev_end = 0;
for &end in &offsets {
if end > encoded_data.len() {
return Err(CodecError::Truncated {
expected: pos + end,
actual: data.len(),
});
}
let encoded_str = &encoded_data[prev_end..end];
result.push(decode_string(&symbols, encoded_str)?);
prev_end = end;
}
Ok(result)
}
pub fn encode_delimited(data: &[u8], delimiter: u8) -> Vec<u8> {
let strings: Vec<&[u8]> = data.split(|&b| b == delimiter).collect();
encode(&strings)
}
pub fn decode_delimited(data: &[u8], delimiter: u8) -> Result<Vec<u8>, CodecError> {
let strings = decode(data)?;
let mut out = Vec::new();
for (i, s) in strings.iter().enumerate() {
if i > 0 {
out.push(delimiter);
}
out.extend_from_slice(s);
}
Ok(out)
}
fn encode_string(table: &SymbolTable, input: &[u8]) -> Vec<u8> {
let mut out = Vec::with_capacity(input.len());
let mut pos = 0;
while pos < input.len() {
let mut matched = false;
for (idx, sym) in table.symbols.iter().enumerate() {
if input[pos..].starts_with(sym) {
out.push(idx as u8);
pos += sym.len();
matched = true;
break;
}
}
if !matched {
out.push(ESCAPE);
out.push(input[pos]);
pos += 1;
}
}
out
}
fn decode_string(symbols: &[Vec<u8>], encoded: &[u8]) -> Result<Vec<u8>, CodecError> {
let mut out = Vec::with_capacity(encoded.len() * 2);
let mut pos = 0;
while pos < encoded.len() {
let byte = encoded[pos];
pos += 1;
if byte == ESCAPE {
if pos >= encoded.len() {
return Err(CodecError::Corrupt {
detail: "FSST escape at end of encoded data".into(),
});
}
out.push(encoded[pos]);
pos += 1;
} else {
let idx = byte as usize;
if idx >= symbols.len() {
return Err(CodecError::Corrupt {
detail: format!(
"FSST symbol index {idx} out of range (max {})",
symbols.len()
),
});
}
out.extend_from_slice(&symbols[idx]);
}
}
Ok(out)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn empty_input() {
let encoded = encode(&[]);
let decoded = decode(&encoded).unwrap();
assert!(decoded.is_empty());
}
#[test]
fn single_string() {
let strings: Vec<&[u8]> = vec![b"hello world"];
let encoded = encode(&strings);
let decoded = decode(&encoded).unwrap();
assert_eq!(decoded.len(), 1);
assert_eq!(decoded[0], b"hello world");
}
#[test]
fn multiple_strings_roundtrip() {
let strings: Vec<&[u8]> = vec![
b"us-east-1",
b"us-east-2",
b"us-west-1",
b"eu-west-1",
b"us-east-1",
b"us-east-1",
];
let encoded = encode(&strings);
let decoded = decode(&encoded).unwrap();
assert_eq!(decoded.len(), strings.len());
for (a, b) in strings.iter().zip(decoded.iter()) {
assert_eq!(*a, b.as_slice());
}
}
#[test]
fn repetitive_log_lines() {
let lines: Vec<&[u8]> = (0..1000)
.map(|i| {
let s: &[u8] = match i % 5 {
0 => b"2024-01-15 INFO server.handler request_id=abc method=GET status=200",
1 => b"2024-01-15 INFO server.handler request_id=def method=POST status=201",
2 => b"2024-01-15 WARN server.handler request_id=ghi method=GET status=404",
3 => b"2024-01-15 ERROR server.handler request_id=jkl method=PUT status=500",
_ => b"2024-01-15 DEBUG server.handler request_id=mno method=GET status=200",
};
s
})
.collect();
let encoded = encode(&lines);
let decoded = decode(&encoded).unwrap();
assert_eq!(decoded.len(), lines.len());
for (a, b) in lines.iter().zip(decoded.iter()) {
assert_eq!(*a, b.as_slice());
}
let raw_size: usize = lines.iter().map(|s| s.len()).sum();
let ratio = raw_size as f64 / encoded.len() as f64;
assert!(
ratio > 1.5,
"FSST should compress repetitive logs >1.5x, got {ratio:.1}x"
);
}
#[test]
fn hostnames() {
let hosts: Vec<&[u8]> = vec![
b"prod-web-01.us-east-1.example.com",
b"prod-web-02.us-east-1.example.com",
b"prod-web-03.us-east-1.example.com",
b"prod-api-01.us-west-2.example.com",
b"prod-api-02.us-west-2.example.com",
b"staging-web-01.eu-west-1.example.com",
];
let encoded = encode(&hosts);
let decoded = decode(&encoded).unwrap();
for (a, b) in hosts.iter().zip(decoded.iter()) {
assert_eq!(*a, b.as_slice());
}
}
#[test]
fn binary_data() {
let data: Vec<&[u8]> = vec![&[0, 1, 2, 3, 4, 255, 254, 253]];
let encoded = encode(&data);
let decoded = decode(&encoded).unwrap();
assert_eq!(decoded[0], data[0]);
}
#[test]
fn empty_strings() {
let strings: Vec<&[u8]> = vec![b"", b"hello", b"", b"world", b""];
let encoded = encode(&strings);
let decoded = decode(&encoded).unwrap();
assert_eq!(decoded.len(), 5);
assert!(decoded[0].is_empty());
assert_eq!(decoded[1], b"hello");
assert!(decoded[2].is_empty());
}
#[test]
fn delimited_roundtrip() {
let data = b"line one\nline two\nline three\nline one\nline two";
let encoded = encode_delimited(data, b'\n');
let decoded = decode_delimited(&encoded, b'\n').unwrap();
assert_eq!(decoded, data);
}
#[test]
fn compression_ratio_structured_logs() {
let mut lines: Vec<Vec<u8>> = Vec::new();
for i in 0..5000 {
let line = format!(
"2024-01-15T10:30:{:02}.000Z INFO server.handler request_id={} method=GET path=/api/v1/metrics status=200 duration_ms={}",
i % 60,
10000 + i,
i * 3 + 1
);
lines.push(line.into_bytes());
}
let refs: Vec<&[u8]> = lines.iter().map(|l| l.as_slice()).collect();
let encoded = encode(&refs);
let decoded = decode(&encoded).unwrap();
assert_eq!(decoded.len(), lines.len());
let raw_size: usize = lines.iter().map(|s| s.len()).sum();
let ratio = raw_size as f64 / encoded.len() as f64;
assert!(
ratio > 1.5,
"FSST should compress structured logs >1.5x, got {ratio:.1}x"
);
}
#[test]
fn truncated_input_errors() {
assert!(decode(&[]).is_err());
assert!(decode(&[1]).is_err());
}
#[test]
fn large_dataset() {
let mut strings: Vec<Vec<u8>> = Vec::new();
for i in 0..10_000 {
strings.push(format!("key-{}-value-{}", i % 100, i % 50).into_bytes());
}
let refs: Vec<&[u8]> = strings.iter().map(|s| s.as_slice()).collect();
let encoded = encode(&refs);
let decoded = decode(&encoded).unwrap();
assert_eq!(decoded.len(), strings.len());
for (a, b) in strings.iter().zip(decoded.iter()) {
assert_eq!(a.as_slice(), b.as_slice());
}
}
}