use anyhow::anyhow;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Read, Write};
use std::path::Path;
use trustformers_core::errors::{Result, TrustformersError};
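// On-disk layout: MAGIC_BYTES ("TFMT"), a little-endian u32 header length,
// the oxicode-encoded `BinaryHeader`, then the (optionally zlib-compressed)
// oxicode-encoded `BinaryTokenizer` payload.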
const BINARY_FORMAT_VERSION: u32 = 1;
const MAGIC_BYTES: &[u8] = b"TFMT";
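/// Per-file metadata written after the magic bytes. It can be read on its
/// own via [`BinarySerializer::get_file_info`] without decoding the payload.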
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BinaryHeader {
pub version: u32,
pub tokenizer_type: String,
pub compression_level: u8,
pub uncompressed_size: u64,
pub compressed_size: u64,
pub checksum: u32,
pub metadata: HashMap<String, String>,
pub created_at: u64,
}
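/// Options for [`BinarySerializer`]. A `compression_level` of 0 stores the
/// payload uncompressed; higher levels trade speed for size (the default is
/// 6). `use_memory_mapping` is reserved and not currently read by this
/// module.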
#[derive(Debug, Clone)]
pub struct BinaryConfig {
pub compression_level: u8,
pub include_metadata: bool,
pub verify_checksums: bool,
pub buffer_size: usize,
pub use_memory_mapping: bool,
}
impl Default for BinaryConfig {
fn default() -> Self {
Self {
compression_level: 6,
include_metadata: true,
verify_checksums: true,
            buffer_size: 64 * 1024,
            use_memory_mapping: false,
}
}
}
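/// Format-agnostic tokenizer state: forward and reverse vocabulary maps,
/// special tokens, optional per-token scores (SentencePiece), optional BPE
/// merges, and optional normalization and pre-tokenization rules.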
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BinaryTokenizer {
pub vocab: HashMap<String, u32>,
pub id_to_token: HashMap<u32, String>,
pub special_tokens: HashMap<String, u32>,
pub scores: Option<HashMap<u32, f32>>,
pub merges: Option<Vec<(String, String)>>,
pub config: HashMap<String, serde_json::Value>,
pub normalization_rules: Option<Vec<NormalizationRule>>,
pub pre_tokenization_rules: Option<Vec<PreTokenizationRule>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NormalizationRule {
pub rule_type: String,
pub parameters: HashMap<String, serde_json::Value>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PreTokenizationRule {
pub rule_type: String,
pub pattern: String,
pub replacement: Option<String>,
}
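/// Writes [`BinaryTokenizer`] values in the container format described above
/// and reads them back.
///
/// A minimal round-trip sketch (marked `ignore`: the import path and an
/// existing `tokenizer` value are assumed):
///
/// ```ignore
/// let serializer = BinarySerializer::new(BinaryConfig::default());
/// let header = serializer.serialize(&tokenizer, "bpe", "tokenizer.bin")?;
/// let (loaded, _header) = serializer.deserialize("tokenizer.bin")?;
/// assert_eq!(loaded.vocab, tokenizer.vocab);
/// ```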
pub struct BinarySerializer {
config: BinaryConfig,
}
impl BinarySerializer {
pub fn new(config: BinaryConfig) -> Self {
Self { config }
}
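    /// Encodes `tokenizer`, optionally compresses it, and writes the full
    /// container to `path`. The CRC32 checksum is computed over the
    /// uncompressed encoding, so readers verify it after decompression.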
pub fn serialize<P: AsRef<Path>>(
&self,
tokenizer: &BinaryTokenizer,
tokenizer_type: &str,
path: P,
) -> Result<BinaryHeader> {
let file = File::create(path.as_ref())
.map_err(|e| TrustformersError::io_error(format!("Failed to create file: {}", e)))?;
let mut writer = BufWriter::with_capacity(self.config.buffer_size, file);
let data =
oxicode::serde::encode_to_vec(tokenizer, oxicode::config::standard()).map_err(|e| {
TrustformersError::serialization_error(format!(
"Failed to serialize tokenizer: {}",
e
))
})?;
        let checksum = crc32fast::hash(&data);
        let uncompressed_size = data.len() as u64;
        let (final_data, compressed_size) = if self.config.compression_level > 0 {
            let compressed = self.compress_data(&data)?;
            let size = compressed.len() as u64;
            (compressed, size)
        } else {
            // Stored verbatim; no copy of the encoded buffer is needed.
            (data, uncompressed_size)
        };
let mut metadata = HashMap::new();
if self.config.include_metadata {
metadata.insert("vocab_size".to_string(), tokenizer.vocab.len().to_string());
metadata.insert(
"has_scores".to_string(),
tokenizer.scores.is_some().to_string(),
);
metadata.insert(
"has_merges".to_string(),
tokenizer.merges.is_some().to_string(),
);
}
let header = BinaryHeader {
version: BINARY_FORMAT_VERSION,
tokenizer_type: tokenizer_type.to_string(),
compression_level: self.config.compression_level,
            uncompressed_size,
compressed_size,
checksum,
metadata,
created_at: std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs(),
};
writer.write_all(MAGIC_BYTES).map_err(|e| {
TrustformersError::io_error(format!("Failed to write magic bytes: {}", e))
})?;
let header_data = oxicode::serde::encode_to_vec(&header, oxicode::config::standard())
.map_err(|e| {
TrustformersError::serialization_error(format!("Failed to serialize header: {}", e))
})?;
let header_size = header_data.len() as u32;
writer.write_all(&header_size.to_le_bytes()).map_err(|e| {
TrustformersError::io_error(format!("Failed to write header size: {}", e))
})?;
writer
.write_all(&header_data)
.map_err(|e| TrustformersError::io_error(format!("Failed to write header: {}", e)))?;
writer.write_all(&final_data).map_err(|e| {
TrustformersError::io_error(format!("Failed to write tokenizer data: {}", e))
})?;
writer
.flush()
.map_err(|e| TrustformersError::io_error(format!("Failed to flush writer: {}", e)))?;
Ok(header)
}
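    /// Reads a container from `path`: validates the magic bytes, rejects
    /// headers newer than [`BINARY_FORMAT_VERSION`], decompresses the payload
    /// if needed, and (when `verify_checksums` is set) checks its CRC32
    /// before decoding.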
pub fn deserialize<P: AsRef<Path>>(&self, path: P) -> Result<(BinaryTokenizer, BinaryHeader)> {
let file = File::open(path.as_ref())
.map_err(|e| TrustformersError::io_error(format!("Failed to open file: {}", e)))?;
let mut reader = BufReader::with_capacity(self.config.buffer_size, file);
let mut magic = [0u8; 4];
reader.read_exact(&mut magic).map_err(|e| {
TrustformersError::io_error(format!("Failed to read magic bytes: {}", e))
})?;
if magic != MAGIC_BYTES {
return Err(trustformers_core::errors::invalid_format(
"TFMT",
String::from_utf8_lossy(&magic).to_string(),
));
}
let mut header_size_bytes = [0u8; 4];
reader.read_exact(&mut header_size_bytes).map_err(|e| {
TrustformersError::io_error(format!("Failed to read header size: {}", e))
})?;
let header_size = u32::from_le_bytes(header_size_bytes) as usize;
let mut header_data = vec![0u8; header_size];
reader
.read_exact(&mut header_data)
.map_err(|e| TrustformersError::io_error(format!("Failed to read header: {}", e)))?;
let (header, _): (BinaryHeader, usize) = oxicode::serde::decode_from_slice(
&header_data,
oxicode::config::standard(),
)
.map_err(|e| {
TrustformersError::serialization_error(format!("Failed to deserialize header: {}", e))
})?;
if header.version > BINARY_FORMAT_VERSION {
return Err(trustformers_core::errors::invalid_format(
BINARY_FORMAT_VERSION.to_string(),
header.version.to_string(),
));
}
let mut data = vec![0u8; header.compressed_size as usize];
reader.read_exact(&mut data).map_err(|e| {
TrustformersError::io_error(format!("Failed to read tokenizer data: {}", e))
})?;
let final_data = if header.compression_level > 0 {
self.decompress_data(&data, header.uncompressed_size as usize)?
} else {
data
};
if self.config.verify_checksums {
let calculated_checksum = crc32fast::hash(&final_data);
if calculated_checksum != header.checksum {
return Err(trustformers_core::errors::invalid_format(
header.checksum.to_string(),
calculated_checksum.to_string(),
));
}
}
let (tokenizer, _): (BinaryTokenizer, usize) =
oxicode::serde::decode_from_slice(&final_data, oxicode::config::standard()).map_err(
|e| {
TrustformersError::serialization_error(format!(
"Failed to deserialize tokenizer: {}",
e
))
},
)?;
Ok((tokenizer, header))
}
fn compress_data(&self, data: &[u8]) -> Result<Vec<u8>> {
use oxiarc_deflate::streaming::ZlibStreamEncoder;
let mut encoder = ZlibStreamEncoder::new(Vec::new(), self.config.compression_level);
encoder.write_all(data).map_err(|e| {
TrustformersError::other(anyhow::anyhow!("Failed to compress data: {}", e).to_string())
})?;
encoder.finish().map_err(|e| {
TrustformersError::other(
anyhow::anyhow!("Failed to finish compression: {}", e).to_string(),
)
})
}
fn decompress_data(&self, compressed_data: &[u8], expected_size: usize) -> Result<Vec<u8>> {
use oxiarc_deflate::streaming::ZlibStreamDecoder;
let mut decoder = ZlibStreamDecoder::new(compressed_data);
let mut decompressed = Vec::with_capacity(expected_size);
decoder.read_to_end(&mut decompressed).map_err(|e| {
TrustformersError::other(
anyhow::anyhow!("Failed to decompress data: {}", e).to_string(),
)
})?;
if decompressed.len() != expected_size {
return Err(TrustformersError::other(
anyhow::anyhow!(
"Decompressed size mismatch: expected {}, got {}",
expected_size,
decompressed.len()
)
.to_string(),
));
}
Ok(decompressed)
}
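    /// Reads only the magic bytes and header, making this a cheap way to
    /// inspect a file without decoding or decompressing the payload.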
pub fn get_file_info<P: AsRef<Path>>(&self, path: P) -> Result<BinaryHeader> {
let file = File::open(path.as_ref())
.map_err(|e| TrustformersError::io_error(format!("Failed to open file: {}", e)))?;
let mut reader = BufReader::new(file);
let mut magic = [0u8; 4];
reader.read_exact(&mut magic).map_err(|e| {
TrustformersError::io_error(format!("Failed to read magic bytes: {}", e))
})?;
if magic != MAGIC_BYTES {
return Err(trustformers_core::errors::invalid_format(
"TFMT",
String::from_utf8_lossy(&magic).to_string(),
));
}
let mut header_size_bytes = [0u8; 4];
reader.read_exact(&mut header_size_bytes).map_err(|e| {
TrustformersError::io_error(format!("Failed to read header size: {}", e))
})?;
let header_size = u32::from_le_bytes(header_size_bytes) as usize;
let mut header_data = vec![0u8; header_size];
reader
.read_exact(&mut header_data)
.map_err(|e| TrustformersError::io_error(format!("Failed to read header: {}", e)))?;
let (header, _): (BinaryHeader, usize) = oxicode::serde::decode_from_slice(
&header_data,
oxicode::config::standard(),
)
.map_err(|e| {
TrustformersError::serialization_error(format!("Failed to deserialize header: {}", e))
})?;
Ok(header)
}
}
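/// Stateless helpers for validating, comparing, and migrating serialized
/// tokenizer files.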
pub struct BinaryUtils;
impl BinaryUtils {
pub fn validate_file<P: AsRef<Path>>(path: P, config: &BinaryConfig) -> Result<bool> {
let serializer = BinarySerializer::new(config.clone());
let header = serializer.get_file_info(path.as_ref())?;
if header.version > BINARY_FORMAT_VERSION {
return Ok(false);
}
if header.compressed_size == 0 || header.uncompressed_size == 0 {
return Ok(false);
}
Ok(true)
}
pub fn compare_files<P: AsRef<Path>>(
path1: P,
path2: P,
config: &BinaryConfig,
) -> Result<bool> {
let serializer = BinarySerializer::new(config.clone());
let header1 = serializer.get_file_info(path1.as_ref())?;
let header2 = serializer.get_file_info(path2.as_ref())?;
Ok(header1.checksum == header2.checksum)
}
pub fn get_compression_ratio<P: AsRef<Path>>(path: P, config: &BinaryConfig) -> Result<f64> {
let serializer = BinarySerializer::new(config.clone());
let header = serializer.get_file_info(path)?;
if header.compression_level == 0 {
return Ok(1.0);
}
Ok(header.uncompressed_size as f64 / header.compressed_size as f64)
}
pub fn migrate_format<P: AsRef<Path>>(
old_path: P,
new_path: P,
config: &BinaryConfig,
) -> Result<BinaryHeader> {
let serializer = BinarySerializer::new(config.clone());
let (tokenizer, old_header) = serializer.deserialize(old_path)?;
let tokenizer_type = &old_header.tokenizer_type;
serializer.serialize(&tokenizer, tokenizer_type, new_path)
}
}
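/// Converts external tokenizer files (`tokenizer.json`-style JSON and
/// SentencePiece `.model`/vocab files) into the binary format.
///
/// An illustrative sketch (marked `ignore`; both paths are placeholders):
///
/// ```ignore
/// let header = TokenizerConverter::from_tokenizer_json(
///     "tokenizer.json",
///     "tokenizer.bin",
///     &BinaryConfig::default(),
/// )?;
/// ```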
pub struct TokenizerConverter;
impl TokenizerConverter {
pub fn from_tokenizer_json<P: AsRef<Path>>(
json_path: P,
binary_path: P,
config: &BinaryConfig,
) -> Result<BinaryHeader> {
let json_content = std::fs::read_to_string(json_path.as_ref())
.map_err(|e| TrustformersError::io_error(format!("Failed to read JSON file: {}", e)))?;
let json_value: serde_json::Value = serde_json::from_str(&json_content).map_err(|e| {
TrustformersError::serialization_error(format!("Failed to parse JSON: {}", e))
})?;
let mut vocab = HashMap::new();
let mut id_to_token = HashMap::new();
if let Some(model) = json_value.get("model") {
if let Some(vocab_obj) = model.get("vocab") {
if let Some(vocab_map) = vocab_obj.as_object() {
for (token, id) in vocab_map {
if let Some(id_num) = id.as_u64() {
let id_u32 = id_num as u32;
vocab.insert(token.clone(), id_u32);
id_to_token.insert(id_u32, token.clone());
}
}
}
}
}
let mut special_tokens = HashMap::new();
if let Some(added_tokens) = json_value.get("added_tokens") {
if let Some(tokens_array) = added_tokens.as_array() {
for token_obj in tokens_array {
if let Some(content) = token_obj.get("content") {
if let Some(id) = token_obj.get("id") {
if let (Some(token_str), Some(id_num)) = (content.as_str(), id.as_u64())
{
special_tokens.insert(token_str.to_string(), id_num as u32);
}
}
}
}
}
}
let merges = if let Some(model) = json_value.get("model") {
if let Some(merges_array) = model.get("merges") {
if let Some(merges_vec) = merges_array.as_array() {
let mut extracted_merges = Vec::new();
for merge in merges_vec {
if let Some(merge_str) = merge.as_str() {
let parts: Vec<&str> = merge_str.split(' ').collect();
if parts.len() == 2 {
extracted_merges.push((parts[0].to_string(), parts[1].to_string()));
}
}
}
Some(extracted_merges)
} else {
None
}
} else {
None
}
} else {
None
};
let binary_tokenizer = BinaryTokenizer {
vocab,
id_to_token,
special_tokens,
            scores: None,
            merges,
config: HashMap::new(),
normalization_rules: None,
pre_tokenization_rules: None,
};
let tokenizer_type = if let Some(model) = json_value.get("model") {
if let Some(type_str) = model.get("type") {
type_str.as_str().unwrap_or("unknown").to_string()
} else {
"unknown".to_string()
}
} else {
"unknown".to_string()
};
let serializer = BinarySerializer::new(config.clone());
serializer.serialize(&binary_tokenizer, &tokenizer_type, binary_path)
}
pub fn from_sentencepiece<P: AsRef<Path>>(
sp_path: P,
binary_path: P,
config: &BinaryConfig,
) -> Result<BinaryHeader> {
let sp_path = sp_path.as_ref();
let (vocab, id_to_token, special_tokens, scores, sp_config) =
Self::load_sentencepiece_model(sp_path)?;
let binary_tokenizer = BinaryTokenizer {
vocab,
id_to_token,
special_tokens,
scores: Some(scores),
            merges: None,
            config: sp_config
                .into_iter()
                .map(|(k, v)| (k, serde_json::Value::String(v)))
                .collect(),
normalization_rules: Some(Self::extract_normalization_rules()),
pre_tokenization_rules: Some(Self::extract_pre_tokenization_rules()),
};
let serializer = BinarySerializer::new(config.clone());
serializer.serialize(&binary_tokenizer, "sentencepiece", binary_path)
}
fn load_sentencepiece_model<P: AsRef<Path>>(
sp_path: P,
) -> Result<(
HashMap<String, u32>,
HashMap<u32, String>,
HashMap<String, u32>,
HashMap<u32, f32>,
HashMap<String, String>,
)> {
let sp_path = sp_path.as_ref();
if sp_path.extension().and_then(|s| s.to_str()) == Some("model") {
Self::load_sentencepiece_protobuf(sp_path)
} else {
Self::load_sentencepiece_vocab(sp_path)
}
}
fn load_sentencepiece_protobuf<P: AsRef<Path>>(
model_path: P,
) -> Result<(
HashMap<String, u32>,
HashMap<u32, String>,
HashMap<String, u32>,
HashMap<u32, f32>,
HashMap<String, String>,
)> {
let mut file = File::open(model_path).map_err(|e| {
TrustformersError::other(
anyhow!("Failed to open SentencePiece model file: {}", e).to_string(),
)
})?;
let mut buffer = Vec::new();
file.read_to_end(&mut buffer).map_err(|e| {
TrustformersError::other(
anyhow!("Failed to read SentencePiece model file: {}", e).to_string(),
)
})?;
Self::parse_sentencepiece_protobuf(&buffer)
}
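    /// Best-effort extraction from a `.model` file: rather than decoding the
    /// protobuf schema, this seeds the four standard control tokens and then
    /// scans the raw bytes for printable runs that look like tokens.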
fn parse_sentencepiece_protobuf(
data: &[u8],
) -> Result<(
HashMap<String, u32>,
HashMap<u32, String>,
HashMap<String, u32>,
HashMap<u32, f32>,
HashMap<String, String>,
)> {
let mut vocab = HashMap::new();
let mut id_to_token = HashMap::new();
let mut special_tokens = HashMap::new();
let mut scores = HashMap::new();
let mut config = HashMap::new();
let standard_tokens = vec![
("<unk>", 0, -100.0, true),
("<s>", 1, -1.0, true),
("</s>", 2, -1.0, true),
("<pad>", 3, -1.0, true),
];
for (token, id, score, is_special) in standard_tokens {
vocab.insert(token.to_string(), id);
id_to_token.insert(id, token.to_string());
scores.insert(id, score);
if is_special {
special_tokens.insert(token.to_string(), id);
}
}
let mut current_id = 4;
let mut i = 0;
while i < data.len() {
if let Some(token_data) = Self::extract_token_from_protobuf(data, &mut i) {
let (token, score) = token_data;
if !vocab.contains_key(&token) {
vocab.insert(token.clone(), current_id);
id_to_token.insert(current_id, token.clone());
scores.insert(current_id, score);
current_id += 1;
}
} else {
i += 1;
}
}
config.insert("model_type".to_string(), "sentencepiece".to_string());
config.insert("vocab_size".to_string(), vocab.len().to_string());
config.insert("normalization".to_string(), "nfkc".to_string());
config.insert("add_dummy_prefix".to_string(), "true".to_string());
Ok((vocab, id_to_token, special_tokens, scores, config))
}
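    /// Scans forward from `*pos` for a printable run of at most 50 bytes; on
    /// success returns the token with a heuristic score and advances `*pos`
    /// past it. On failure the cursor is left for the caller to advance.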
fn extract_token_from_protobuf(data: &[u8], pos: &mut usize) -> Option<(String, f32)> {
if *pos >= data.len() {
return None;
}
let start = *pos;
let mut end = start;
while end < data.len() && end < start + 50 {
if data[end] == 0
|| (data[end] < 32 && data[end] != 9 && data[end] != 10 && data[end] != 13)
{
break;
}
end += 1;
}
if end > start {
if let Ok(token) = String::from_utf8(data[start..end].to_vec()) {
let clean_token = token.trim().to_string();
if !clean_token.is_empty() && Self::is_valid_token(&clean_token) {
*pos = end + 1;
let score = Self::estimate_token_score(&clean_token);
return Some((clean_token, score));
}
}
}
        // No valid token here; the calling loop advances the cursor by one
        // byte, so advancing it here as well would skip candidate starts.
        None
}
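    /// Parses a text vocab file with one `token<TAB>score` (or
    /// whitespace-separated) entry per line. A token's ID is its zero-based
    /// line index, and `<...>`-wrapped tokens are treated as special.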
fn load_sentencepiece_vocab<P: AsRef<Path>>(
vocab_path: P,
) -> Result<(
HashMap<String, u32>,
HashMap<u32, String>,
HashMap<String, u32>,
HashMap<u32, f32>,
HashMap<String, String>,
)> {
let file = File::open(vocab_path).map_err(|e| {
TrustformersError::other(
anyhow!("Failed to open SentencePiece vocab file: {}", e).to_string(),
)
})?;
let reader = BufReader::new(file);
let mut vocab = HashMap::new();
let mut id_to_token = HashMap::new();
let mut special_tokens = HashMap::new();
let mut scores = HashMap::new();
let mut config = HashMap::new();
for (line_num, line) in reader.lines().enumerate() {
let line = line.map_err(|e| {
TrustformersError::other(
anyhow!("Failed to read line {}: {}", line_num, e).to_string(),
)
})?;
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
let parts: Vec<&str> = if line.contains('\t') {
line.split('\t').collect()
} else {
line.split_whitespace().collect()
};
if parts.is_empty() {
continue;
}
let token = parts[0].to_string();
let score = if parts.len() > 1 {
parts[1].parse::<f32>().unwrap_or(0.0)
} else {
Self::estimate_token_score(&token)
};
let id = line_num as u32;
vocab.insert(token.clone(), id);
id_to_token.insert(id, token.clone());
scores.insert(id, score);
if token.starts_with('<') && token.ends_with('>') {
special_tokens.insert(token, id);
}
}
config.insert("model_type".to_string(), "sentencepiece".to_string());
config.insert("vocab_size".to_string(), vocab.len().to_string());
config.insert("normalization".to_string(), "nfkc".to_string());
Ok((vocab, id_to_token, special_tokens, scores, config))
}
    fn is_valid_token(token: &str) -> bool {
        // Accept short, non-blank candidates that carry no control characters.
        token.len() <= 100
            && token.chars().any(|c| !c.is_whitespace())
            && token.chars().all(|c| !c.is_control())
    }
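    /// Heuristic log-probability-style scores: `<unk>` scores lowest, control
    /// tokens low, and ordinary pieces are penalized by length.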
    fn estimate_token_score(token: &str) -> f32 {
        match token {
            "<unk>" => -100.0,
            "<s>" | "</s>" | "<pad>" => -1.0,
            _ if token.starts_with('<') && token.ends_with('>') => -10.0,
            _ if token.starts_with("▁") => -5.0 + (token.len() as f32 * -0.1),
            _ if token.len() == 1 => -2.0,
            _ if token.len() <= 3 => -3.0 + (token.len() as f32 * -0.2),
            _ => -5.0 + (token.len() as f32 * -0.1),
        }
    }
fn extract_normalization_rules() -> Vec<NormalizationRule> {
vec![
NormalizationRule {
rule_type: "NFKC".to_string(),
parameters: {
let mut params = HashMap::new();
params.insert(
"pattern".to_string(),
serde_json::Value::String(".*".to_string()),
);
params.insert(
"replacement".to_string(),
serde_json::Value::String("NFKC_NORMALIZED".to_string()),
);
params.insert("regex".to_string(), serde_json::Value::Bool(false));
params
},
},
NormalizationRule {
rule_type: "RemoveExtraSpaces".to_string(),
parameters: {
let mut params = HashMap::new();
params.insert(
"pattern".to_string(),
serde_json::Value::String(r"\s+".to_string()),
);
params.insert(
"replacement".to_string(),
serde_json::Value::String(" ".to_string()),
);
params.insert("regex".to_string(), serde_json::Value::Bool(true));
params
},
},
]
}
fn extract_pre_tokenization_rules() -> Vec<PreTokenizationRule> {
vec![
PreTokenizationRule {
rule_type: "AddDummyPrefix".to_string(),
pattern: "^".to_string(),
replacement: Some("▁".to_string()),
},
PreTokenizationRule {
rule_type: "SpaceReplacement".to_string(),
pattern: " ".to_string(),
replacement: Some("▁".to_string()),
},
]
}
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::tempdir;
fn create_test_tokenizer() -> BinaryTokenizer {
let mut vocab = HashMap::new();
let mut id_to_token = HashMap::new();
let mut special_tokens = HashMap::new();
vocab.insert("hello".to_string(), 0);
vocab.insert("world".to_string(), 1);
vocab.insert("<pad>".to_string(), 2);
id_to_token.insert(0, "hello".to_string());
id_to_token.insert(1, "world".to_string());
id_to_token.insert(2, "<pad>".to_string());
special_tokens.insert("<pad>".to_string(), 2);
BinaryTokenizer {
vocab,
id_to_token,
special_tokens,
scores: None,
merges: None,
config: HashMap::new(),
normalization_rules: None,
pre_tokenization_rules: None,
}
}
#[test]
fn test_serialize_deserialize() {
        let temp_dir = tempdir().expect("failed to create temp dir");
let file_path = temp_dir.path().join("test_tokenizer.bin");
let config = BinaryConfig::default();
let serializer = BinarySerializer::new(config);
let tokenizer = create_test_tokenizer();
        let header = serializer
            .serialize(&tokenizer, "test", &file_path)
            .expect("serialization failed");
assert_eq!(header.tokenizer_type, "test");
assert_eq!(header.version, BINARY_FORMAT_VERSION);
        let (loaded_tokenizer, loaded_header) = serializer
            .deserialize(&file_path)
            .expect("deserialization failed");
assert_eq!(loaded_tokenizer.vocab, tokenizer.vocab);
assert_eq!(loaded_tokenizer.id_to_token, tokenizer.id_to_token);
assert_eq!(loaded_header.tokenizer_type, "test");
}
#[test]
fn test_compression() {
        let temp_dir = tempdir().expect("failed to create temp dir");
let file_path = temp_dir.path().join("test_compressed.bin");
let config = BinaryConfig {
compression_level: 9,
..Default::default()
};
let serializer = BinarySerializer::new(config);
let tokenizer = create_test_tokenizer();
        let header = serializer
            .serialize(&tokenizer, "test", &file_path)
            .expect("serialization failed");
assert!(header.compressed_size < header.uncompressed_size);
assert_eq!(header.compression_level, 9);
        let (loaded_tokenizer, _) = serializer
            .deserialize(&file_path)
            .expect("deserialization failed");
assert_eq!(loaded_tokenizer.vocab, tokenizer.vocab);
}
#[test]
fn test_file_info() {
        let temp_dir = tempdir().expect("failed to create temp dir");
let file_path = temp_dir.path().join("test_info.bin");
let config = BinaryConfig::default();
let serializer = BinarySerializer::new(config);
let tokenizer = create_test_tokenizer();
        let original_header = serializer
            .serialize(&tokenizer, "test", &file_path)
            .expect("serialization failed");
        let info_header = serializer
            .get_file_info(&file_path)
            .expect("reading file info failed");
assert_eq!(info_header.tokenizer_type, original_header.tokenizer_type);
assert_eq!(info_header.checksum, original_header.checksum);
}
#[test]
fn test_validation() {
        let temp_dir = tempdir().expect("failed to create temp dir");
let file_path = temp_dir.path().join("test_validate.bin");
let config = BinaryConfig::default();
let serializer = BinarySerializer::new(config.clone());
let tokenizer = create_test_tokenizer();
        serializer
            .serialize(&tokenizer, "test", &file_path)
            .expect("serialization failed");
        assert!(BinaryUtils::validate_file(&file_path, &config).expect("validation failed"));
}
#[test]
fn test_compression_ratio() {
        let temp_dir = tempdir().expect("failed to create temp dir");
let file_path = temp_dir.path().join("test_ratio.bin");
let config = BinaryConfig {
compression_level: 6,
..Default::default()
};
let serializer = BinarySerializer::new(config.clone());
let tokenizer = create_test_tokenizer();
        serializer
            .serialize(&tokenizer, "test", &file_path)
            .expect("serialization failed");
        let ratio = BinaryUtils::get_compression_ratio(&file_path, &config)
            .expect("computing compression ratio failed");
        assert!(ratio > 1.0);
    }
}