use std::io::{BufReader, BufWriter, Write};
use std::path::Path;
use libdictenstein::persistent_artrie_char::PersistentARTrieChar;
use pathmap::paths_serialization::{for_each_deserialized_path, serialize_paths_with_auxdata};
use pathmap::PathMap;
/// Returns `true` unless `key` begins with a NUL (`U+0000`) character.
///
/// Callers in this file use it to skip trie entries whose key is prefixed
/// with NUL when iterating the source trie.
#[inline]
fn is_valid_entry(key: &str) -> bool {
    // U+0000 is encoded in UTF-8 as the single byte 0, so inspecting the
    // first byte is equivalent to a character-level prefix test.
    key.as_bytes().first() != Some(&0)
}
/// Summary of a completed ARTrie-to-PathMap translation.
#[derive(Clone, Debug, Default)]
pub struct TranslationStats {
/// Number of source entries written to the PathMap output.
pub entries_translated: u64,
/// Size in bytes of the source ARTrie file, as reported by file metadata.
pub artrie_size_bytes: u64,
/// Combined size in bytes of the paths file and the sidecar values file.
pub pathmap_size_bytes: u64,
/// `artrie_size_bytes / pathmap_size_bytes`, or `0.0` when the output size is zero.
pub compression_ratio: f64,
/// Wall-clock duration of the translation, in seconds.
pub elapsed_seconds: f64,
/// Peak memory usage in bytes. The translator in this file always sets it to `0`.
pub peak_memory_bytes: u64,
}
/// Errors returned by [`PathMapTranslator`] operations.
#[derive(Debug, thiserror::Error)]
pub enum TranslationError {
/// A required input file does not exist; the payload is the path as displayed.
///
/// Used for the source ARTrie and, during verification, for the PathMap
/// paths file and its values file as well.
#[error("Source model not found: {0}")]
SourceNotFound(String),
/// The source trie yielded no entries that passed the key filter.
#[error("Source model is empty")]
EmptySource,
/// An underlying I/O failure, including a failure to open the source trie.
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),
/// The PathMap path serializer reported an error.
#[error("Serialization error: {0}")]
Serialization(String),
/// PathMap construction failed.
// NOTE(review): no code visible in this file constructs this variant.
#[error("PathMap construction error: {0}")]
Construction(String),
}
/// Stateless namespace for translating a persistent ARTrie into PathMap
/// files and for verifying the translated output against its source.
pub struct PathMapTranslator;
impl PathMapTranslator {
pub fn translate<P1: AsRef<Path>, P2: AsRef<Path>>(
artrie_path: P1,
pathmap_path: P2,
) -> Result<TranslationStats, TranslationError> {
use std::time::Instant;
let start = Instant::now();
let artrie_path = artrie_path.as_ref();
let pathmap_path = pathmap_path.as_ref();
if !artrie_path.exists() {
return Err(TranslationError::SourceNotFound(
artrie_path.display().to_string(),
));
}
let artrie_size = std::fs::metadata(artrie_path)?.len();
log::info!("Translating {:?} to {:?}", artrie_path, pathmap_path);
let trie: PersistentARTrieChar<u64> =
PersistentARTrieChar::open(artrie_path).map_err(|e| {
TranslationError::Io(std::io::Error::other(format!("Failed to open trie: {}", e)))
})?;
let mut entries: Vec<(String, u64)> = Vec::new();
let mut entries_count = 0u64;
for (key, value) in trie.iter_with_values() {
if is_valid_entry(&key) {
entries.push((key, value));
entries_count += 1;
}
}
if entries.is_empty() {
return Err(TranslationError::EmptySource);
}
log::info!("Collected {} entries, building PathMap...", entries.len());
let mut pathmap: PathMap<u64> = PathMap::new();
for (key, value) in &entries {
pathmap.insert(key.as_bytes(), *value);
}
log::info!("PathMap built, serializing to {:?}", pathmap_path);
let paths_path = pathmap_path;
let values_path = pathmap_path.with_extension("values");
let paths_file = std::fs::File::create(paths_path)?;
let mut paths_writer = BufWriter::new(paths_file);
let values_file = std::fs::File::create(&values_path)?;
let mut values_writer = BufWriter::new(values_file);
let rz = pathmap.read_zipper();
let _stats = serialize_paths_with_auxdata(rz, &mut paths_writer, |_idx, _path, value| {
let bytes = value.to_le_bytes();
values_writer
.write_all(&bytes)
.expect("Failed to write value");
})
.map_err(|e| {
TranslationError::Serialization(format!("Failed to serialize paths: {}", e))
})?;
paths_writer.flush()?;
values_writer.flush()?;
let paths_size = std::fs::metadata(paths_path)?.len();
let values_size = std::fs::metadata(&values_path)?.len();
let total_pathmap_size = paths_size + values_size;
let stats = TranslationStats {
entries_translated: entries_count,
artrie_size_bytes: artrie_size,
pathmap_size_bytes: total_pathmap_size,
compression_ratio: if total_pathmap_size > 0 {
artrie_size as f64 / total_pathmap_size as f64
} else {
0.0
},
elapsed_seconds: start.elapsed().as_secs_f64(),
peak_memory_bytes: 0,
};
log::info!(
"Translation complete: {} entries, {:.2}x compression in {:.2}s",
entries_count,
stats.compression_ratio,
stats.elapsed_seconds
);
log::info!(
"Output files: {:?} ({} bytes), {:?} ({} bytes)",
paths_path,
paths_size,
values_path,
values_size
);
Ok(stats)
}
pub fn translate_with_progress<P1, P2, F>(
artrie_path: P1,
pathmap_path: P2,
mut progress: F,
) -> Result<TranslationStats, TranslationError>
where
P1: AsRef<Path>,
P2: AsRef<Path>,
F: FnMut(TranslationProgress),
{
use std::time::Instant;
let start = Instant::now();
let artrie_path = artrie_path.as_ref();
let pathmap_path = pathmap_path.as_ref();
if !artrie_path.exists() {
return Err(TranslationError::SourceNotFound(
artrie_path.display().to_string(),
));
}
let artrie_size = std::fs::metadata(artrie_path)?.len();
progress(TranslationProgress {
phase: TranslationPhase::Loading,
entries_processed: 0,
entries_total: None,
bytes_written: 0,
elapsed_seconds: start.elapsed().as_secs_f64(),
});
log::info!(
"Translating {:?} to {:?} with progress",
artrie_path,
pathmap_path
);
let trie: PersistentARTrieChar<u64> =
PersistentARTrieChar::open(artrie_path).map_err(|e| {
TranslationError::Io(std::io::Error::other(format!("Failed to open trie: {}", e)))
})?;
let mut entries: Vec<(String, u64)> = Vec::new();
let mut entries_count = 0u64;
let mut last_progress = 0u64;
for (key, value) in trie.iter_with_values() {
if is_valid_entry(&key) {
entries.push((key, value));
entries_count += 1;
if entries_count - last_progress >= 100_000 {
last_progress = entries_count;
progress(TranslationProgress {
phase: TranslationPhase::Iterating,
entries_processed: entries_count,
entries_total: None,
bytes_written: 0,
elapsed_seconds: start.elapsed().as_secs_f64(),
});
}
}
}
if entries.is_empty() {
return Err(TranslationError::EmptySource);
}
let total_entries = entries.len() as u64;
progress(TranslationProgress {
phase: TranslationPhase::Building,
entries_processed: total_entries,
entries_total: Some(total_entries),
bytes_written: 0,
elapsed_seconds: start.elapsed().as_secs_f64(),
});
log::info!("Building PathMap from {} entries...", total_entries);
let mut pathmap: PathMap<u64> = PathMap::new();
for (key, value) in &entries {
pathmap.insert(key.as_bytes(), *value);
}
progress(TranslationProgress {
phase: TranslationPhase::Merkleizing,
entries_processed: total_entries,
entries_total: Some(total_entries),
bytes_written: 0,
elapsed_seconds: start.elapsed().as_secs_f64(),
});
progress(TranslationProgress {
phase: TranslationPhase::Saving,
entries_processed: total_entries,
entries_total: Some(total_entries),
bytes_written: 0,
elapsed_seconds: start.elapsed().as_secs_f64(),
});
log::info!("Serializing PathMap to {:?}", pathmap_path);
let paths_path = pathmap_path;
let values_path = pathmap_path.with_extension("values");
let paths_file = std::fs::File::create(paths_path)?;
let mut paths_writer = BufWriter::new(paths_file);
let values_file = std::fs::File::create(&values_path)?;
let mut values_writer = BufWriter::new(values_file);
let rz = pathmap.read_zipper();
let _stats = serialize_paths_with_auxdata(rz, &mut paths_writer, |_idx, _path, value| {
let bytes = value.to_le_bytes();
values_writer
.write_all(&bytes)
.expect("Failed to write value");
})
.map_err(|e| {
TranslationError::Serialization(format!("Failed to serialize paths: {}", e))
})?;
paths_writer.flush()?;
values_writer.flush()?;
let paths_size = std::fs::metadata(paths_path)?.len();
let values_size = std::fs::metadata(&values_path)?.len();
let total_pathmap_size = paths_size + values_size;
progress(TranslationProgress {
phase: TranslationPhase::Complete,
entries_processed: total_entries,
entries_total: Some(total_entries),
bytes_written: total_pathmap_size,
elapsed_seconds: start.elapsed().as_secs_f64(),
});
let stats = TranslationStats {
entries_translated: total_entries,
artrie_size_bytes: artrie_size,
pathmap_size_bytes: total_pathmap_size,
compression_ratio: if total_pathmap_size > 0 {
artrie_size as f64 / total_pathmap_size as f64
} else {
0.0
},
elapsed_seconds: start.elapsed().as_secs_f64(),
peak_memory_bytes: 0,
};
log::info!(
"Translation complete: {} entries, {:.2}x compression in {:.2}s",
total_entries,
stats.compression_ratio,
stats.elapsed_seconds
);
Ok(stats)
}
pub fn verify<P1: AsRef<Path>, P2: AsRef<Path>>(
artrie_path: P1,
pathmap_path: P2,
) -> Result<VerificationResult, TranslationError> {
let artrie_path = artrie_path.as_ref();
let pathmap_path = pathmap_path.as_ref();
if !artrie_path.exists() {
return Err(TranslationError::SourceNotFound(
artrie_path.display().to_string(),
));
}
if !pathmap_path.exists() {
return Err(TranslationError::SourceNotFound(
pathmap_path.display().to_string(),
));
}
let values_path = pathmap_path.with_extension("values");
if !values_path.exists() {
return Err(TranslationError::SourceNotFound(
values_path.display().to_string(),
));
}
log::info!(
"Verifying PathMap {:?} against ARTrie {:?}",
pathmap_path,
artrie_path
);
let trie: PersistentARTrieChar<u64> =
PersistentARTrieChar::open(artrie_path).map_err(|e| {
TranslationError::Io(std::io::Error::other(format!("Failed to open trie: {}", e)))
})?;
let mut source_entries: std::collections::HashMap<String, u64> =
std::collections::HashMap::new();
for (key, value) in trie.iter_with_values() {
if is_valid_entry(&key) {
source_entries.insert(key, value);
}
}
let source_count = source_entries.len() as u64;
log::info!("Source ARTrie has {} entries", source_count);
let paths_file = std::fs::File::open(pathmap_path)?;
let paths_reader = BufReader::new(paths_file);
let values_file = std::fs::File::open(&values_path)?;
let mut values_reader = BufReader::new(values_file);
let mut entries_verified = 0u64;
let mut mismatches = 0u64;
for_each_deserialized_path(paths_reader, |_idx, path| {
let mut value_bytes = [0u8; 8];
use std::io::Read;
values_reader.read_exact(&mut value_bytes)?;
let pathmap_value = u64::from_le_bytes(value_bytes);
let key = match std::str::from_utf8(path) {
Ok(s) => s.to_string(),
Err(_) => {
mismatches += 1;
log::warn!("PathMap contains invalid UTF-8 key at index {}", _idx);
return Ok(());
}
};
match source_entries.get(&key) {
Some(&source_value) => {
if source_value != pathmap_value {
mismatches += 1;
log::warn!(
"Value mismatch for key '{}': source={}, pathmap={}",
key,
source_value,
pathmap_value
);
}
}
None => {
mismatches += 1;
log::warn!("PathMap contains key not in source: '{}'", key);
}
}
entries_verified += 1;
Ok(())
})
.map_err(|e| TranslationError::Io(e))?;
if entries_verified != source_count {
log::warn!(
"Entry count mismatch: source={}, pathmap={}",
source_count,
entries_verified
);
let missing = source_count.saturating_sub(entries_verified);
mismatches += missing;
}
let verified = mismatches == 0;
log::info!(
"Verification {}: {} entries checked, {} mismatches",
if verified { "PASSED" } else { "FAILED" },
entries_verified,
mismatches
);
Ok(VerificationResult {
entries_verified,
mismatches,
verified,
})
}
}
/// Snapshot passed to the progress callback of
/// `PathMapTranslator::translate_with_progress`.
#[derive(Clone, Debug)]
pub struct TranslationProgress {
/// Phase the translation is currently in.
pub phase: TranslationPhase,
/// Number of source entries collected so far.
pub entries_processed: u64,
/// Total number of entries; `None` until iteration of the source has finished.
pub entries_total: Option<u64>,
/// Bytes written to the output files; non-zero only in the `Complete` report.
pub bytes_written: u64,
/// Seconds elapsed since the translation started.
pub elapsed_seconds: f64,
}
/// Phases reported during a translation, listed in the order they occur.
#[derive(Clone, Debug, PartialEq)]
pub enum TranslationPhase {
/// Reported once, before the source trie is opened.
Loading,
/// Reported periodically while entries are collected from the source trie.
Iterating,
/// Reported once all entries are collected, before the PathMap is built.
Building,
/// Reported after the PathMap is built, immediately before `Saving`.
// NOTE(review): the translator in this file does no separate work for this phase.
Merkleizing,
/// Reported before the output files are written.
Saving,
/// Reported after the output files are written and their sizes are measured.
Complete,
}
/// Outcome of `PathMapTranslator::verify`.
#[derive(Clone, Debug)]
pub struct VerificationResult {
/// Number of PathMap entries with a valid UTF-8 key that were checked against the source.
pub entries_verified: u64,
/// Number of discrepancies found: invalid keys, unknown keys, differing
/// values, and source entries missing from the PathMap.
pub mismatches: u64,
/// `true` exactly when `mismatches` is zero.
pub verified: bool,
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Default statistics start out with zero entries and a zero ratio.
    #[test]
    fn test_translation_stats_default() {
        let TranslationStats {
            entries_translated,
            compression_ratio,
            ..
        } = TranslationStats::default();
        assert_eq!(entries_translated, 0);
        assert_eq!(compression_ratio, 0.0);
    }

    /// Translating from a missing source file reports `SourceNotFound`.
    #[test]
    fn test_translation_source_not_found() {
        let scratch = tempdir().expect("Failed to create temp dir");
        let destination = scratch.path().join("output.pathmap");
        let outcome = PathMapTranslator::translate("/nonexistent/path.artrie", &destination);
        assert!(matches!(outcome, Err(TranslationError::SourceNotFound(_))));
    }

    /// Progress snapshots keep the values they were constructed with.
    #[test]
    fn test_translation_progress() {
        let snapshot = TranslationProgress {
            phase: TranslationPhase::Building,
            entries_processed: 10000,
            entries_total: Some(100000),
            bytes_written: 1024 * 1024,
            elapsed_seconds: 5.5,
        };
        assert_eq!(snapshot.phase, TranslationPhase::Building);
        assert_eq!(snapshot.entries_processed, 10000);
    }

    /// Verification results keep the values they were constructed with.
    #[test]
    fn test_verification_result() {
        let outcome = VerificationResult {
            entries_verified: 100000,
            mismatches: 0,
            verified: true,
        };
        assert!(outcome.verified);
        assert_eq!(outcome.mismatches, 0);
    }

    /// Only keys that start with a NUL character are rejected.
    #[test]
    fn test_is_valid_entry() {
        for accepted in ["hello", "the quick brown", "café"] {
            assert!(is_valid_entry(accepted));
        }
        for rejected in ["\x00metadata", "\x00__checkpoint__"] {
            assert!(!is_valid_entry(rejected));
        }
    }
}