//! High-performance dictionary utilities: backends for MDict/StarDict/ZIM
//! files, B-tree and full-text indexes, and supporting compression, encoding,
//! buffer, and file helpers. The most commonly used types are re-exported at
//! the crate root; the `prelude` module offers a one-line import set.
#![warn(missing_docs)]
#![warn(unused_extern_crates)]
#![warn(unused_qualifications)]
/// Core traits shared by all dictionary backends (also glob re-exported below).
pub mod traits;
/// Dictionary backends (`MDict`, `StarDict`, `ZimDict`), the `DictLoader`
/// front end, and `BatchOperations`.
pub mod dict;
/// Index implementations: `btree` key lookup and `fts` full-text search.
pub mod index;
/// Shared utilities: `buffer`, `compression`, `encoding`, and `file_utils`.
pub mod util;
pub use dict::{BatchOperations, DictLoader, MDict, StarDict, ZimDict};
pub use index::{btree, fts};
pub use traits::*;
pub use util::{buffer, compression, encoding, file_utils};
/// Convenience prelude: `use <crate>::prelude::*;` brings in the dictionary
/// loader and batch operations, both index types, the core traits, and the
/// compression/encoding/config utilities in a single import.
pub mod prelude {
pub use crate::dict::{utils as dict_utils, BatchOperations, DictLoader};
pub use crate::index::{btree::BTreeIndex, fts::FtsIndex};
pub use crate::traits::*;
pub use crate::util::{compression::CompressionAlgorithm, encoding::TextEncoding, DictConfig};
}
/// Crate version, taken from `Cargo.toml` at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Crate name, taken from `Cargo.toml` at compile time.
pub const NAME: &str = env!("CARGO_PKG_NAME");
/// Short human-readable description of the crate.
pub const DESCRIPTION: &str = "High-performance dictionary utilities library";
/// Maximum supported dictionary file size in bytes (2 GiB).
pub const MAX_DICT_SIZE: u64 = 2_147_483_648;
/// Default cache size.
/// NOTE(review): unit (entries vs bytes) is inferred from the name — confirm
/// against `DictConfig::cache_size` users.
pub const DEFAULT_CACHE_SIZE: usize = 1000;
/// Default number of items processed per batch operation.
pub const DEFAULT_BATCH_SIZE: usize = 100;
/// Minimum memory requirement in bytes (64 MiB).
pub const MIN_MEMORY: u64 = 64 * 1024 * 1024;
/// Recommended memory in bytes (256 MiB).
pub const RECOMMENDED_MEMORY: u64 = 256 * 1024 * 1024;
/// Command-line helper routines, enabled with the `cli` feature: printing
/// dictionary information, searching, and validating dictionary files.
/// Strings originating from dictionary files pass through an output
/// sanitizer before reaching the terminal.
#[cfg(feature = "cli")]
pub mod cli {
    // `BatchOperations` and `std::path::PathBuf` were imported here but are
    // not referenced anywhere in this module; they were removed to silence
    // unused-import warnings.
    use crate::dict::DictLoader;
    use crate::traits::*;
/// Strips control characters (except `\n` and `\t`) from `input` and caps the
/// result at 4096 bytes, so strings read from untrusted dictionary files
/// cannot inject terminal escape sequences or flood the console.
///
/// The cap is checked *before* each push, so the returned string never
/// exceeds 4096 bytes and truncation always lands on a char boundary.
/// (The previous version checked after pushing and could overshoot the cap
/// by up to 3 bytes on multi-byte characters.)
fn sanitize_output(input: &str) -> String {
    const MAX_OUTPUT_BYTES: usize = 4096;
    let mut out = String::with_capacity(input.len().min(MAX_OUTPUT_BYTES));
    for ch in input.chars() {
        // Drop control characters, keeping the two benign whitespace ones.
        if ch.is_control() && ch != '\n' && ch != '\t' {
            continue;
        }
        // Enforce the byte cap before pushing, accounting for the char's
        // UTF-8 encoded width.
        if out.len() + ch.len_utf8() > MAX_OUTPUT_BYTES {
            break;
        }
        out.push(ch);
    }
    out
}
/// Prints a human-readable report for the dictionary at `path`: the path,
/// the detected format, backend metadata (name, version, entry count, file
/// size, index availability) and runtime stats (memory usage, per-index
/// sizes).
///
/// Format detection and loading are best-effort: if either fails, the
/// corresponding section is silently omitted and the function still returns
/// `Ok(())` — the body never produces an `Err`.
///
/// Every string that originates from the file or the loader is passed
/// through `sanitize_output` before printing, so untrusted metadata cannot
/// inject terminal control sequences.
pub fn print_dict_info<P: AsRef<std::path::Path>>(path: P) -> Result<()> {
let path = path.as_ref();
let loader = DictLoader::new();
println!("Dictionary Information");
println!("===================");
println!("Path: {}", sanitize_output(&path.display().to_string()));
if let Ok(format) = loader.detect_format(path) {
println!("Format: {}", sanitize_output(&format));
}
// Best-effort: a dictionary that fails to load still gets the header above.
if let Ok(mut dict) = loader.load(path) {
let metadata = dict.metadata();
println!("Name: {}", sanitize_output(&metadata.name));
println!("Version: {}", sanitize_output(&metadata.version));
println!("Entries: {}", metadata.entries);
println!("Size: {} bytes", metadata.file_size);
println!("Has B-TREE: {}", metadata.has_btree);
println!("Has FTS: {}", metadata.has_fts);
let stats = dict.stats();
println!("Memory Usage: {} bytes", stats.memory_usage);
for (index, size) in &stats.index_sizes {
println!("{} Index: {} bytes", sanitize_output(index), size);
}
}
Ok(())
}
/// Runs a search against the dictionary at `path` and prints up to
/// `limit` (default 10) matching words, with a score line for each result
/// that carries one.
///
/// `search_type` selects the strategy: `"prefix"`, `"fuzzy"`, or
/// `"fulltext"`; any other value yields `DictError::UnsupportedOperation`.
///
/// # Errors
/// Propagates loader and search errors, plus the unsupported-type error
/// described above.
///
/// NOTE(review): `limit` is forwarded only to the prefix search; the fuzzy
/// search receives `None` and the fulltext search collects every hit. Those
/// paths may therefore gather more results than are displayed, and the
/// trailing "Found N results" line reports the collected count, not the
/// printed count. Confirm this asymmetry is intentional.
pub fn search_dict<P: AsRef<std::path::Path>>(
path: P,
query: &str,
search_type: &str,
limit: Option<usize>,
) -> Result<()> {
let path = path.as_ref();
let loader = DictLoader::new();
let mut dict = loader.load(path)?;
println!("Search Results for '{}'", sanitize_output(query));
println!("===========================");
let results = match search_type {
"prefix" => dict.search_prefix(query, limit),
"fuzzy" => dict.search_fuzzy(query, None),
"fulltext" => {
// Fulltext returns an iterator of Results; collect short-circuits
// on the first error.
let iterator = dict.search_fulltext(query)?;
let results_vec: Result<Vec<_>> = iterator.collect();
results_vec
}
_ => {
return Err(DictError::UnsupportedOperation(
"Search type must be 'prefix', 'fuzzy', or 'fulltext'".to_string(),
))
}
}?;
// Display cap defaults to 10 when no limit was given.
for result in results.iter().take(limit.unwrap_or(10)) {
println!("- {}", sanitize_output(&result.word));
if let Some(score) = result.score {
println!(" Score: {:.3}", score);
}
}
println!("\nFound {} results", results.len());
Ok(())
}
/// Performs a lightweight validation of the dictionary at `path`: existence
/// check, format detection, a full load, then a printed report of entry
/// count, memory usage, cache hit rate, and index availability.
///
/// # Errors
/// Returns `DictError::FileNotFound` if `path` does not exist, and
/// propagates any format-detection or load error. Reaching the final line
/// means every step succeeded.
pub fn validate_dict<P: AsRef<std::path::Path>>(path: P) -> Result<()> {
let path = path.as_ref();
let loader = DictLoader::new();
println!("Validating dictionary: {}", path.display());
if !path.exists() {
return Err(DictError::FileNotFound(path.display().to_string()));
}
let format = loader.detect_format(path)?;
println!("Format detected: {}", format);
let mut dict = loader.load(path)?;
let stats = dict.stats();
println!("Validation Results:");
println!(" - Total entries: {}", stats.total_entries);
println!(" - Memory usage: {} bytes", stats.memory_usage);
// Hit rate is stored as a fraction; render as a percentage.
println!(" - Cache hit rate: {:.2}%", stats.cache_hit_rate * 100.0);
println!(
" - B-TREE index: {}",
if dict.metadata().has_btree {
"Available"
} else {
"Not available"
}
);
println!(
" - FTS index: {}",
if dict.metadata().has_fts {
"Available"
} else {
"Not available"
}
);
println!("Dictionary validation: SUCCESS");
Ok(())
}
}
#[cfg(test)]
mod tests {
    use super::*;
    // Only `generate_test_entries` is used in this module; the previously
    // imported `cleanup_temp_dir`/`temp_dir` were unused and triggered
    // warnings.
    use crate::util::test_utils::generate_test_entries;
    // Replaces the redundant local wrapper this module used to define.
    use std::hint::black_box;

    /// Crate metadata constants are populated from Cargo.toml at build time.
    #[test]
    fn test_version_info() {
        assert!(!VERSION.is_empty());
        assert!(!NAME.is_empty());
        // `assert_eq!` reports both sides on failure, unlike `assert!(a == b)`.
        assert_eq!(NAME, "dictutils");
    }

    /// The default config enables both indexes, mmap, and the documented sizes.
    #[test]
    fn test_config_defaults() {
        let config = DictConfig::default();
        assert!(config.load_btree);
        assert!(config.load_fts);
        assert!(config.use_mmap);
        assert_eq!(config.cache_size, DEFAULT_CACHE_SIZE);
        assert_eq!(config.batch_size, DEFAULT_BATCH_SIZE);
    }

    /// Round-trip: decompress(compress(x)) must be the identity for every
    /// supported algorithm.
    #[test]
    fn test_compression_algorithms() {
        use crate::util::compression::*;
        let test_data = b"Hello, World! This is test data for compression.";
        for algorithm in &[CompressionAlgorithm::None, CompressionAlgorithm::Gzip] {
            let compressed = compress(test_data, algorithm.clone()).unwrap();
            let decompressed = decompress(&compressed, algorithm.clone()).unwrap();
            assert_eq!(test_data, &decompressed[..]);
        }
    }

    /// Both multi-byte UTF-8 and plain ASCII input should be detected as UTF-8.
    #[test]
    fn test_encoding_detection() {
        use crate::util::encoding::*;
        let utf8_data = "Hello, World! 🌟".as_bytes();
        assert_eq!(detect_encoding(utf8_data).unwrap(), TextEncoding::Utf8);
        let ascii_data = b"Hello, World!";
        assert_eq!(detect_encoding(ascii_data).unwrap(), TextEncoding::Utf8);
    }

    /// Generated fixtures follow the zero-padded `word_NNNNNN` key scheme.
    #[test]
    fn test_test_utils() {
        let entries = generate_test_entries(10);
        assert_eq!(entries.len(), 10);
        for (i, (key, content)) in entries.iter().enumerate() {
            assert!(key.starts_with("word_"));
            assert!(key.contains(&format!("{:06}", i)));
            assert!(!content.is_empty());
        }
    }

    /// The loader must advertise every bundled backend format.
    #[test]
    fn test_dict_loader() {
        let loader = DictLoader::new();
        let formats = loader.supported_formats();
        assert!(formats.contains(&"mdict".to_string()));
        assert!(formats.contains(&"stardict".to_string()));
        assert!(formats.contains(&"zim".to_string()));
    }

    /// Recording operations must yield a positive ops/sec figure.
    #[test]
    fn test_performance_utils() {
        use crate::util::performance::*;
        let mut profiler = Profiler::new();
        for i in 0..1000 {
            profiler.record("test_operation", 1);
            black_box(i);
        }
        let ops_per_sec = profiler.operations_per_second("test_operation");
        assert!(ops_per_sec > 0.0);
    }
}
#[cfg(all(test, feature = "bench"))]
mod benchmarks {
    use super::*;
    use crate::util::test_utils::{cleanup_temp_dir, generate_test_entries, temp_dir};
    // NOTE(review): the previous `use std::time::Instant;` was never used and
    // has been dropped. These `#[bench]` functions rely on the unstable
    // `test` crate; confirm the crate root enables `#![feature(test)]` and
    // declares `extern crate test;` when the `bench` feature is active.

    /// Baseline: binary search over 1000 sorted fixture keys.
    #[bench]
    fn bench_binary_search(b: &mut test::Bencher) {
        let entries = generate_test_entries(1000);
        let keys: Vec<String> = entries.iter().map(|(k, _)| k.clone()).collect();
        b.iter(|| {
            // Fixture keys are zero-padded to six digits (see the
            // `{:06}` assertion in the unit tests), so the probe must be
            // "word_000500" — the previous "word_00500" could never match.
            test::black_box(keys.binary_search(&"word_000500".to_string()));
        });
    }

    /// Placeholder prefix-search benchmark over the generated fixtures.
    /// (The unused `DictConfig::default()` local was removed.)
    #[bench]
    fn bench_prefix_search(b: &mut test::Bencher) {
        let temp_path = temp_dir().unwrap();
        let entries = generate_test_entries(1000);
        b.iter(|| {
            test::black_box(&entries);
        });
        let _ = cleanup_temp_dir(&temp_path);
    }

    /// Fuzzy search: scan fixtures with a bounded Levenshtein match,
    /// stopping after the first 10 hits.
    #[bench]
    fn bench_fuzzy_search(b: &mut test::Bencher) {
        let entries = generate_test_entries(1000);
        let query = "word_500";
        b.iter(|| {
            let mut results = Vec::new();
            for (key, _) in &entries {
                if let Some(_distance) = levenshtein_approx(query, key, 2) {
                    results.push(key);
                    if results.len() >= 10 {
                        break;
                    }
                }
            }
            test::black_box(results);
        });
    }
/// Bounded Levenshtein distance: returns `Some(distance)` when the edit
/// distance between `a` and `b` is at most `max_dist`, else `None`.
///
/// Distances are measured in Unicode scalar values. The previous version
/// mixed byte lengths (`str::len`) with per-character access via
/// `chars().nth()`, which both miscounted multi-byte input (phantom
/// trailing positions compared `None == None` and looked equal) and made
/// each DP cell O(n); collecting the chars once fixes correctness and
/// restores the expected O(m·n) running time.
fn levenshtein_approx(a: &str, b: &str, max_dist: usize) -> Option<usize> {
    let a: Vec<char> = a.chars().collect();
    let b: Vec<char> = b.chars().collect();
    let (m, n) = (a.len(), b.len());
    // Lengths differing by more than `max_dist` can never be within range.
    if m.abs_diff(n) > max_dist {
        return None;
    }
    // Classic DP table; dp[i][j] = distance between a[..i] and b[..j].
    let mut dp = vec![vec![0u32; n + 1]; m + 1];
    for (i, row) in dp.iter_mut().enumerate() {
        row[0] = i as u32;
    }
    for j in 0..=n {
        dp[0][j] = j as u32;
    }
    for i in 1..=m {
        for j in 1..=n {
            dp[i][j] = if a[i - 1] == b[j - 1] {
                dp[i - 1][j - 1]
            } else {
                1 + dp[i - 1][j].min(dp[i][j - 1]).min(dp[i - 1][j - 1])
            };
        }
    }
    let distance = dp[m][n] as usize;
    (distance <= max_dist).then_some(distance)
}
}