use crate::deduplication::DeduplicationPolicy;
use crate::error::ShardexError;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
/// Settings controlling the "slop factor" used by the index.
///
/// `SlopFactorConfig::validate` requires every factor to be non-zero,
/// `min_factor <= default_factor <= max_factor`, and a non-zero
/// `performance_threshold_ms`.
// NOTE(review): what the slop factor is applied to is not visible in this
// file — presumably the number of candidate shards examined per search; confirm.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SlopFactorConfig {
/// Factor returned by `calculate_optimal_slop` when `adaptive_enabled` is
/// `false`, and the starting value of the adaptive computation otherwise.
pub default_factor: usize,
/// Lower bound applied to the adaptively computed factor.
pub min_factor: usize,
/// Upper bound applied to the adaptively computed factor.
pub max_factor: usize,
/// When `true`, `calculate_optimal_slop` derives the factor from the vector
/// size and shard count instead of returning `default_factor` unchanged.
pub adaptive_enabled: bool,
/// Threshold in milliseconds; validation only requires it to be non-zero.
// NOTE(review): this field is not read anywhere else in this file —
// presumably a latency target for adaptive tuning; confirm against callers.
pub performance_threshold_ms: u64,
}
impl Default for SlopFactorConfig {
fn default() -> Self {
Self {
default_factor: 3,
min_factor: 1,
max_factor: 100,
adaptive_enabled: false,
performance_threshold_ms: 100,
}
}
}
impl SlopFactorConfig {
    /// Creates a configuration populated with the [`Default`] values.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the factor used when adaptive selection is disabled (and the base
    /// of the adaptive computation when it is enabled).
    pub fn default_factor(mut self, factor: usize) -> Self {
        self.default_factor = factor;
        self
    }

    /// Sets the lower bound for the adaptively computed factor.
    pub fn min_factor(mut self, factor: usize) -> Self {
        self.min_factor = factor;
        self
    }

    /// Sets the upper bound for the adaptively computed factor.
    pub fn max_factor(mut self, factor: usize) -> Self {
        self.max_factor = factor;
        self
    }

    /// Enables or disables adaptive factor selection.
    pub fn adaptive_enabled(mut self, enabled: bool) -> Self {
        self.adaptive_enabled = enabled;
        self
    }

    /// Sets the performance threshold in milliseconds.
    pub fn performance_threshold_ms(mut self, threshold_ms: u64) -> Self {
        self.performance_threshold_ms = threshold_ms;
        self
    }

    /// Checks the configuration for internal consistency.
    ///
    /// # Errors
    ///
    /// Returns the first violation found, in this order:
    /// `default_factor == 0`, `min_factor == 0`, `max_factor == 0`,
    /// `min_factor > max_factor`, `default_factor` outside
    /// `[min_factor, max_factor]`, `performance_threshold_ms == 0`.
    pub fn validate(&self) -> Result<(), ShardexError> {
        if self.default_factor == 0 {
            return Err(ShardexError::config_error(
                "slop_factor_config.default_factor",
                "must be greater than 0",
                "Set default_factor to a positive integer (recommended: 3-10 for most use cases)",
            ));
        }
        if self.min_factor == 0 {
            return Err(ShardexError::config_error(
                "slop_factor_config.min_factor",
                "must be greater than 0",
                "Set min_factor to at least 1 (minimum valid slop factor)",
            ));
        }
        if self.max_factor == 0 {
            return Err(ShardexError::config_error(
                "slop_factor_config.max_factor",
                "must be greater than 0",
                "Set max_factor to a reasonable upper bound (recommended: 100 or less to avoid performance issues)",
            ));
        }
        if self.min_factor > self.max_factor {
            return Err(ShardexError::config_error(
                "slop_factor_config",
                format!(
                    "min_factor ({}) cannot be greater than max_factor ({})",
                    self.min_factor, self.max_factor
                ),
                "Ensure min_factor <= max_factor. For example: min_factor=1, max_factor=10",
            ));
        }
        if self.default_factor < self.min_factor || self.default_factor > self.max_factor {
            return Err(ShardexError::config_error(
                "slop_factor_config.default_factor",
                format!(
                    "value {} is outside the allowed range [{}, {}]",
                    self.default_factor, self.min_factor, self.max_factor
                ),
                format!(
                    "Set default_factor to a value between {} and {}",
                    self.min_factor, self.max_factor
                ),
            ));
        }
        if self.performance_threshold_ms == 0 {
            return Err(ShardexError::config_error(
                "slop_factor_config.performance_threshold_ms",
                "must be greater than 0",
                "Set performance_threshold_ms to a positive value in milliseconds (recommended: 50-200ms)",
            ));
        }
        Ok(())
    }

    /// Validates the configuration and returns it on success.
    ///
    /// # Errors
    ///
    /// Propagates the error from [`validate`](Self::validate).
    pub fn build(self) -> Result<Self, ShardexError> {
        self.validate()?;
        Ok(self)
    }

    /// Computes the slop factor to use for the given vector size and shard count.
    ///
    /// With adaptive selection disabled this is simply `default_factor`.
    /// Otherwise the result is
    /// `default_factor + trunc(log2(vector_size) / 10) + trunc(max(1, sqrt(shard_count / 10)))`,
    /// limited to `[min_factor, max_factor]`.
    ///
    /// This method never panics: the sum saturates at `usize::MAX` instead of
    /// overflowing, and if the bounds are inverted (`min_factor > max_factor`,
    /// only possible on a configuration that has not passed `validate`)
    /// `max_factor` wins.
    pub fn calculate_optimal_slop(&self, vector_size: usize, shard_count: usize) -> usize {
        if !self.adaptive_enabled {
            return self.default_factor;
        }

        // NOTE(review): the float-to-usize cast truncates, so the 0.1 floor
        // always becomes 0; in practice it only absorbs the -inf produced by
        // log2(0). Vectors shorter than 1024 elements contribute nothing.
        // Confirm whether rounding up was intended before changing this.
        let size_factor = ((vector_size as f64).log2() / 10.0).max(0.1) as usize;
        let shard_factor = (shard_count as f64 / 10.0).sqrt().max(1.0) as usize;

        // Saturate rather than overflow: default_factor may legally be as
        // large as usize::MAX when max_factor is.
        let calculated = self
            .default_factor
            .saturating_add(size_factor)
            .saturating_add(shard_factor);

        // Equivalent to `clamp` for valid bounds, but does not panic when the
        // bounds are inverted on an unvalidated configuration.
        calculated.max(self.min_factor).min(self.max_factor)
    }
}
/// Top-level configuration for a Shardex index.
///
/// Values are normally assembled through the builder-style methods and then
/// checked with `ShardexConfig::validate` / `ShardexConfig::build`; the
/// per-field limits listed below are the ones `validate` enforces.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ShardexConfig {
/// Directory of the index (default `./shardex_index`); must not be empty.
pub directory_path: PathBuf,
/// Vector dimensionality; must be in `1..=10000` (default 384).
pub vector_size: usize,
/// Vectors per shard, per the validation hint; must be in `1..=1_000_000`
/// (default 10000).
pub shard_size: usize,
/// Segment size for Shardex files; must be non-zero (default 1000).
// NOTE(review): the unit is not established in this file — the validation
// hint speaks of "64MB-1GB" while the default is 1000; confirm.
pub shardex_segment_size: usize,
/// WAL segment size in bytes; must be in `1024..=1 GiB` (default 1 MiB).
pub wal_segment_size: usize,
/// WAL safety margin as a fraction; must be within `0.0..=1.0` (default 0.5).
pub wal_safety_margin: f32,
/// Interval between batch flushes in milliseconds; must be in `1..=30000`
/// (default 100).
pub batch_write_interval_ms: u64,
/// Slop-factor settings; validated via `SlopFactorConfig::validate`.
pub slop_factor_config: SlopFactorConfig,
/// Bloom filter size (in bits, per the validation hint); must be non-zero
/// (default 1024).
pub bloom_filter_size: usize,
/// Deduplication policy; defaults to `DeduplicationPolicy::default()`.
pub deduplication_policy: DeduplicationPolicy,
/// Maximum document text size in bytes (default 10 MiB). Must be 0 — which,
/// per the validation hint, disables text storage — or within
/// `1024..=1 GiB`.
pub max_document_text_size: usize,
}
impl Default for ShardexConfig {
fn default() -> Self {
Self {
directory_path: PathBuf::from("./shardex_index"),
vector_size: 384,
shard_size: 10000,
shardex_segment_size: 1000,
wal_segment_size: 1024 * 1024, wal_safety_margin: 0.5, batch_write_interval_ms: 100,
slop_factor_config: SlopFactorConfig::default(),
bloom_filter_size: 1024,
deduplication_policy: DeduplicationPolicy::default(),
max_document_text_size: 10 * 1024 * 1024, }
}
}
impl ShardexConfig {
    /// Creates a configuration populated with the [`Default`] values.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the directory of the index.
    pub fn directory_path<P: Into<PathBuf>>(mut self, path: P) -> Self {
        self.directory_path = path.into();
        self
    }

    /// Sets the vector dimensionality.
    pub fn vector_size(mut self, size: usize) -> Self {
        self.vector_size = size;
        self
    }

    /// Sets the shard size.
    pub fn shard_size(mut self, size: usize) -> Self {
        self.shard_size = size;
        self
    }

    /// Sets the Shardex segment size.
    pub fn shardex_segment_size(mut self, size: usize) -> Self {
        self.shardex_segment_size = size;
        self
    }

    /// Sets the WAL segment size in bytes.
    pub fn wal_segment_size(mut self, size: usize) -> Self {
        self.wal_segment_size = size;
        self
    }

    /// Sets the WAL safety margin, clamping finite inputs into `[0.0, 1.0]`.
    ///
    /// A NaN input is stored unchanged (`f32::clamp` returns NaN for NaN) and
    /// is subsequently rejected by [`validate`](Self::validate).
    pub fn wal_safety_margin(mut self, margin: f32) -> Self {
        self.wal_safety_margin = margin.clamp(0.0, 1.0);
        self
    }

    /// Sets the batch write interval in milliseconds.
    pub fn batch_write_interval_ms(mut self, ms: u64) -> Self {
        self.batch_write_interval_ms = ms;
        self
    }

    /// Sets only `default_factor` of the embedded slop-factor configuration.
    pub fn default_slop_factor(mut self, factor: usize) -> Self {
        self.slop_factor_config.default_factor = factor;
        self
    }

    /// Replaces the whole embedded slop-factor configuration.
    pub fn slop_factor_config(mut self, config: SlopFactorConfig) -> Self {
        self.slop_factor_config = config;
        self
    }

    /// Sets the bloom filter size.
    pub fn bloom_filter_size(mut self, size: usize) -> Self {
        self.bloom_filter_size = size;
        self
    }

    /// Sets the deduplication policy.
    pub fn deduplication_policy(mut self, policy: DeduplicationPolicy) -> Self {
        self.deduplication_policy = policy;
        self
    }

    /// Sets the maximum document text size in bytes.
    pub fn max_document_text_size(mut self, size: usize) -> Self {
        self.max_document_text_size = size;
        self
    }

    /// Checks every field against its allowed range.
    ///
    /// # Errors
    ///
    /// Returns the first violation found. Fields are checked in this order:
    /// `vector_size` (`1..=10000`), `shard_size` (`1..=1_000_000`),
    /// `shardex_segment_size` (non-zero), `wal_segment_size`
    /// (`1024..=1 GiB`), `wal_safety_margin` (`0.0..=1.0`, NaN rejected),
    /// `batch_write_interval_ms` (`1..=30000`), `slop_factor_config`,
    /// `bloom_filter_size` (non-zero), `max_document_text_size`
    /// (0 or `1024..=1 GiB`), and finally a non-empty `directory_path`.
    pub fn validate(&self) -> Result<(), ShardexError> {
        if self.vector_size == 0 {
            return Err(ShardexError::config_error(
                "vector_size",
                "must be greater than 0",
                "Set vector_size to match your embedding model dimensions (e.g., 384 for sentence transformers, 1536 for OpenAI embeddings)",
            ));
        }
        if self.vector_size > 10000 {
            return Err(ShardexError::config_error(
                "vector_size",
                format!(
                    "value {} is unusually large and may cause performance issues",
                    self.vector_size
                ),
                "Most embedding models use 384-1536 dimensions. Verify this matches your model's output size.",
            ));
        }

        if self.shard_size == 0 {
            return Err(ShardexError::config_error(
                "shard_size",
                "must be greater than 0",
                "Set shard_size to control how many vectors per shard (recommended: 10000-100000 depending on memory constraints)",
            ));
        }
        if self.shard_size > 1_000_000 {
            return Err(ShardexError::config_error(
                "shard_size",
                format!("value {} may cause excessive memory usage", self.shard_size),
                "Consider reducing shard_size to 100000 or less to avoid memory issues",
            ));
        }

        if self.shardex_segment_size == 0 {
            return Err(ShardexError::config_error(
                "shardex_segment_size",
                "must be greater than 0",
                "Set shardex_segment_size to control file segment sizes (recommended: 64MB-1GB)",
            ));
        }

        if self.wal_segment_size < 1024 {
            return Err(ShardexError::config_error(
                "wal_segment_size",
                format!(
                    "value {} bytes is too small for efficient WAL operations",
                    self.wal_segment_size
                ),
                "Set wal_segment_size to at least 1024 bytes (recommended: 1MB-64MB)",
            ));
        }
        if self.wal_segment_size > 1024 * 1024 * 1024 {
            return Err(ShardexError::config_error(
                "wal_segment_size",
                format!("value {} bytes exceeds 1GB limit", self.wal_segment_size),
                "Set wal_segment_size to 1GB or less to avoid memory and disk space issues",
            ));
        }

        // A range-containment test (rather than `< 0.0 || > 1.0`) is used so
        // that NaN, for which both comparisons are false, is rejected too.
        if !(0.0..=1.0).contains(&self.wal_safety_margin) {
            return Err(ShardexError::config_error(
                "wal_safety_margin",
                format!("value {} must be between 0.0 and 1.0", self.wal_safety_margin),
                "Set wal_safety_margin between 0.0 (no safety margin) and 1.0 (100% margin). Recommended: 0.5 (50%)",
            ));
        }

        if self.batch_write_interval_ms == 0 {
            return Err(ShardexError::config_error(
                "batch_write_interval_ms",
                "must be greater than 0",
                "Set batch_write_interval_ms to control how often batches are flushed (recommended: 100-1000ms)",
            ));
        }
        if self.batch_write_interval_ms > 30000 {
            return Err(ShardexError::config_error(
                "batch_write_interval_ms",
                format!(
                    "value {} ms is too large and may cause data loss on crashes",
                    self.batch_write_interval_ms
                ),
                "Set batch_write_interval_ms to 30 seconds or less to limit potential data loss",
            ));
        }

        self.slop_factor_config.validate()?;

        if self.bloom_filter_size == 0 {
            return Err(ShardexError::config_error(
                "bloom_filter_size",
                "must be greater than 0",
                "Set bloom_filter_size in bits (recommended: 1000000 for ~100k vectors with 1% false positive rate)",
            ));
        }

        const MIN_DOCUMENT_SIZE: usize = 1024;
        const MAX_DOCUMENT_SIZE: usize = 1024 * 1024 * 1024;
        // Zero is deliberately allowed: only non-zero sizes must reach the minimum.
        if self.max_document_text_size > 0 && self.max_document_text_size < MIN_DOCUMENT_SIZE {
            return Err(ShardexError::config_error(
                "max_document_text_size",
                format!(
                    "Size {} bytes is below minimum {}",
                    self.max_document_text_size, MIN_DOCUMENT_SIZE
                ),
                format!(
                    "Set max_document_text_size to 0 to disable text storage or at least {} bytes to enable",
                    MIN_DOCUMENT_SIZE
                ),
            ));
        }
        if self.max_document_text_size > MAX_DOCUMENT_SIZE {
            return Err(ShardexError::config_error(
                "max_document_text_size",
                format!(
                    "Size {} bytes exceeds maximum {}",
                    self.max_document_text_size, MAX_DOCUMENT_SIZE
                ),
                format!("Set max_document_text_size to at most {} bytes", MAX_DOCUMENT_SIZE),
            ));
        }

        if self.directory_path.as_os_str().is_empty() {
            return Err(ShardexError::Config("Directory path cannot be empty".to_string()));
        }

        Ok(())
    }

    /// Validates the configuration and returns it on success.
    ///
    /// # Errors
    ///
    /// Propagates the error from [`validate`](Self::validate).
    pub fn build(self) -> Result<Self, ShardexError> {
        self.validate()?;
        Ok(self)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `outcome` is a `ShardexError::Config` whose message is
    /// exactly `expected`.
    #[track_caller]
    fn expect_config_error(outcome: Result<(), ShardexError>, expected: &str) {
        assert!(outcome.is_err());
        match outcome {
            Err(ShardexError::Config(message)) => assert_eq!(message, expected),
            _ => panic!("Expected Config error"),
        }
    }

    // ---- ShardexConfig: construction -------------------------------------

    #[test]
    fn test_default_config() {
        let cfg = ShardexConfig::default();
        assert_eq!(cfg.directory_path, PathBuf::from("./shardex_index"));
        assert_eq!(cfg.vector_size, 384);
        assert_eq!(cfg.shard_size, 10000);
        assert_eq!(cfg.shardex_segment_size, 1000);
        assert_eq!(cfg.wal_segment_size, 1024 * 1024);
        assert_eq!(cfg.batch_write_interval_ms, 100);
        assert_eq!(cfg.slop_factor_config.default_factor, 3);
        assert_eq!(cfg.bloom_filter_size, 1024);
        assert_eq!(cfg.max_document_text_size, 10 * 1024 * 1024);
    }

    #[test]
    fn test_new_config() {
        assert_eq!(ShardexConfig::new(), ShardexConfig::default());
    }

    #[test]
    fn test_builder_pattern() {
        let cfg = ShardexConfig::new()
            .directory_path("/tmp/test_index")
            .vector_size(512)
            .shard_size(5000)
            .shardex_segment_size(500)
            .wal_segment_size(2048)
            .batch_write_interval_ms(200)
            .default_slop_factor(5)
            .bloom_filter_size(2048)
            .max_document_text_size(20 * 1024 * 1024);
        assert_eq!(cfg.directory_path, PathBuf::from("/tmp/test_index"));
        assert_eq!(cfg.vector_size, 512);
        assert_eq!(cfg.shard_size, 5000);
        assert_eq!(cfg.shardex_segment_size, 500);
        assert_eq!(cfg.wal_segment_size, 2048);
        assert_eq!(cfg.batch_write_interval_ms, 200);
        assert_eq!(cfg.slop_factor_config.default_factor, 5);
        assert_eq!(cfg.bloom_filter_size, 2048);
        assert_eq!(cfg.max_document_text_size, 20 * 1024 * 1024);
    }

    #[test]
    fn test_default_config_validation() {
        assert!(ShardexConfig::default().validate().is_ok());
    }

    #[test]
    fn test_build_with_valid_config() {
        let built = ShardexConfig::new().vector_size(256).shard_size(1000).build();
        assert!(built.is_ok());
    }

    // ---- ShardexConfig: validation errors --------------------------------

    #[test]
    fn test_zero_vector_size_validation() {
        expect_config_error(
            ShardexConfig::new().vector_size(0).validate(),
            "vector_size - must be greater than 0: Set vector_size to match your embedding model dimensions (e.g., 384 for sentence transformers, 1536 for OpenAI embeddings)",
        );
    }

    #[test]
    fn test_zero_shard_size_validation() {
        expect_config_error(
            ShardexConfig::new().shard_size(0).validate(),
            "shard_size - must be greater than 0: Set shard_size to control how many vectors per shard (recommended: 10000-100000 depending on memory constraints)",
        );
    }

    #[test]
    fn test_zero_shardex_segment_size_validation() {
        expect_config_error(
            ShardexConfig::new().shardex_segment_size(0).validate(),
            "shardex_segment_size - must be greater than 0: Set shardex_segment_size to control file segment sizes (recommended: 64MB-1GB)",
        );
    }

    #[test]
    fn test_wal_segment_size_too_small_validation() {
        expect_config_error(
            ShardexConfig::new().wal_segment_size(512).validate(),
            "wal_segment_size - value 512 bytes is too small for efficient WAL operations: Set wal_segment_size to at least 1024 bytes (recommended: 1MB-64MB)",
        );
    }

    #[test]
    fn test_wal_segment_size_too_large_validation() {
        expect_config_error(
            ShardexConfig::new()
                .wal_segment_size(2 * 1024 * 1024 * 1024)
                .validate(),
            "wal_segment_size - value 2147483648 bytes exceeds 1GB limit: Set wal_segment_size to 1GB or less to avoid memory and disk space issues",
        );
    }

    #[test]
    fn test_zero_batch_write_interval_validation() {
        expect_config_error(
            ShardexConfig::new().batch_write_interval_ms(0).validate(),
            "batch_write_interval_ms - must be greater than 0: Set batch_write_interval_ms to control how often batches are flushed (recommended: 100-1000ms)",
        );
    }

    #[test]
    fn test_zero_slop_factor_validation() {
        expect_config_error(
            ShardexConfig::new().default_slop_factor(0).validate(),
            "slop_factor_config.default_factor - must be greater than 0: Set default_factor to a positive integer (recommended: 3-10 for most use cases)",
        );
    }

    #[test]
    fn test_zero_bloom_filter_size_validation() {
        expect_config_error(
            ShardexConfig::new().bloom_filter_size(0).validate(),
            "bloom_filter_size - must be greater than 0: Set bloom_filter_size in bits (recommended: 1000000 for ~100k vectors with 1% false positive rate)",
        );
    }

    #[test]
    fn test_empty_directory_path_validation() {
        let cfg = ShardexConfig {
            directory_path: PathBuf::new(),
            ..Default::default()
        };
        expect_config_error(cfg.validate(), "Directory path cannot be empty");
    }

    // ---- ShardexConfig: max_document_text_size ---------------------------

    #[test]
    fn test_zero_max_document_text_size_validation() {
        let outcome = ShardexConfig::new().max_document_text_size(0).validate();
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_max_document_text_size_below_minimum_validation() {
        expect_config_error(
            ShardexConfig::new().max_document_text_size(512).validate(),
            "max_document_text_size - Size 512 bytes is below minimum 1024: Set max_document_text_size to 0 to disable text storage or at least 1024 bytes to enable",
        );
    }

    #[test]
    fn test_max_document_text_size_at_minimum_boundary() {
        let outcome = ShardexConfig::new().max_document_text_size(1024).validate();
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_max_document_text_size_at_maximum_boundary() {
        let outcome = ShardexConfig::new()
            .max_document_text_size(1024 * 1024 * 1024)
            .validate();
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_max_document_text_size_too_large_validation() {
        expect_config_error(
            ShardexConfig::new()
                .max_document_text_size(2 * 1024 * 1024 * 1024)
                .validate(),
            "max_document_text_size - Size 2147483648 bytes exceeds maximum 1073741824: Set max_document_text_size to at most 1073741824 bytes",
        );
    }

    // ---- ShardexConfig: miscellaneous ------------------------------------

    #[test]
    fn test_build_with_invalid_config() {
        let built = ShardexConfig::new().vector_size(0).build();
        assert!(built.is_err());
    }

    #[test]
    fn test_config_clone() {
        let original = ShardexConfig::new().vector_size(256);
        let copy = original.clone();
        assert_eq!(original, copy);
    }

    #[test]
    fn test_config_debug() {
        let rendered = format!("{:?}", ShardexConfig::new());
        assert!(rendered.contains("ShardexConfig"));
        assert!(rendered.contains("directory_path"));
        assert!(rendered.contains("vector_size"));
    }

    #[test]
    fn test_pathbuf_conversion() {
        // From a string slice.
        let from_str = ShardexConfig::new().directory_path("/home/user/index");
        assert_eq!(from_str.directory_path, PathBuf::from("/home/user/index"));

        // From an owned PathBuf.
        let owned = PathBuf::from("/var/lib/shardex");
        let from_pathbuf = ShardexConfig::new().directory_path(owned.clone());
        assert_eq!(from_pathbuf.directory_path, owned);
    }

    #[test]
    fn test_boundary_values() {
        let smallest = ShardexConfig::new().wal_segment_size(1024);
        assert!(smallest.validate().is_ok());

        let largest = ShardexConfig::new().wal_segment_size(1024 * 1024 * 1024);
        assert!(largest.validate().is_ok());
    }

    // ---- SlopFactorConfig: construction ----------------------------------

    #[test]
    fn test_slop_factor_config_default() {
        let slop = SlopFactorConfig::default();
        assert_eq!(slop.default_factor, 3);
        assert_eq!(slop.min_factor, 1);
        assert_eq!(slop.max_factor, 100);
        assert!(!slop.adaptive_enabled);
        assert_eq!(slop.performance_threshold_ms, 100);
    }

    #[test]
    fn test_slop_factor_config_new() {
        assert_eq!(SlopFactorConfig::new(), SlopFactorConfig::default());
    }

    #[test]
    fn test_slop_factor_config_builder() {
        let slop = SlopFactorConfig::new()
            .default_factor(5)
            .min_factor(2)
            .max_factor(50)
            .adaptive_enabled(true)
            .performance_threshold_ms(200);
        assert_eq!(slop.default_factor, 5);
        assert_eq!(slop.min_factor, 2);
        assert_eq!(slop.max_factor, 50);
        assert!(slop.adaptive_enabled);
        assert_eq!(slop.performance_threshold_ms, 200);
    }

    // ---- SlopFactorConfig: validation ------------------------------------

    #[test]
    fn test_slop_factor_config_validation() {
        assert!(SlopFactorConfig::default().validate().is_ok());
    }

    #[test]
    fn test_slop_factor_config_zero_default_validation() {
        expect_config_error(
            SlopFactorConfig::new().default_factor(0).validate(),
            "slop_factor_config.default_factor - must be greater than 0: Set default_factor to a positive integer (recommended: 3-10 for most use cases)",
        );
    }

    #[test]
    fn test_slop_factor_config_zero_min_validation() {
        expect_config_error(
            SlopFactorConfig::new().min_factor(0).validate(),
            "slop_factor_config.min_factor - must be greater than 0: Set min_factor to at least 1 (minimum valid slop factor)",
        );
    }

    #[test]
    fn test_slop_factor_config_zero_max_validation() {
        expect_config_error(
            SlopFactorConfig::new().max_factor(0).validate(),
            "slop_factor_config.max_factor - must be greater than 0: Set max_factor to a reasonable upper bound (recommended: 100 or less to avoid performance issues)",
        );
    }

    #[test]
    fn test_slop_factor_config_min_greater_than_max_validation() {
        expect_config_error(
            SlopFactorConfig::new().min_factor(10).max_factor(5).validate(),
            "slop_factor_config - min_factor (10) cannot be greater than max_factor (5): Ensure min_factor <= max_factor. For example: min_factor=1, max_factor=10",
        );
    }

    #[test]
    fn test_slop_factor_config_default_out_of_range_validation() {
        let slop = SlopFactorConfig::new()
            .default_factor(10)
            .min_factor(1)
            .max_factor(5);
        expect_config_error(
            slop.validate(),
            "slop_factor_config.default_factor - value 10 is outside the allowed range [1, 5]: Set default_factor to a value between 1 and 5",
        );
    }

    #[test]
    fn test_slop_factor_config_zero_performance_threshold_validation() {
        expect_config_error(
            SlopFactorConfig::new().performance_threshold_ms(0).validate(),
            "slop_factor_config.performance_threshold_ms - must be greater than 0: Set performance_threshold_ms to a positive value in milliseconds (recommended: 50-200ms)",
        );
    }

    #[test]
    fn test_slop_factor_config_build_valid() {
        let built = SlopFactorConfig::new().default_factor(5).build();
        assert!(built.is_ok());
    }

    #[test]
    fn test_slop_factor_config_build_invalid() {
        let built = SlopFactorConfig::new().default_factor(0).build();
        assert!(built.is_err());
    }

    // ---- SlopFactorConfig: calculate_optimal_slop ------------------------

    #[test]
    fn test_calculate_optimal_slop_adaptive_disabled() {
        let slop = SlopFactorConfig::new()
            .default_factor(5)
            .adaptive_enabled(false);
        assert_eq!(slop.calculate_optimal_slop(384, 10), 5);
    }

    #[test]
    fn test_calculate_optimal_slop_adaptive_enabled() {
        let slop = SlopFactorConfig::new()
            .default_factor(3)
            .min_factor(1)
            .max_factor(10)
            .adaptive_enabled(true);

        let baseline = slop.calculate_optimal_slop(384, 10);
        assert!((1..=10).contains(&baseline));

        // A larger vector must never lower the factor.
        let larger_vectors = slop.calculate_optimal_slop(1024, 10);
        assert!(larger_vectors >= baseline);
    }

    #[test]
    fn test_calculate_optimal_slop_clamping() {
        let slop = SlopFactorConfig::new()
            .default_factor(50)
            .min_factor(40)
            .max_factor(60)
            .adaptive_enabled(true);
        let factor = slop.calculate_optimal_slop(128, 5);
        assert!((40..=60).contains(&factor));
    }

    // ---- SlopFactorConfig: miscellaneous ---------------------------------

    #[test]
    fn test_shardex_config_with_slop_factor_config() {
        let slop = SlopFactorConfig::new()
            .default_factor(5)
            .adaptive_enabled(true);
        let cfg = ShardexConfig::new().slop_factor_config(slop);
        assert_eq!(cfg.slop_factor_config.default_factor, 5);
        assert!(cfg.slop_factor_config.adaptive_enabled);
    }

    #[test]
    fn test_slop_factor_config_clone() {
        let original = SlopFactorConfig::new().default_factor(7);
        let copy = original.clone();
        assert_eq!(original, copy);
    }

    #[test]
    fn test_slop_factor_config_debug() {
        let rendered = format!("{:?}", SlopFactorConfig::new());
        assert!(rendered.contains("SlopFactorConfig"));
        assert!(rendered.contains("default_factor"));
        assert!(rendered.contains("adaptive_enabled"));
    }
}