#![allow(dead_code)]
use shardex::document_text_storage::DocumentTextStorage;
use shardex::error::ShardexError;
use shardex::identifiers::DocumentId;
use shardex::structures::Posting;
use std::time::{Duration, Instant};
use tempfile::TempDir;
/// Test fixture bundling a temporary directory with the
/// [`DocumentTextStorage`] created inside it.
pub struct DocumentTestEnvironment {
    /// Temporary directory handed to the storage on creation and on reopen.
    pub temp_dir: TempDir,
    /// Storage instance the test operates on.
    pub storage: DocumentTextStorage,
    /// Name of the owning test; echoed in panic messages.
    pub test_name: String,
}

impl DocumentTestEnvironment {
    /// Creates a fresh temporary directory and a storage inside it that is
    /// configured with `max_document_size`.
    ///
    /// # Panics
    /// Panics, naming `test_name`, if the temporary directory or the storage
    /// cannot be created.
    pub fn new(test_name: &str, max_document_size: usize) -> Self {
        let temp_dir = match TempDir::new() {
            Ok(dir) => dir,
            Err(e) => panic!("Failed to create temp dir for test {}: {}", test_name, e),
        };
        let storage = match DocumentTextStorage::create(&temp_dir, max_document_size) {
            Ok(created) => created,
            Err(e) => panic!("Failed to create storage for test {}: {}", test_name, e),
        };
        Self {
            test_name: test_name.to_owned(),
            storage,
            temp_dir,
        }
    }

    /// Creates an environment with a 10 MiB maximum document size.
    pub fn new_default(test_name: &str) -> Self {
        const DEFAULT_MAX_DOCUMENT_SIZE: usize = 10 * 1024 * 1024;
        Self::new(test_name, DEFAULT_MAX_DOCUMENT_SIZE)
    }

    /// Creates an environment with a 1 KiB maximum document size.
    pub fn new_small(test_name: &str) -> Self {
        const SMALL_MAX_DOCUMENT_SIZE: usize = 1024;
        Self::new(test_name, SMALL_MAX_DOCUMENT_SIZE)
    }

    /// Mutable access to the storage under test.
    pub fn storage(&mut self) -> &mut DocumentTextStorage {
        &mut self.storage
    }

    /// Shared access to the storage under test.
    pub fn storage_ref(&self) -> &DocumentTextStorage {
        &self.storage
    }

    /// Replaces the current storage with one opened from the same temporary
    /// directory.
    ///
    /// # Panics
    /// Panics, naming the test, if the storage cannot be opened; the previous
    /// storage is left in place in that case.
    pub fn reopen(&mut self) {
        match DocumentTextStorage::open(&self.temp_dir) {
            Ok(reopened) => self.storage = reopened,
            Err(e) => panic!("Failed to reopen storage for test {}: {}", self.test_name, e),
        }
    }
}
/// Deterministic pseudo-random text generator for tests.
///
/// Words are drawn from a fixed ASCII vocabulary using a linear congruential
/// generator with a constant seed, so two fresh generators produce identical
/// sequences.
#[derive(Debug, Clone)]
pub struct TextGenerator {
    /// Current state of the linear congruential generator.
    rng_state: u32,
    /// Vocabulary the generator samples from.
    word_list: &'static [&'static str],
}

impl Default for TextGenerator {
    fn default() -> Self {
        Self::new()
    }
}

impl TextGenerator {
    /// Shared vocabulary; kept static so that creating a generator does not
    /// allocate a fresh copy of the list.
    const VOCABULARY: &'static [&'static str] = &[
        "the",
        "quick",
        "brown",
        "fox",
        "jumps",
        "over",
        "lazy",
        "dog",
        "Lorem",
        "ipsum",
        "dolor",
        "sit",
        "amet",
        "consectetur",
        "adipiscing",
        "elit",
        "sed",
        "do",
        "eiusmod",
        "tempor",
        "incididunt",
        "ut",
        "labore",
        "et",
        "dolore",
        "magna",
        "aliqua",
        "Ut",
        "enim",
        "ad",
        "minim",
        "veniam",
        "quis",
        "nostrud",
        "exercitation",
        "ullamco",
        "laboris",
        "nisi",
        "performance",
        "testing",
        "document",
        "storage",
        "retrieval",
        "system",
        "database",
        "memory",
        "mapping",
        "efficient",
        "scalable",
        "robust",
        "implementation",
        "algorithm",
        "optimization",
        "thread",
        "safety",
        "concurrency",
        "parallel",
        "distributed",
        "architecture",
        "design",
    ];

    /// Creates a generator with the fixed seed `12345`.
    pub fn new() -> Self {
        Self {
            rng_state: 12345,
            word_list: Self::VOCABULARY,
        }
    }

    /// Advances the linear congruential generator and returns the new state.
    fn next_random(&mut self) -> u32 {
        self.rng_state = self
            .rng_state
            .wrapping_mul(1664525)
            .wrapping_add(1013904223);
        self.rng_state
    }

    /// Returns `word_count` vocabulary words joined by single spaces.
    ///
    /// A `word_count` of zero yields an empty string.
    pub fn generate_text(&mut self, word_count: usize) -> String {
        let mut text = String::with_capacity(word_count * 8);
        for i in 0..word_count {
            if i > 0 {
                text.push(' ');
            }
            let word_index = (self.next_random() as usize) % self.word_list.len();
            text.push_str(self.word_list[word_index]);
        }
        text
    }

    /// Builds `paragraphs` paragraphs separated by a blank line.
    ///
    /// Each paragraph holds `sentences_per_paragraph` sentences separated by a
    /// space; each sentence is `words_per_sentence` generated words followed
    /// by a period.
    pub fn generate_structured_text(
        &mut self,
        paragraphs: usize,
        sentences_per_paragraph: usize,
        words_per_sentence: usize,
    ) -> String {
        let mut text = String::new();
        for p in 0..paragraphs {
            if p > 0 {
                text.push_str("\n\n");
            }
            for s in 0..sentences_per_paragraph {
                if s > 0 {
                    text.push(' ');
                }
                let sentence = self.generate_text(words_per_sentence);
                text.push_str(&sentence);
                text.push('.');
            }
        }
        text
    }

    /// Returns a sample for the requested script or character class.
    ///
    /// All variants except [`UnicodeTestType::Mixed`] return a fixed string;
    /// `Mixed` prefixes a fixed multi-script suffix with 20 generated words
    /// and therefore advances the generator.
    pub fn generate_unicode_text(&mut self, language_type: UnicodeTestType) -> String {
        match language_type {
            UnicodeTestType::Chinese => {
                "你好世界!这是一个中文测试文档,用于验证UTF-8编码的正确处理。我们需要确保中文字符能够正确存储和检索。"
                    .to_string()
            }
            UnicodeTestType::Japanese => {
                "こんにちは世界!これは日本語のテスト文書です。UTF-8エンコーディングが正しく動作することを確認します。"
                    .to_string()
            }
            UnicodeTestType::Arabic => {
                "مرحبا بالعالم! هذا مستند اختبار باللغة العربية للتحقق من التعامل الصحيح مع ترميز UTF-8.".to_string()
            }
            UnicodeTestType::Emoji => {
                "Hello 🌍 World! 🚀 This document contains emojis 🎉✨ to test Unicode handling 🌟💫".to_string()
            }
            UnicodeTestType::Mixed => {
                let base_text = self.generate_text(20);
                format!("{} 中文 🌍 العربية 🚀 日本語 ✨ Mixed content! 🎉", base_text)
            }
            UnicodeTestType::ControlCharacters => {
                "Text with\ttabs\nand\rcarriage\x01returns\x02and\x03control\x1fcharacters.".to_string()
            }
        }
    }

    /// Returns a fixed string mixing one-, two- and four-byte UTF-8 characters.
    pub fn generate_boundary_test_text(&mut self) -> String {
        "Héllo Wörld! 🌍🚀 Üñíçødé tëst with multibyte characters é ñ ü".to_string()
    }

    /// Returns generated text that is exactly `target_bytes` bytes long.
    ///
    /// A 100-word base text is truncated, or repeated and then truncated, to
    /// the requested size.
    pub fn generate_sized_text(&mut self, target_bytes: usize) -> String {
        let base_text = self.generate_text(100);
        let base_len = base_text.len();
        // The vocabulary and the separator are ASCII, so every byte offset is
        // a character boundary and byte-based slicing/truncation cannot panic.
        if target_bytes <= base_len {
            return base_text[..target_bytes].to_string();
        }
        // Ceiling division: the smallest repeat count that reaches the target.
        let repetitions = (target_bytes + base_len - 1) / base_len;
        let mut result = base_text.repeat(repetitions);
        result.truncate(target_bytes);
        result
    }
}

/// Kinds of Unicode sample text offered by
/// [`TextGenerator::generate_unicode_text`].
#[derive(Debug, Clone, Copy)]
pub enum UnicodeTestType {
    /// Fixed Chinese sample.
    Chinese,
    /// Fixed Japanese sample.
    Japanese,
    /// Fixed Arabic sample.
    Arabic,
    /// Fixed English sample containing emoji.
    Emoji,
    /// Generated words followed by a fixed multi-script suffix.
    Mixed,
    /// Fixed sample containing tab, newline, carriage return and other
    /// control characters.
    ControlCharacters,
}
/// Deterministic generator of [`Posting`]s that reference byte ranges of a text.
///
/// Vectors come from a linear congruential generator with a fixed seed, so a
/// fresh generator always yields the same sequence.
pub struct PostingGenerator {
    /// Current state of the linear congruential generator.
    rng_state: u32,
}

impl Default for PostingGenerator {
    fn default() -> Self {
        Self::new()
    }
}

impl PostingGenerator {
    /// Creates a generator with the fixed seed `54321`.
    pub fn new() -> Self {
        Self { rng_state: 54321 }
    }

    /// Generates up to `posting_count` postings, one per equally sized
    /// segment of `text`.
    ///
    /// Each posting starts at the beginning of its segment and spans the
    /// segment length clamped to `1..=100` bytes. Ranges are then snapped back
    /// to UTF-8 character boundaries; a segment whose snapped range is empty
    /// is skipped, so fewer than `posting_count` postings may be returned.
    ///
    /// # Errors
    /// Propagates any error returned by [`Posting::new`].
    pub fn generate_postings(
        &mut self,
        document_id: DocumentId,
        text: &str,
        vector_dimension: usize,
        posting_count: usize,
    ) -> Result<Vec<Posting>, ShardexError> {
        if posting_count == 0 {
            return Ok(Vec::new());
        }
        let text_len = text.len();
        // Loop-invariant, so computed once up front.
        let segment_size = text_len / posting_count;
        let mut postings = Vec::new();
        for i in 0..posting_count {
            let offset = i * segment_size;
            let remaining = text_len - offset;
            let start = offset as u32;
            let length = segment_size.min(remaining).clamp(1, 100) as u32;
            let (adjusted_start, adjusted_length) = self.adjust_for_utf8_boundaries(text, start, length);
            if adjusted_length == 0 {
                continue;
            }
            let vector = self.generate_vector(vector_dimension, i);
            postings.push(Posting::new(
                document_id,
                adjusted_start,
                adjusted_length,
                vector,
                vector_dimension,
            )?);
        }
        Ok(postings)
    }

    /// Generates one posting per whitespace-separated word of `text`.
    ///
    /// Every posting covers exactly the bytes of its word: `start` is the
    /// word's byte offset in `text` and `length` its byte length.
    ///
    /// # Errors
    /// Propagates any error returned by [`Posting::new`].
    pub fn generate_word_based_postings(
        &mut self,
        document_id: DocumentId,
        text: &str,
        vector_dimension: usize,
    ) -> Result<Vec<Posting>, ShardexError> {
        let mut postings = Vec::new();
        // Byte offset just past the previously located word. Searching from
        // here (instead of from the start of the text) resolves repeated words
        // to the right occurrence and copes with any amount of whitespace.
        let mut cursor = 0;
        for (i, word) in text.split_whitespace().enumerate() {
            // Only whitespace lies between `cursor` and the next word, and a
            // word never starts with whitespace, so the first match is the
            // word itself.
            let start_pos = text[cursor..]
                .find(word)
                .map(|relative| cursor + relative)
                .expect("split_whitespace yields substrings of `text` in order");
            cursor = start_pos + word.len();
            let vector = self.generate_vector(vector_dimension, i);
            let posting = Posting::new(
                document_id,
                start_pos as u32,
                word.len() as u32,
                vector,
                vector_dimension,
            )?;
            postings.push(posting);
        }
        Ok(postings)
    }

    /// Generates up to `posting_count` postings of at most 50 bytes whose
    /// start offsets are spread evenly over `text`, so neighbouring ranges
    /// may overlap.
    ///
    /// Returns an empty list when `text` is empty or `posting_count` is zero.
    /// Ranges are snapped back to UTF-8 character boundaries and empty ranges
    /// are skipped.
    ///
    /// # Errors
    /// Propagates any error returned by [`Posting::new`].
    pub fn generate_overlapping_postings(
        &mut self,
        document_id: DocumentId,
        text: &str,
        vector_dimension: usize,
        posting_count: usize,
    ) -> Result<Vec<Posting>, ShardexError> {
        let mut postings = Vec::new();
        let text_len = text.len();
        if text_len == 0 || posting_count == 0 {
            return Ok(postings);
        }
        for i in 0..posting_count {
            let start = (i * text_len / (posting_count + 1)) as u32;
            let length = (text_len - start as usize).min(50) as u32;
            if length == 0 {
                break;
            }
            let (adjusted_start, adjusted_length) = self.adjust_for_utf8_boundaries(text, start, length);
            if adjusted_length > 0 {
                let vector = self.generate_vector(vector_dimension, i);
                postings.push(Posting::new(
                    document_id,
                    adjusted_start,
                    adjusted_length,
                    vector,
                    vector_dimension,
                )?);
            }
        }
        Ok(postings)
    }

    /// Produces `dimension` pseudo-random values by advancing the generator
    /// once per component; `seed_modifier` is added to each raw state before
    /// it is scaled by `u32::MAX`.
    fn generate_vector(&mut self, dimension: usize, seed_modifier: usize) -> Vec<f32> {
        let mut vector = Vec::with_capacity(dimension);
        for _ in 0..dimension {
            self.rng_state = self
                .rng_state
                .wrapping_mul(1664525)
                .wrapping_add(1013904223);
            let value = (self.rng_state.wrapping_add(seed_modifier as u32) as f32) / (u32::MAX as f32);
            vector.push(value);
        }
        vector
    }

    /// Snaps the byte range `start..start + length` to UTF-8 character
    /// boundaries of `text`.
    ///
    /// Both ends move backwards to the nearest boundary and the end is capped
    /// at the text length. Returns `(start, 0)` when `start` lies at or past
    /// the end of the text; the returned length can also be zero when the
    /// range is shorter than the character it falls into.
    fn adjust_for_utf8_boundaries(&self, text: &str, start: u32, length: u32) -> (u32, u32) {
        let start_usize = start as usize;
        let length_usize = length as usize;
        if start_usize >= text.len() {
            return (start, 0);
        }
        let mut adjusted_start = start_usize;
        while adjusted_start > 0 && !text.is_char_boundary(adjusted_start) {
            adjusted_start -= 1;
        }
        let mut adjusted_end = (adjusted_start + length_usize).min(text.len());
        while adjusted_end > adjusted_start && !text.is_char_boundary(adjusted_end) {
            adjusted_end -= 1;
        }
        (adjusted_start as u32, (adjusted_end - adjusted_start) as u32)
    }
}
/// Records named wall-clock timings of closures, in the order they were run.
pub struct PerformanceTracker {
    /// `(operation name, elapsed time)` pairs, oldest first.
    measurements: Vec<(String, Duration)>,
}

impl Default for PerformanceTracker {
    fn default() -> Self {
        Self::new()
    }
}

impl PerformanceTracker {
    /// Creates a tracker with no measurements.
    pub fn new() -> Self {
        let measurements = Vec::new();
        Self { measurements }
    }

    /// Runs `operation`, records how long it took under `operation_name`, and
    /// hands back the closure's result.
    pub fn measure<F, R>(&mut self, operation_name: &str, operation: F) -> R
    where
        F: FnOnce() -> R,
    {
        let timer = Instant::now();
        let output = operation();
        let elapsed = timer.elapsed();
        let entry = (operation_name.to_owned(), elapsed);
        self.measurements.push(entry);
        output
    }

    /// The most recently recorded measurement, if any.
    pub fn last_measurement(&self) -> Option<&(String, Duration)> {
        self.measurements().last()
    }

    /// All recorded measurements, oldest first.
    pub fn measurements(&self) -> &[(String, Duration)] {
        self.measurements.as_slice()
    }

    /// Sum of all recorded durations.
    pub fn total_time(&self) -> Duration {
        self.measurements.iter().map(|entry| entry.1).sum()
    }

    /// Asserts that the most recent measurement took at most `max_duration`.
    ///
    /// # Panics
    /// Panics if the last measurement exceeded `max_duration`, or if nothing
    /// has been measured yet.
    pub fn assert_last_within(&self, max_duration: Duration) {
        match self.last_measurement() {
            Some((name, duration)) => assert!(
                *duration <= max_duration,
                "Operation '{}' took {:?}, expected <= {:?}",
                name,
                duration,
                max_duration
            ),
            None => panic!("No measurements recorded"),
        }
    }

    /// Prints every measurement followed by the total to standard output.
    pub fn print_summary(&self) {
        println!("Performance Summary:");
        self.measurements
            .iter()
            .for_each(|(name, duration)| println!(" {}: {:?}", name, duration));
        println!(" Total: {:?}", self.total_time());
    }
}
/// Namespace for helpers that assert on [`ShardexError`] results and build
/// inputs meant to trigger errors.
pub struct ErrorTestHelper;

impl ErrorTestHelper {
    /// Asserts that `result` is an error whose variant corresponds to
    /// `expected_type`.
    ///
    /// # Panics
    /// Panics if `result` is `Ok`, or if the error is a different variant
    /// than the one requested.
    pub fn assert_error_type<T: std::fmt::Debug>(result: Result<T, ShardexError>, expected_type: ErrorType) {
        let error = match result {
            Err(error) => error,
            Ok(_) => panic!("Expected error, got success"),
        };
        let kinds_agree = matches!(
            (expected_type, &error),
            (ErrorType::InvalidRange, ShardexError::InvalidRange { .. })
                | (ErrorType::DocumentTooLarge, ShardexError::DocumentTooLarge { .. })
                | (ErrorType::DocumentTextNotFound, ShardexError::DocumentTextNotFound { .. })
                | (ErrorType::InvalidInput, ShardexError::InvalidInput { .. })
                | (ErrorType::TextCorruption, ShardexError::TextCorruption(_))
        );
        if !kinds_agree {
            panic!("Expected error type {:?}, got {:?}", expected_type, error);
        }
    }

    /// Runs `operation` and asserts that it fails with `expected_type`.
    ///
    /// # Panics
    /// Panics under the same conditions as [`ErrorTestHelper::assert_error_type`].
    pub fn expect_error<F, T>(operation: F, expected_type: ErrorType)
    where
        F: FnOnce() -> Result<T, ShardexError>,
        T: std::fmt::Debug,
    {
        let outcome = operation();
        Self::assert_error_type(outcome, expected_type)
    }

    /// Returns a string of `x` characters that is 100 bytes longer than
    /// `size_limit`.
    pub fn generate_oversized_text(size_limit: usize) -> String {
        let total_len = size_limit + 100;
        std::iter::repeat('x').take(total_len).collect()
    }

    /// Returns a sample text for the requested problem category.
    pub fn generate_problematic_text(problem_type: ProblematicTextType) -> String {
        match problem_type {
            ProblematicTextType::NullBytes => String::from("Hello\x00World\x00Test"),
            ProblematicTextType::ControlCharacters => String::from("Text\x01with\x02control\x03chars"),
            ProblematicTextType::VeryLong => "A".repeat(1_000_000),
        }
    }
}

/// Error categories that [`ErrorTestHelper::assert_error_type`] can check
/// for; each maps to the [`ShardexError`] variant of the same name.
#[derive(Debug, Clone, Copy)]
pub enum ErrorType {
    InvalidRange,
    DocumentTooLarge,
    DocumentTextNotFound,
    InvalidInput,
    TextCorruption,
}

/// Categories of awkward input produced by
/// [`ErrorTestHelper::generate_problematic_text`].
#[derive(Debug, Clone, Copy)]
pub enum ProblematicTextType {
    /// Text containing embedded NUL bytes.
    NullBytes,
    /// Text containing the control characters `0x01`..=`0x03`.
    ControlCharacters,
    /// One million `A` characters.
    VeryLong,
}
/// Namespace for assertions that check a [`DocumentTextStorage`] against
/// expected state.
pub struct ValidationHelper;

impl ValidationHelper {
    /// Asserts basic invariants of `storage`: the utilization ratio lies in
    /// `0.0..=1.0`, and emptiness agrees with the entry count (an empty
    /// storage must also report a total text size of zero).
    ///
    /// # Panics
    /// Panics if any of these invariants does not hold.
    pub fn validate_storage_consistency(storage: &DocumentTextStorage) {
        assert!(
            storage.utilization_ratio() >= 0.0,
            "Utilization ratio cannot be negative"
        );
        assert!(
            storage.utilization_ratio() <= 1.0,
            "Utilization ratio cannot exceed 1.0"
        );
        if storage.entry_count() == 0 {
            assert!(storage.is_empty(), "Empty storage should report as empty");
            assert_eq!(storage.total_text_size(), 0, "Empty storage should have zero text size");
        } else {
            assert!(!storage.is_empty(), "Non-empty storage should not report as empty");
        }
    }

    /// Asserts that extracting `start..start + length` of document `doc_id`
    /// from `storage` yields the same bytes as that range of
    /// `expected_full_text`.
    ///
    /// # Panics
    /// Panics if the range exceeds `expected_full_text` (checked before the
    /// storage is touched, so a bad test parameter is reported as such), if
    /// the range does not fall on character boundaries of the expected text,
    /// if the extraction fails, or if the extracted text differs.
    pub fn validate_text_extraction(
        storage: &DocumentTextStorage,
        doc_id: DocumentId,
        expected_full_text: &str,
        start: u32,
        length: u32,
    ) {
        let expected_start = start as usize;
        let expected_end = expected_start + length as usize;
        assert!(
            expected_end <= expected_full_text.len(),
            "Test parameters exceed document bounds"
        );
        let expected = &expected_full_text[expected_start..expected_end];
        let extracted = storage
            .extract_text_substring(doc_id, start, length)
            .unwrap_or_else(|e| {
                panic!(
                    "Failed to extract range {}..{}: {:?}",
                    expected_start, expected_end, e
                )
            });
        assert_eq!(
            extracted, expected,
            "Extracted text doesn't match expected for range {}..{}",
            expected_start, expected_end
        );
    }

    /// Asserts that every posting's range, extracted from `storage` for the
    /// posting's own document id, matches the same range of `expected_text`.
    ///
    /// `_doc_id` is accepted for call-site symmetry but not consulted; each
    /// posting carries its own document id.
    ///
    /// # Panics
    /// Panics, naming the posting index, if a posting's range exceeds
    /// `expected_text` (checked before the storage is touched), if the
    /// extraction fails, or if the extracted text differs.
    pub fn validate_postings_extraction(
        storage: &DocumentTextStorage,
        _doc_id: DocumentId,
        expected_text: &str,
        postings: &[Posting],
    ) {
        for (i, posting) in postings.iter().enumerate() {
            let expected_start = posting.start as usize;
            let expected_end = expected_start + posting.length as usize;
            assert!(
                expected_end <= expected_text.len(),
                "Posting {} has invalid range {}..{} for document of length {}",
                i,
                expected_start,
                expected_end,
                expected_text.len()
            );
            let expected = &expected_text[expected_start..expected_end];
            let extracted = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap_or_else(|e| {
                    panic!("Failed to extract posting {}: {:?}", i, e);
                });
            assert_eq!(
                extracted, expected,
                "Posting {} extraction mismatch at {}..{}",
                i, expected_start, expected_end
            );
        }
    }
}
/// A named batch of documents to store and read back, with an optional time
/// budget for the whole run.
pub struct TestScenario {
    /// Scenario name; echoed in panic messages.
    pub name: String,
    /// Documents to store, each paired with the id generated for it.
    pub documents: Vec<(DocumentId, String)>,
    /// Initialised to zero by [`TestScenario::new`]; not modified by any
    /// method of this type.
    pub expected_operations: usize,
    /// Upper bound on the duration of [`TestScenario::execute`], if any.
    pub performance_expectations: Option<Duration>,
}

impl TestScenario {
    /// Creates an empty scenario called `name` with no time budget.
    pub fn new(name: &str) -> Self {
        Self {
            name: name.to_owned(),
            documents: Vec::new(),
            expected_operations: 0,
            performance_expectations: None,
        }
    }

    /// Appends `text` as a document with a freshly generated id.
    pub fn add_document(mut self, text: String) -> Self {
        let entry = (DocumentId::new(), text);
        self.documents.push(entry);
        self
    }

    /// Sets the time budget enforced by [`TestScenario::execute`].
    pub fn expect_completion_within(self, max_duration: Duration) -> Self {
        Self {
            performance_expectations: Some(max_duration),
            ..self
        }
    }

    /// Stores every document in `storage`, reads each one back immediately
    /// and checks it is unchanged, then returns the total elapsed time.
    ///
    /// # Panics
    /// Panics if storing or retrieving a document fails, if retrieved text
    /// differs from what was stored, or if a time budget is set and exceeded.
    pub fn execute(&self, storage: &mut DocumentTextStorage) -> Duration {
        let timer = Instant::now();
        for (doc_id, text) in self.documents.iter() {
            if let Err(e) = storage.store_text_safe(*doc_id, text) {
                panic!("Failed to store document in scenario '{}': {:?}", self.name, e);
            }
            let retrieved = storage.get_text_safe(*doc_id).unwrap();
            assert_eq!(retrieved, *text);
        }
        let duration = timer.elapsed();

        if let Some(max_duration) = self.performance_expectations {
            assert!(
                duration <= max_duration,
                "Scenario '{}' took {:?}, expected <= {:?}",
                self.name,
                duration,
                max_duration
            );
        }
        duration
    }
}
/// Generates a document with a fresh [`TextGenerator`]; the number of words
/// is determined by `size_category` (see [`DocumentSize`]).
pub fn create_test_document(size_category: DocumentSize) -> String {
    let word_count: usize = match size_category {
        DocumentSize::Tiny => 10,
        DocumentSize::Small => 100,
        DocumentSize::Medium => 1000,
        DocumentSize::Large => 10000,
        DocumentSize::Huge => 100000,
    };
    TextGenerator::new().generate_text(word_count)
}

/// Size classes understood by [`create_test_document`].
#[derive(Debug, Clone, Copy)]
pub enum DocumentSize {
    /// 10 words.
    Tiny,
    /// 100 words.
    Small,
    /// 1,000 words.
    Medium,
    /// 10,000 words.
    Large,
    /// 100,000 words.
    Huge,
}
/// Convenience wrapper that runs [`PostingGenerator::generate_postings`] on a
/// freshly created generator.
///
/// # Errors
/// Returns whatever error the generator reports.
pub fn create_test_postings(
    document_id: DocumentId,
    text: &str,
    vector_dimension: usize,
    posting_count: usize,
) -> Result<Vec<Posting>, ShardexError> {
    PostingGenerator::new().generate_postings(document_id, text, vector_dimension, posting_count)
}
/// Runs `operation` once and returns its result together with the wall-clock
/// time the call took.
pub fn measure_operation<F, R>(operation: F) -> (R, Duration)
where
    F: FnOnce() -> R,
{
    let timer = Instant::now();
    let output = operation();
    // Read the clock before building the tuple so only the call is timed.
    let elapsed = timer.elapsed();
    (output, elapsed)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Consecutive draws from one generator are non-empty and differ, and
    /// structured text contains line breaks and sentence terminators.
    #[test]
    fn test_text_generator() {
        let mut generator = TextGenerator::new();

        let first = generator.generate_text(10);
        let second = generator.generate_text(10);
        assert_ne!(first, second);
        assert!(!first.is_empty());
        assert!(!second.is_empty());

        let structured = generator.generate_structured_text(2, 3, 5);
        assert!(structured.contains('\n'));
        assert!(structured.contains('.'));
    }

    /// Segment-based postings carry the requested document id, a non-empty
    /// range starting inside the text, and a vector of the requested size.
    #[test]
    fn test_posting_generator() {
        let text = "The quick brown fox jumps over the lazy dog";
        let doc_id = DocumentId::new();

        let mut generator = PostingGenerator::new();
        let postings = generator.generate_postings(doc_id, text, 64, 3).unwrap();
        assert_eq!(postings.len(), 3);

        let text_len = text.len() as u32;
        for posting in postings.iter() {
            assert_eq!(posting.document_id, doc_id);
            assert!(posting.start < text_len);
            assert!(posting.length > 0);
            assert_eq!(posting.vector.len(), 64);
        }
    }

    /// `measure` returns the closure's value and records one measurement of
    /// at least the time the closure slept.
    #[test]
    fn test_performance_tracker() {
        let pause = Duration::from_millis(10);
        let mut tracker = PerformanceTracker::new();

        let result = tracker.measure("test_operation", || {
            std::thread::sleep(pause);
            42
        });

        assert_eq!(result, 42);
        assert_eq!(tracker.measurements().len(), 1);
        assert!(tracker.total_time() >= pause);
    }

    /// Text stored through the environment can be read back both before and
    /// after the storage is reopened.
    #[test]
    fn test_document_test_environment() {
        let text = "Test document for environment";
        let doc_id = DocumentId::new();
        let mut env = DocumentTestEnvironment::new_default("test_env");

        env.storage().store_text_safe(doc_id, text).unwrap();
        let before_reopen = env.storage_ref().get_text_safe(doc_id).unwrap();
        assert_eq!(before_reopen, text);

        env.reopen();
        let after_reopen = env.storage_ref().get_text_safe(doc_id).unwrap();
        assert_eq!(after_reopen, text);
    }
}