use shardex::error::ShardexError;
use shardex::identifiers::DocumentId;
use shardex::structures::Posting;
use shardex::DocumentTextStorage;
use tempfile::TempDir;
/// Builds a deterministic test embedding: component i equals i / dimension.
/// A dimension of 0 yields an empty vector (no division occurs).
fn create_test_embedding(dimension: usize) -> Vec<f32> {
    let mut values = Vec::with_capacity(dimension);
    for idx in 0..dimension {
        values.push(idx as f32 / dimension as f32);
    }
    values
}
/// Splits `text` into `posting_count` contiguous byte segments and wraps each
/// in a `Posting` with a deterministic embedding of `vector_dimension` floats.
///
/// The last segment absorbs any remainder. Segments are byte offsets, so for
/// multi-byte UTF-8 text a boundary may fall mid-character — callers that
/// extract substrings must tolerate that. If the text is shorter than the
/// requested posting count, fewer postings are returned (empty segments stop
/// the loop).
fn create_test_postings(
    document_id: DocumentId,
    text: &str,
    vector_dimension: usize,
    posting_count: usize,
) -> Result<Vec<Posting>, ShardexError> {
    if posting_count == 0 {
        return Ok(Vec::new());
    }
    let total_bytes = text.len();
    let chunk = total_bytes / posting_count;
    let mut result = Vec::with_capacity(posting_count);
    for idx in 0..posting_count {
        let offset = idx * chunk;
        let is_last = idx + 1 == posting_count;
        // Nominal segment size; the final segment takes everything left.
        let nominal = if is_last { total_bytes - offset } else { chunk };
        // Clamp to the bytes actually remaining, then stop on empty segments.
        let span = nominal.min(total_bytes - offset) as u32;
        if span == 0 {
            break;
        }
        let embedding: Vec<f32> = (0..vector_dimension)
            .map(|component| ((idx * vector_dimension + component) as f32) / 1000.0)
            .collect();
        result.push(Posting::new(document_id, offset as u32, span, embedding, vector_dimension)?);
    }
    Ok(result)
}
#[test]
fn test_document_text_storage_creation_and_opening() {
    // Creating a storage should initialize both backing files on disk and
    // report the configured size limit; reopening finds an empty store.
    let temp_dir = TempDir::new().unwrap();
    let limit = 1024 * 1024;
    {
        let created = DocumentTextStorage::create(&temp_dir, limit).unwrap();
        assert_eq!(created.max_document_size(), limit);
        assert!(created.is_empty());
    }
    for file_name in ["text_index.dat", "text_data.dat"] {
        assert!(temp_dir.path().join(file_name).exists());
    }
    let reopened = DocumentTextStorage::open(&temp_dir).unwrap();
    assert!(reopened.is_empty());
}
#[test]
fn test_complete_document_storage_workflow() {
    // Store a document, read it back, then check that every generated posting
    // extracts exactly the bytes that direct slicing of the source yields.
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();
    let doc_id = DocumentId::new();
    let text =
        "The quick brown fox jumps over the lazy dog. This is a sample document for testing text extraction workflows.";
    storage.store_text_safe(doc_id, text).unwrap();
    assert_eq!(storage.get_text_safe(doc_id).unwrap(), text);
    let postings = create_test_postings(doc_id, text, 128, 3).unwrap();
    assert_eq!(postings.len(), 3);
    for posting in &postings {
        let begin = posting.start as usize;
        let end = begin + posting.length as usize;
        let snippet = storage
            .extract_text_substring(posting.document_id, posting.start, posting.length)
            .unwrap();
        assert_eq!(
            snippet,
            &text[begin..end],
            "Text mismatch for posting at {}..{}",
            begin, end
        );
    }
}
#[test]
fn test_document_replacement_workflow() {
    // Replacing a document's text must make new postings valid, may invalidate
    // old out-of-range postings, and leaves both versions in the entry count
    // (append-only index).
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();
    let doc_id = DocumentId::new();
    let original_text = "Original document content for replacement testing.";
    let updated_text = "Updated document content with different text for replacement workflow validation.";

    // Checks every posting's extraction against direct slicing of `source`.
    let verify = |store: &DocumentTextStorage, source: &str, items: &[Posting]| {
        for p in items {
            let begin = p.start as usize;
            let end = begin + p.length as usize;
            let snippet = store
                .extract_text_substring(p.document_id, p.start, p.length)
                .unwrap();
            assert_eq!(snippet, &source[begin..end]);
        }
    };

    storage.store_text_safe(doc_id, original_text).unwrap();
    assert_eq!(storage.get_text_safe(doc_id).unwrap(), original_text);
    let original_postings = create_test_postings(doc_id, original_text, 64, 2).unwrap();
    verify(&storage, original_text, &original_postings);

    storage.store_text_safe(doc_id, updated_text).unwrap();
    assert_eq!(storage.get_text_safe(doc_id).unwrap(), updated_text);
    let updated_postings = create_test_postings(doc_id, updated_text, 64, 3).unwrap();
    verify(&storage, updated_text, &updated_postings);

    // Stale postings that now point past the updated text must fail.
    for stale in &original_postings {
        if stale.start as usize + stale.length as usize > updated_text.len() {
            let outcome = storage.extract_text_substring(stale.document_id, stale.start, stale.length);
            assert!(outcome.is_err(), "Expected error for out-of-bounds posting");
        }
    }
    assert_eq!(storage.entry_count(), 2);
}
#[test]
fn test_multiple_documents_workflow() {
    // Store several documents, then verify full-text retrieval and
    // posting-based substring extraction for each one.
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();
    let documents = vec![
        ("The quick brown fox jumps over the lazy dog.", 2),
        (
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
            4,
        ),
        (
            "Rust is a systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.",
            3,
        ),
        (
            "Vector databases are specialized databases designed to store and search high-dimensional vector data efficiently.",
            3,
        ),
        (
            "Machine learning models generate embeddings that capture semantic meaning in numerical vector form.",
            2,
        ),
    ];
    let mut doc_data = Vec::with_capacity(documents.len());
    for (text, posting_count) in &documents {
        let doc_id = DocumentId::new();
        storage.store_text_safe(doc_id, text).unwrap();
        let postings = create_test_postings(doc_id, text, 128, *posting_count).unwrap();
        doc_data.push((doc_id, text, postings));
    }
    for (doc_id, expected_text, postings) in &doc_data {
        assert_eq!(storage.get_text_safe(*doc_id).unwrap(), **expected_text);
        for posting in postings {
            let begin = posting.start as usize;
            let end = begin + posting.length as usize;
            let snippet = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();
            assert_eq!(
                snippet,
                &expected_text[begin..end],
                "Text mismatch for document {} posting at {}..{}",
                doc_id, begin, end
            );
        }
    }
    assert_eq!(storage.entry_count(), documents.len() as u32);
}
#[test]
fn test_unicode_document_integration() {
    // Multi-byte text must round-trip exactly. Postings use byte offsets, so a
    // segment boundary may split a UTF-8 code point; extraction is then allowed
    // to fail with InvalidRange — any other error is a bug.
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();
    let unicode_documents = vec![
        ("Chinese", "你好世界!这是一个中文测试文档。"),
        ("Japanese", "こんにちは世界!これは日本語のテスト文書です。"),
        ("Arabic", "مرحبا بالعالم! هذا مستند اختبار باللغة العربية."),
        ("Emoji", "Hello 🌍 World! 🚀 This is a test with emojis 🎉✨"),
        ("Mixed", "English 中文 日本語 العربية 🌍 混合内容文档"),
    ];
    let mut unicode_data = Vec::with_capacity(unicode_documents.len());
    for (name, text) in &unicode_documents {
        let doc_id = DocumentId::new();
        storage.store_text_safe(doc_id, text).unwrap();
        let postings = create_test_postings(doc_id, text, 64, 2).unwrap();
        unicode_data.push((doc_id, *name, *text, postings));
    }
    for (doc_id, name, expected_text, postings) in &unicode_data {
        assert_eq!(storage.get_text_safe(*doc_id).unwrap(), *expected_text, "Text mismatch for {}", name);
        for posting in postings {
            match storage.extract_text_substring(posting.document_id, posting.start, posting.length) {
                Ok(snippet) => {
                    assert!(!snippet.is_empty(), "Empty extraction for {}", name);
                    assert!(snippet.chars().count() > 0, "No valid characters for {}", name);
                }
                // Acceptable: the byte range split a multi-byte character.
                Err(ShardexError::InvalidRange { .. }) => {}
                Err(e) => panic!("Unexpected error for {} posting: {:?}", name, e),
            }
        }
    }
}
#[test]
fn test_crash_recovery_simulation() {
    // Simulates process restarts by dropping and reopening the storage,
    // checking that persisted text and entry counts survive each cycle.
    let temp_dir = TempDir::new().unwrap();
    let doc_id = DocumentId::new();
    let text = "Document for crash recovery testing. This text should survive across storage instances.";

    // Regenerates the deterministic postings and checks extraction matches
    // direct slicing of the source text.
    let check_postings = |store: &DocumentTextStorage| {
        let postings = create_test_postings(doc_id, text, 64, 2).unwrap();
        for posting in &postings {
            let begin = posting.start as usize;
            let end = begin + posting.length as usize;
            let snippet = store
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();
            assert_eq!(snippet, &text[begin..end]);
        }
    };

    // First "session": create, store, verify, sync to disk.
    {
        let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();
        storage.store_text_safe(doc_id, text).unwrap();
        assert_eq!(storage.get_text_safe(doc_id).unwrap(), text);
        assert_eq!(storage.entry_count(), 1);
        check_postings(&storage);
        storage.sync().unwrap();
    }
    // Second "session": reopen and confirm everything survived.
    {
        let storage = DocumentTextStorage::open(&temp_dir).unwrap();
        assert_eq!(storage.entry_count(), 1);
        assert_eq!(storage.get_text_safe(doc_id).unwrap(), text);
        check_postings(&storage);
    }
    // Third "session": add a second document after recovery.
    {
        let mut storage = DocumentTextStorage::open(&temp_dir).unwrap();
        let additional_doc = DocumentId::new();
        let additional_text = "Additional document added after recovery.";
        storage.store_text_safe(additional_doc, additional_text).unwrap();
        assert_eq!(storage.entry_count(), 2);
        assert_eq!(storage.get_text_safe(doc_id).unwrap(), text);
        assert_eq!(storage.get_text_safe(additional_doc).unwrap(), additional_text);
        storage.sync().unwrap();
    }
    // Final reopen: both documents still present.
    {
        let storage = DocumentTextStorage::open(&temp_dir).unwrap();
        assert_eq!(storage.entry_count(), 2);
        assert_eq!(storage.get_text_safe(doc_id).unwrap(), text);
    }
}
#[test]
fn test_large_document_integration() {
    // Verifies that a multi-megabyte document round-trips intact and that
    // posting-based substring extraction still matches direct slicing.
    let temp_dir = TempDir::new().unwrap();
    let large_limit = 5 * 1024 * 1024;
    let mut storage = DocumentTextStorage::create(&temp_dir, large_limit).unwrap();
    let doc_id = DocumentId::new();
    let base_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ";
    // ~230 bytes * 5000 repetitions keeps the document under the 5 MiB limit.
    let large_text = base_text.repeat(5000);
    storage.store_text_safe(doc_id, &large_text).unwrap();
    let retrieved = storage.get_text_safe(doc_id).unwrap();
    assert_eq!(retrieved, large_text);
    assert_eq!(retrieved.len(), large_text.len());
    let postings = create_test_postings(doc_id, &large_text, 128, 10).unwrap();
    assert_eq!(postings.len(), 10);
    for (i, posting) in postings.iter().enumerate() {
        let expected_start = posting.start as usize;
        let expected_end = expected_start + posting.length as usize;
        let extracted = storage
            .extract_text_substring(posting.document_id, posting.start, posting.length)
            .unwrap();
        assert_eq!(
            extracted,
            &large_text[expected_start..expected_end],
            "Mismatch for posting {} at {}..{}",
            i, expected_start, expected_end
        );
        assert!(!extracted.is_empty(), "Empty extraction for posting {}", i);
    }
    // Sanity-check retrieval latency. The bound is deliberately generous:
    // a tight wall-clock assertion (previously 100 ms) is flaky on loaded
    // CI machines and in debug builds, and a unit test cannot meaningfully
    // benchmark anyway — this only catches pathological slowdowns.
    let start_time = std::time::Instant::now();
    let _retrieved = storage.get_text_safe(doc_id).unwrap();
    let retrieval_duration = start_time.elapsed();
    assert!(
        retrieval_duration.as_millis() < 5_000,
        "Large document retrieval too slow: {:?}",
        retrieval_duration
    );
}
#[test]
fn test_document_versioning_with_postings() {
    // Re-storing under the same id acts as versioning: the latest text wins on
    // reads, while each stored version is counted in the append-only index.
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();
    let doc_id = DocumentId::new();
    let versions = [
        "Version 1: Initial document content for versioning test.",
        "Version 2: Updated document with additional content for more comprehensive testing of versioning workflow.",
        "Version 3: Final version with extensive content to test how postings work across document versions and updates.",
    ];
    let mut version_postings = Vec::with_capacity(versions.len());
    for (version_num, text) in versions.iter().enumerate() {
        storage.store_text_safe(doc_id, text).unwrap();
        assert_eq!(
            storage.get_text_safe(doc_id).unwrap(),
            *text,
            "Version {} not stored correctly",
            version_num + 1
        );
        let postings = create_test_postings(doc_id, text, 64, 2).unwrap();
        for posting in &postings {
            let begin = posting.start as usize;
            let end = begin + posting.length as usize;
            let snippet = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();
            assert_eq!(snippet, &text[begin..end]);
        }
        version_postings.push(postings);
    }
    assert_eq!(storage.entry_count(), 3);
    assert_eq!(storage.get_text_safe(doc_id).unwrap(), versions[2]);
    let latest_text = versions[2];
    // Postings built against the latest version extract correctly.
    for posting in &version_postings[2] {
        let begin = posting.start as usize;
        let end = begin + posting.length as usize;
        let snippet = storage
            .extract_text_substring(posting.document_id, posting.start, posting.length)
            .unwrap();
        assert_eq!(snippet, &latest_text[begin..end]);
    }
    // First-version postings may now point past the latest (longer or shorter)
    // text; in-range ones still succeed, out-of-range ones must error.
    for posting in &version_postings[0] {
        let result = storage.extract_text_substring(posting.document_id, posting.start, posting.length);
        if posting.start as usize + posting.length as usize > latest_text.len() {
            assert!(result.is_err());
        } else {
            assert!(result.is_ok());
        }
    }
}
#[test]
fn test_realistic_usage_workflow() {
    // End-to-end flow: store documents, build word-level postings with test
    // embeddings, then re-read everything and check snippets and statistics.
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();
    // Each entry: (document text, (byte offset, byte length) word positions).
    let documents = [
        (
            "Technical documentation about Rust programming language features and best practices.",
            vec![(0, 9), (10, 13), (24, 5), (42, 8), (51, 8), (60, 3)],
        ),
        (
            "Machine learning algorithms for natural language processing and text analysis.",
            vec![(0, 7), (8, 8), (17, 10), (31, 7), (39, 8), (48, 10)],
        ),
        (
            "Database design principles for scalable web applications and data management.",
            vec![(0, 8), (9, 6), (16, 10), (30, 9), (43, 11), (55, 4)],
        ),
    ];
    let mut doc_data = Vec::with_capacity(documents.len());
    for (i, (text, word_positions)) in documents.iter().enumerate() {
        let doc_id = DocumentId::new();
        storage.store_text_safe(doc_id, text).unwrap();
        let postings: Vec<Posting> = word_positions
            .iter()
            .map(|(start, length)| {
                Posting::new(doc_id, *start as u32, *length as u32, create_test_embedding(128), 128).unwrap()
            })
            .collect();
        doc_data.push((doc_id, *text, postings.clone()));
        assert_eq!(storage.get_text_safe(doc_id).unwrap(), *text);
        for posting in &postings {
            let begin = posting.start as usize;
            let end = begin + posting.length as usize;
            let snippet = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();
            assert_eq!(snippet, &text[begin..end]);
            assert!(!snippet.is_empty());
        }
        println!("Processed document {} with {} postings", i + 1, postings.len());
    }
    // Second pass over everything stored, as a later reader would see it.
    for (doc_id, text, postings) in &doc_data {
        assert_eq!(storage.get_text_safe(*doc_id).unwrap(), *text);
        for posting in postings {
            let snippet = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();
            assert!(!snippet.is_empty());
            assert!(!snippet.trim().is_empty());
            assert!(text.contains(&snippet));
        }
    }
    assert_eq!(storage.entry_count(), documents.len() as u32);
    assert!(storage.total_text_size() > 0);
    assert!(storage.utilization_ratio() > 0.0);
    println!("Integration test completed successfully:");
    println!(" Documents: {}", documents.len());
    println!(" Total entries: {}", storage.entry_count());
    println!(" Total text size: {} bytes", storage.total_text_size());
    println!(" Utilization ratio: {:.2}", storage.utilization_ratio());
}
#[test]
fn test_error_recovery_integration() {
    // An oversized store attempt must fail with DocumentTooLarge, without
    // corrupting previously stored documents or blocking later stores.
    let temp_dir = TempDir::new().unwrap();
    let small_limit = 200;
    let mut storage = DocumentTextStorage::create(&temp_dir, small_limit).unwrap();
    let doc_id1 = DocumentId::new();
    let doc_id2 = DocumentId::new();
    let doc_id3 = DocumentId::new();
    let small_text = "Small text that fits within limits.";
    let large_text = "This is a much larger text that will definitely exceed the small size limit that we have configured for this test to ensure proper error handling and recovery. We need to add even more text here to make it exceed the 200 byte limit for sure, so here is some additional padding text that should push it well over the limit.";
    let another_small = "Another small text.";

    storage.store_text_safe(doc_id1, small_text).unwrap();
    assert_eq!(storage.get_text_safe(doc_id1).unwrap(), small_text);

    // The oversized document is rejected with the specific error variant.
    match storage.store_text_safe(doc_id2, large_text).unwrap_err() {
        ShardexError::DocumentTooLarge { .. } => {}
        e => panic!("Expected DocumentTooLarge error, got {:?}", e),
    }

    // The failure left prior state untouched, and new stores still work.
    assert_eq!(storage.get_text_safe(doc_id1).unwrap(), small_text);
    assert_eq!(storage.entry_count(), 1);
    storage.store_text_safe(doc_id3, another_small).unwrap();
    assert_eq!(storage.get_text_safe(doc_id3).unwrap(), another_small);
    assert_eq!(storage.entry_count(), 2);
    assert_eq!(storage.get_text_safe(doc_id1).unwrap(), small_text);
    assert_eq!(storage.get_text_safe(doc_id3).unwrap(), another_small);

    // The rejected document was never stored at all.
    match storage.get_text_safe(doc_id2).unwrap_err() {
        ShardexError::DocumentTextNotFound { .. } => {}
        e => panic!("Expected DocumentTextNotFound error, got {:?}", e),
    }

    // Substring extraction still works for both surviving documents.
    for (id, source) in [(doc_id1, small_text), (doc_id3, another_small)] {
        let postings = create_test_postings(id, source, 64, 1).unwrap();
        for posting in &postings {
            let begin = posting.start as usize;
            let end = begin + posting.length as usize;
            let snippet = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();
            assert_eq!(snippet, &source[begin..end]);
        }
    }
}