shardex 0.1.0

A high-performance memory-mapped vector search engine with ACID transactions and incremental updates
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
//! Integration tests for document text storage workflows
//!
//! Tests complete document workflows including:
//! - Document replacement with postings
//! - Text extraction from search results  
//! - Cross-component integration
//! - Crash recovery scenarios
//! - Realistic usage patterns

use shardex::error::ShardexError;
use shardex::identifiers::DocumentId;
use shardex::structures::Posting;
use shardex::DocumentTextStorage;
use tempfile::TempDir;

/// Build a deterministic embedding with `dimension` components.
///
/// Component `i` holds `i / dimension`, so the values rise linearly from
/// `0.0` and stay strictly below `1.0`. A `dimension` of zero produces an
/// empty vector.
fn create_test_embedding(dimension: usize) -> Vec<f32> {
    let scale = dimension as f32;
    let mut embedding = Vec::with_capacity(dimension);
    for index in 0..dimension {
        embedding.push(index as f32 / scale);
    }
    embedding
}

/// Build up to `posting_count` postings that tile `text` with equal-sized byte spans.
///
/// The text is cut into `text.len() / posting_count` byte segments; the final
/// posting also absorbs whatever remainder is left. Offsets are raw byte
/// offsets, so a span may begin or end inside a multi-byte UTF-8 character.
///
/// Generation stops at the first span that would be empty (for example when
/// `posting_count` exceeds the text length), so fewer than `posting_count`
/// postings may be returned. A `posting_count` of zero yields an empty vector.
///
/// Each posting carries its own vector of `vector_dimension` components,
/// derived from the posting index so that no two postings share a vector.
///
/// # Errors
///
/// Propagates any error returned by `Posting::new`.
fn create_test_postings(
    document_id: DocumentId,
    text: &str,
    vector_dimension: usize,
    posting_count: usize,
) -> Result<Vec<Posting>, ShardexError> {
    if posting_count == 0 {
        return Ok(Vec::new());
    }

    let total_bytes = text.len();
    let stride = total_bytes / posting_count;
    let final_index = posting_count - 1;
    let mut result = Vec::new();

    for index in 0..posting_count {
        let byte_offset = index * stride;
        let start = byte_offset as u32;

        // Every segment is `stride` bytes except the last, which takes the rest.
        let nominal = if index == final_index {
            (total_bytes - byte_offset) as u32
        } else {
            stride as u32
        };

        // Clamp to the bytes that actually remain after `start`.
        let remaining = (total_bytes - start as usize) as u32;
        let span = nominal.min(remaining);

        if span == 0 {
            // Nothing left to cover: stop generating postings altogether.
            break;
        }

        // Offset the components by the posting index to keep vectors distinct.
        let mut vector: Vec<f32> = Vec::with_capacity(vector_dimension);
        for component in 0..vector_dimension {
            vector.push(((index * vector_dimension + component) as f32) / 1000.0);
        }

        result.push(Posting::new(document_id, start, span, vector, vector_dimension)?);
    }

    Ok(result)
}

/// A freshly created store reports its size limit, is empty, leaves its two
/// backing files on disk, and can be reopened (still empty) afterwards.
#[test]
fn test_document_text_storage_creation_and_opening() {
    let temp_dir = TempDir::new().unwrap();
    let max_size = 1024 * 1024; // 1MB

    // Session 1: create the store, then let it drop at the end of the scope.
    {
        let created = DocumentTextStorage::create(&temp_dir, max_size).unwrap();
        assert_eq!(created.max_document_size(), max_size);
        assert!(created.is_empty());
    }

    // Both backing files must be present once the first handle is gone.
    let root = temp_dir.path();
    assert!(root.join("text_index.dat").exists());
    assert!(root.join("text_data.dat").exists());

    // Session 2: reopen the same directory.
    {
        let reopened = DocumentTextStorage::open(&temp_dir).unwrap();
        assert!(reopened.is_empty());
    }
}

/// Round-trips a single document and checks that every posting's byte range
/// extracts exactly the corresponding slice of the source text.
#[test]
fn test_complete_document_storage_workflow() {
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();

    let doc_id = DocumentId::new();
    let text =
        "The quick brown fox jumps over the lazy dog. This is a sample document for testing text extraction workflows.";

    // Write the document, then read the whole thing back.
    storage.store_text_safe(doc_id, text).unwrap();
    let retrieved_text = storage.get_text_safe(doc_id).unwrap();
    assert_eq!(retrieved_text, text);

    // Three postings tiling the document.
    let postings = create_test_postings(doc_id, text, 128, 3).unwrap();
    assert_eq!(postings.len(), 3);

    // Each posting must pull out the same bytes that direct slicing gives.
    for posting in postings.iter() {
        let begin = posting.start as usize;
        let end = begin + posting.length as usize;

        let extracted = storage
            .extract_text_substring(posting.document_id, posting.start, posting.length)
            .unwrap();

        assert_eq!(
            extracted,
            &text[begin..end],
            "Text mismatch for posting at {}..{}",
            begin,
            end
        );
    }
}

/// Stores a document, replaces it under the same id, and checks that postings
/// built for each version extract correctly while that version is current.
#[test]
fn test_document_replacement_workflow() {
    /// Assert that every posting extracts exactly the bytes it addresses in `text`.
    fn assert_postings_extract(storage: &DocumentTextStorage, text: &str, postings: &[Posting]) {
        for posting in postings {
            let extracted = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();

            let begin = posting.start as usize;
            let end = begin + posting.length as usize;

            assert_eq!(extracted, &text[begin..end]);
        }
    }

    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();

    let doc_id = DocumentId::new();
    let original_text = "Original document content for replacement testing.";
    let updated_text = "Updated document content with different text for replacement workflow validation.";

    // First version: store it, read it back, and exercise its postings.
    storage.store_text_safe(doc_id, original_text).unwrap();
    assert_eq!(storage.get_text_safe(doc_id).unwrap(), original_text);

    let original_postings = create_test_postings(doc_id, original_text, 64, 2).unwrap();
    assert_postings_extract(&storage, original_text, &original_postings);

    // Second version under the same id: lookups must now return the new text.
    storage.store_text_safe(doc_id, updated_text).unwrap();
    assert_eq!(storage.get_text_safe(doc_id).unwrap(), updated_text);

    let updated_postings = create_test_postings(doc_id, updated_text, 64, 3).unwrap();
    assert_postings_extract(&storage, updated_text, &updated_postings);

    // Stale postings whose range runs past the end of the new text must be rejected.
    // Postings that still fit inside the new text are deliberately not checked here.
    for stale in original_postings.iter() {
        let stale_end = stale.start as usize + stale.length as usize;
        if stale_end > updated_text.len() {
            let result = storage.extract_text_substring(stale.document_id, stale.start, stale.length);
            assert!(result.is_err(), "Expected error for out-of-bounds posting");
        }
    }

    // Replacement appends a new entry rather than overwriting, so both versions are counted.
    assert_eq!(storage.entry_count(), 2);
}

/// Stores several independent documents and verifies, for each one, full-text
/// retrieval and per-posting extraction, then checks the final entry count.
#[test]
fn test_multiple_documents_workflow() {
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();

    // (document text, number of postings to generate for it)
    let documents = vec![
        ("The quick brown fox jumps over the lazy dog.", 2),
        (
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
            4,
        ),
        (
            "Rust is a systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.",
            3,
        ),
        (
            "Vector databases are specialized databases designed to store and search high-dimensional vector data efficiently.",
            3,
        ),
        (
            "Machine learning models generate embeddings that capture semantic meaning in numerical vector form.",
            2,
        ),
    ];

    // Ingest phase: store every document and remember its id, text and postings.
    let mut stored = Vec::new();
    for &(text, posting_count) in documents.iter() {
        let doc_id = DocumentId::new();

        storage.store_text_safe(doc_id, text).unwrap();
        let postings = create_test_postings(doc_id, text, 128, posting_count).unwrap();

        stored.push((doc_id, text, postings));
    }

    // Verification phase: runs only after all documents have been written.
    for (doc_id, expected_text, postings) in &stored {
        let retrieved_text = storage.get_text_safe(*doc_id).unwrap();
        assert_eq!(retrieved_text, *expected_text);

        for posting in postings.iter() {
            let begin = posting.start as usize;
            let end = begin + posting.length as usize;

            let extracted = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();

            assert_eq!(
                extracted,
                &expected_text[begin..end],
                "Text mismatch for document {} posting at {}..{}",
                doc_id,
                begin,
                end
            );
        }
    }

    assert_eq!(storage.entry_count(), documents.len() as u32);
}

/// Stores documents containing multi-byte UTF-8 text and checks that full-text
/// retrieval is exact and that posting extraction either yields non-empty text
/// or fails with `ShardexError::InvalidRange` (never any other error).
#[test]
fn test_unicode_document_integration() {
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();

    // (label used in failure messages, document text)
    let unicode_documents = vec![
        ("Chinese", "你好世界!这是一个中文测试文档。"),
        ("Japanese", "こんにちは世界!これは日本語のテスト文書です。"),
        ("Arabic", "مرحبا بالعالم! هذا مستند اختبار باللغة العربية."),
        ("Emoji", "Hello 🌍 World! 🚀 This is a test with emojis 🎉✨"),
        ("Mixed", "English 中文 日本語 العربية 🌍 混合内容文档"),
    ];

    let mut stored = Vec::new();

    for &(name, text) in unicode_documents.iter() {
        let doc_id = DocumentId::new();

        storage.store_text_safe(doc_id, text).unwrap();

        // The helper splits on raw byte offsets, so a posting may begin or end
        // in the middle of a multi-byte character.
        let postings = create_test_postings(doc_id, text, 64, 2).unwrap();

        stored.push((doc_id, name, text, postings));
    }

    for (doc_id, name, expected_text, postings) in &stored {
        let retrieved_text = storage.get_text_safe(*doc_id).unwrap();
        assert_eq!(retrieved_text, *expected_text, "Text mismatch for {}", name);

        for posting in postings.iter() {
            let outcome = storage.extract_text_substring(posting.document_id, posting.start, posting.length);

            match outcome {
                Err(ShardexError::InvalidRange { .. }) => {
                    // Tolerated: the byte range did not fall on character boundaries.
                    // What matters is that the call neither panics nor hands back broken text.
                }
                Ok(extracted) => {
                    // A successful extraction must contain at least one character.
                    assert!(!extracted.is_empty(), "Empty extraction for {}", name);
                    assert!(extracted.chars().count() > 0, "No valid characters for {}", name);
                }
                Err(e) => panic!("Unexpected error for {} posting: {:?}", name, e),
            }
        }
    }
}

/// Checks that stored text persists across four successive storage sessions
/// on the same directory: populate, reopen and verify, append, reopen and verify.
///
/// NOTE(review): each session ends with an explicit `sync()` and/or a normal
/// drop, so this exercises a clean close-and-reopen cycle rather than an
/// abrupt crash with unsynced data.
#[test]
fn test_crash_recovery_simulation() {
    let temp_dir = TempDir::new().unwrap();
    let doc_id = DocumentId::new();
    let text = "Document for crash recovery testing. This text should survive across storage instances.";

    // First session - create and populate storage
    {
        let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();
        storage.store_text_safe(doc_id, text).unwrap();

        // Verify storage before "crash"
        assert_eq!(storage.get_text_safe(doc_id).unwrap(), text);
        assert_eq!(storage.entry_count(), 1);

        // Create postings and verify they work
        let postings = create_test_postings(doc_id, text, 64, 2).unwrap();
        for posting in &postings {
            let extracted = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();

            // Compare against a direct slice of the source text over the same byte range
            let expected_start = posting.start as usize;
            let expected_end = expected_start + posting.length as usize;
            let expected = &text[expected_start..expected_end];
            assert_eq!(extracted, expected);
        }

        // Explicit sync to ensure data is written
        storage.sync().unwrap();
    } // Storage handle dropped here (after the sync above); this stands in for the "crash"

    // Second session - recover from "crash"
    {
        // Opened without `mut`: this session only reads
        let storage = DocumentTextStorage::open(&temp_dir).unwrap();

        // Verify data survived the crash
        assert_eq!(storage.entry_count(), 1);
        let recovered_text = storage.get_text_safe(doc_id).unwrap();
        assert_eq!(recovered_text, text);

        // Verify postings still work after recovery
        // (postings are regenerated from the same inputs, not persisted by the storage)
        let postings = create_test_postings(doc_id, text, 64, 2).unwrap();
        for posting in &postings {
            let extracted = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();

            let expected_start = posting.start as usize;
            let expected_end = expected_start + posting.length as usize;
            let expected = &text[expected_start..expected_end];
            assert_eq!(extracted, expected);
        }
    }

    // Third session - add more data after recovery
    {
        let mut storage = DocumentTextStorage::open(&temp_dir).unwrap();
        let additional_doc = DocumentId::new();
        let additional_text = "Additional document added after recovery.";

        // Add new document
        storage
            .store_text_safe(additional_doc, additional_text)
            .unwrap();

        // Verify both old and new documents work
        assert_eq!(storage.entry_count(), 2);
        assert_eq!(storage.get_text_safe(doc_id).unwrap(), text);
        assert_eq!(storage.get_text_safe(additional_doc).unwrap(), additional_text);

        storage.sync().unwrap();
    }

    // Fourth session - final verification
    // Only the entry count and the original document are re-checked here;
    // `additional_doc` is scoped to the third session and is not looked up again.
    {
        let storage = DocumentTextStorage::open(&temp_dir).unwrap();
        assert_eq!(storage.entry_count(), 2);
        assert_eq!(storage.get_text_safe(doc_id).unwrap(), text);
    }
}

/// Stores a document of roughly 1 MB and checks full retrieval, extraction for
/// ten postings spread across it, and that one full retrieval stays under 100 ms.
#[test]
fn test_large_document_integration() {
    let temp_dir = TempDir::new().unwrap();
    let size_cap = 5 * 1024 * 1024; // 5MB limit
    let mut storage = DocumentTextStorage::create(&temp_dir, size_cap).unwrap();

    let doc_id = DocumentId::new();

    // Repeat a fixed paragraph 5000 times to reach roughly 1MB of text.
    let paragraph = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ";
    let large_text = paragraph.repeat(5000);

    storage.store_text_safe(doc_id, &large_text).unwrap();

    // The stored text must come back unchanged.
    let round_tripped = storage.get_text_safe(doc_id).unwrap();
    assert_eq!(round_tripped, large_text);
    assert_eq!(round_tripped.len(), large_text.len());

    // Ten postings tiling the whole document.
    let postings = create_test_postings(doc_id, &large_text, 128, 10).unwrap();
    assert_eq!(postings.len(), 10);

    for (index, posting) in postings.iter().enumerate() {
        let begin = posting.start as usize;
        let end = begin + posting.length as usize;

        let extracted = storage
            .extract_text_substring(posting.document_id, posting.start, posting.length)
            .unwrap();

        assert_eq!(
            extracted,
            &large_text[begin..end],
            "Mismatch for posting {} at {}..{}",
            index,
            begin,
            end
        );
        assert!(!extracted.is_empty(), "Empty extraction for posting {}", index);
    }

    // Time one full retrieval of the large document.
    // NOTE(review): this is a wall-clock bound; it may be flaky on slow or
    // heavily loaded machines and in unoptimized builds.
    let timer = std::time::Instant::now();
    let _retrieved = storage.get_text_safe(doc_id).unwrap();
    let elapsed = timer.elapsed();

    assert!(
        elapsed.as_millis() < 100,
        "Large document retrieval too slow: {:?}",
        elapsed
    );
}

/// Stores three successive versions of one document under the same id and
/// checks that the latest version is what lookups return, that every version
/// is counted as a separate entry, and how postings from an older version
/// behave against the latest text.
#[test]
fn test_document_versioning_with_postings() {
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();

    let doc_id = DocumentId::new();
    // Each version is longer than the one before it
    let versions = [
        "Version 1: Initial document content for versioning test.",
        "Version 2: Updated document with additional content for more comprehensive testing of versioning workflow.",
        "Version 3: Final version with extensive content to test how postings work across document versions and updates.",
    ];

    // Postings generated for each version, indexed in the same order as `versions`
    let mut version_postings = Vec::new();

    // Store each version and create postings
    for (version_num, text) in versions.iter().enumerate() {
        storage.store_text_safe(doc_id, text).unwrap();

        // Verify this version is retrievable
        let retrieved = storage.get_text_safe(doc_id).unwrap();
        assert_eq!(retrieved, *text, "Version {} not stored correctly", version_num + 1);

        // Create postings for this version
        let postings = create_test_postings(doc_id, text, 64, 2).unwrap();

        // Verify postings work for current version
        for posting in &postings {
            let extracted = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();

            // Compare against a direct slice of this version's text over the same byte range
            let expected_start = posting.start as usize;
            let expected_end = expected_start + posting.length as usize;
            let expected = &text[expected_start..expected_end];
            assert_eq!(extracted, expected);
        }

        version_postings.push(postings);
    }

    // Verify final state
    assert_eq!(storage.entry_count(), 3); // All versions stored (append-only)
    assert_eq!(storage.get_text_safe(doc_id).unwrap(), versions[2]); // Latest version

    // Verify that the latest version's postings still extract the latest text
    // (this does not check that postings from older versions are rejected)
    let latest_text = versions[2];
    let latest_postings = &version_postings[2];

    for posting in latest_postings {
        let extracted = storage
            .extract_text_substring(posting.document_id, posting.start, posting.length)
            .unwrap();

        let expected_start = posting.start as usize;
        let expected_end = expected_start + posting.length as usize;
        let expected = &latest_text[expected_start..expected_end];
        assert_eq!(extracted, expected);
    }

    // Older version postings may fail if they exceed the latest document bounds
    // NOTE(review): version 1 is shorter than version 3 in these fixtures, so the
    // out-of-bounds branch below is not expected to run with the current data.
    let first_version_postings = &version_postings[0];
    for posting in first_version_postings {
        let result = storage.extract_text_substring(posting.document_id, posting.start, posting.length);

        if posting.start as usize + posting.length as usize > latest_text.len() {
            // Should fail for out-of-bounds
            assert!(result.is_err());
        } else {
            // Should succeed if within bounds, but may not match original text
            assert!(result.is_ok());
        }
    }
}

/// Simulates an ingest-then-search workflow: several documents are stored with
/// hand-picked posting spans, each span is verified right after ingest, and all
/// spans are re-extracted afterwards as a search-result pass would.
#[test]
fn test_realistic_usage_workflow() {
    let temp_dir = TempDir::new().unwrap();
    let mut storage = DocumentTextStorage::create(&temp_dir, 1024 * 1024).unwrap();

    // (document text, list of (start byte, byte length) spans to index)
    let documents = [
        (
            "Technical documentation about Rust programming language features and best practices.",
            // The first spans line up with whole words; the later ones start mid-word,
            // so treat these as arbitrary in-bounds spans rather than exact word boundaries.
            vec![(0, 9), (10, 13), (24, 5), (42, 8), (51, 8), (60, 3)],
        ),
        (
            "Machine learning algorithms for natural language processing and text analysis.",
            vec![(0, 7), (8, 8), (17, 10), (31, 7), (39, 8), (48, 10)],
        ),
        (
            "Database design principles for scalable web applications and data management.",
            vec![(0, 8), (9, 6), (16, 10), (30, 9), (43, 11), (55, 4)],
        ),
    ];

    let mut doc_data = Vec::new();

    // Process each document
    for (i, (text, word_positions)) in documents.iter().enumerate() {
        let doc_id = DocumentId::new();

        // Store document
        storage.store_text_safe(doc_id, text).unwrap();

        // Build one posting per configured span; every posting uses the same test embedding
        let mut postings = Vec::new();
        for (start, length) in word_positions {
            let vector = create_test_embedding(128);
            let posting = Posting::new(doc_id, *start as u32, *length as u32, vector, 128).unwrap();
            postings.push(posting);
        }

        // Verify document and postings work
        let retrieved = storage.get_text_safe(doc_id).unwrap();
        assert_eq!(retrieved, *text);

        // Verify every posting extracts the same bytes as slicing the source text directly
        for posting in &postings {
            let extracted = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();

            let expected_start = posting.start as usize;
            let expected_end = expected_start + posting.length as usize;
            let expected = &text[expected_start..expected_end];

            assert_eq!(extracted, expected);
            assert!(!extracted.is_empty());
        }

        println!("Processed document {} with {} postings", i + 1, postings.len());

        // Hand the postings over only after their last local use, so no clone is needed.
        doc_data.push((doc_id, *text, postings));
    }

    // Simulate search-like access pattern
    for (doc_id, text, postings) in &doc_data {
        // Retrieve document
        let full_text = storage.get_text_safe(*doc_id).unwrap();
        assert_eq!(full_text, *text);

        // Extract text for each posting (simulating search result processing)
        for posting in postings {
            let snippet = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();

            // Verify snippet is meaningful
            assert!(!snippet.is_empty());
            assert!(!snippet.trim().is_empty());

            // Verify snippet is part of the original document
            assert!(text.contains(&snippet));
        }
    }

    // Final statistics
    assert_eq!(storage.entry_count(), documents.len() as u32);
    assert!(storage.total_text_size() > 0);
    assert!(storage.utilization_ratio() > 0.0);

    println!("Integration test completed successfully:");
    println!("  Documents: {}", documents.len());
    println!("  Total entries: {}", storage.entry_count());
    println!("  Total text size: {} bytes", storage.total_text_size());
    println!("  Utilization ratio: {:.2}", storage.utilization_ratio());
}

/// Checks that a rejected oversized store leaves the storage usable: earlier
/// documents stay readable, later stores succeed, the rejected id is reported
/// as not found, and postings for the stored documents extract correctly.
#[test]
fn test_error_recovery_integration() {
    let temp_dir = TempDir::new().unwrap();
    let small_limit = 200; // Small limit to trigger errors
    let mut storage = DocumentTextStorage::create(&temp_dir, small_limit).unwrap();

    let doc_id1 = DocumentId::new();
    let doc_id2 = DocumentId::new();
    let doc_id3 = DocumentId::new();

    let small_text = "Small text that fits within limits.";
    let large_text = "This is a much larger text that will definitely exceed the small size limit that we have configured for this test to ensure proper error handling and recovery. We need to add even more text here to make it exceed the 200 byte limit for sure, so here is some additional padding text that should push it well over the limit.";
    let another_small = "Another small text.";

    // A document under the limit is accepted.
    storage.store_text_safe(doc_id1, small_text).unwrap();
    assert_eq!(storage.get_text_safe(doc_id1).unwrap(), small_text);

    // A document over the limit must be rejected with DocumentTooLarge.
    let oversized = storage.store_text_safe(doc_id2, large_text);
    assert!(oversized.is_err());
    let store_error = oversized.unwrap_err();
    if !matches!(store_error, ShardexError::DocumentTooLarge { .. }) {
        panic!("Expected DocumentTooLarge error, got {:?}", store_error);
    }

    // The failed store must not disturb what was already written.
    assert_eq!(storage.get_text_safe(doc_id1).unwrap(), small_text);
    assert_eq!(storage.entry_count(), 1); // Only successful store

    // The storage keeps accepting valid documents after the failure.
    storage.store_text_safe(doc_id3, another_small).unwrap();
    assert_eq!(storage.get_text_safe(doc_id3).unwrap(), another_small);
    assert_eq!(storage.entry_count(), 2);

    assert_eq!(storage.get_text_safe(doc_id1).unwrap(), small_text);
    assert_eq!(storage.get_text_safe(doc_id3).unwrap(), another_small);

    // The rejected document must be reported as missing.
    let missing = storage.get_text_safe(doc_id2);
    assert!(missing.is_err());
    let lookup_error = missing.unwrap_err();
    if !matches!(lookup_error, ShardexError::DocumentTextNotFound { .. }) {
        panic!("Expected DocumentTextNotFound error, got {:?}", lookup_error);
    }

    // One posting per stored document, each spanning that document's whole text.
    let postings1 = create_test_postings(doc_id1, small_text, 64, 1).unwrap();
    let postings3 = create_test_postings(doc_id3, another_small, 64, 1).unwrap();

    // Extraction must match a direct slice of the matching source text.
    for &(source, postings) in [(small_text, &postings1), (another_small, &postings3)].iter() {
        for posting in postings.iter() {
            let extracted = storage
                .extract_text_substring(posting.document_id, posting.start, posting.length)
                .unwrap();

            let begin = posting.start as usize;
            let end = begin + posting.length as usize;
            assert_eq!(extracted, &source[begin..end]);
        }
    }
}