seerdb 0.0.10

Research-grade storage engine with learned data structures
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
// Data Integrity Tests
//
// Comprehensive tests for data loss scenarios identified in code review.
// These tests cover critical gaps that could cause data loss in production.
//
// Categories:
// 1. WAL integrity (partial writes, corruption)
// 2. Flush ordering (SSTable write vs WAL clear)
// 3. Compaction integrity (tombstone shadowing, crash safety)
// 4. Concurrent operations (partition swap, read during flush)
// 5. Edge cases (empty values, boundary conditions)
// 6. Snapshot isolation (point-in-time reads during writes)
// 7. Persistence (keys surviving close/reopen cycles)

use seerdb::{DBOptions, SyncPolicy, DB};
use std::fs::{self, OpenOptions};
use std::io::{Seek, SeekFrom, Write};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
use tempfile::TempDir;

// =============================================================================
// 1. WAL INTEGRITY TESTS
// =============================================================================

/// Test WAL recovery with truncated record (simulates crash mid-write)
///
/// Risk: HIGH - Partial record could corrupt recovery
/// Expected: Recovery should skip incomplete record, preserve complete ones
#[test]
fn test_wal_truncated_record_recovery() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    // Phase 1: populate the WAL only (no flush, so nothing reaches SSTables).
    {
        let db = DBOptions::default()
            .sync_policy(SyncPolicy::SyncAll)
            .open(&path)
            .unwrap();

        for i in 0..100 {
            let key = format!("key_{:03}", i);
            db.put(key.as_bytes(), b"value").unwrap();
        }
    }

    // Phase 2: chop the tail off the WAL to mimic a crash mid-record.
    let wal = path.join("wal.log");
    let full_len = fs::metadata(&wal).unwrap().len();
    let shortened = (full_len * 9) / 10; // drop roughly the last 10%
    {
        let f = OpenOptions::new().write(true).open(&wal).unwrap();
        f.set_len(shortened).unwrap();
        f.sync_all().unwrap();
    }

    // Phase 3: recovery must either succeed with intact data or fail cleanly.
    match DB::open(&path) {
        Ok(db) => {
            let recovered = (0..100)
                .filter(|i| {
                    let key = format!("key_{:03}", i);
                    db.get(key.as_bytes()).unwrap().is_some()
                })
                .count();

            // Only ~10% of the log was removed, so the bulk must survive.
            assert!(
                recovered >= 80,
                "Should recover most records after truncation, got {} / 100",
                recovered
            );

            // Any key that is present must carry the original, untouched value.
            for i in 0..100 {
                let key = format!("key_{:03}", i);
                if let Some(v) = db.get(key.as_bytes()).unwrap() {
                    assert_eq!(v.as_ref(), b"value", "Value should be intact");
                }
            }
        }
        Err(e) => {
            // A clean refusal to open is also a valid (strict) recovery policy.
            println!("Recovery failed (acceptable): {}", e);
        }
    }
}

/// Test WAL with corrupted record body (not header)
///
/// Risk: HIGH - Per-record corruption must be detected
/// Expected: Corrupted record detected, earlier records preserved
#[test]
fn test_wal_record_body_corruption() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    // Seed 50 records straight into the WAL (no flush).
    {
        let db = DBOptions::default()
            .sync_policy(SyncPolicy::SyncAll)
            .open(&path)
            .unwrap();

        for i in 0..50 {
            let key = format!("key_{:03}", i);
            db.put(key.as_bytes(), b"value").unwrap();
        }
    }

    // Stamp garbage into the middle of the log, well past the 8-byte header.
    let wal = path.join("wal.log");
    let len = fs::metadata(&wal).unwrap().len();
    let target = len / 2;
    {
        let mut f = OpenOptions::new().write(true).open(&wal).unwrap();
        f.seek(SeekFrom::Start(target)).unwrap();
        f.write_all(&[0xFF, 0xFF, 0xFF, 0xFF]).unwrap();
        f.sync_all().unwrap();
    }

    // Acceptable outcomes: partial recovery, or a clean failure to open.
    // Unacceptable: silently serving corrupted bytes.
    match DB::open(&path) {
        Ok(db) => {
            for i in 0..50 {
                let key = format!("key_{:03}", i);
                if let Some(v) = db.get(key.as_bytes()).unwrap() {
                    // A key is either absent or carries exactly the original bytes.
                    assert_eq!(v.as_ref(), b"value", "Should not return corrupted value");
                }
            }
        }
        Err(_) => {
            // Corruption was detected at open time - that is fine.
        }
    }
}

/// Test batch atomicity with partial write
///
/// Risk: HIGH - Batch must be all-or-nothing
/// Expected: Either all records in batch recovered, or none
#[test]
fn test_wal_batch_atomicity() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    // Commit ten keys as one atomic batch, then drop the handle.
    {
        let db = DBOptions::default()
            .sync_policy(SyncPolicy::SyncAll)
            .open(&path)
            .unwrap();

        let mut batch = db.batch();
        for i in 0..10 {
            batch.put(format!("batch_key_{}", i).as_bytes(), b"batch_value");
        }
        batch.commit().unwrap();
    }

    // After reopening, the batch must be visible as a unit.
    let db = DB::open(&path).unwrap();

    let present: Vec<bool> = (0..10)
        .map(|i| {
            let key = format!("batch_key_{}", i);
            db.get(key.as_bytes()).unwrap().is_some()
        })
        .collect();

    let all_present = present.iter().all(|&p| p);
    let all_absent = !present.iter().any(|&p| p);

    // Atomicity: a partially-applied batch is never acceptable.
    assert!(
        all_present || all_absent,
        "Batch must be atomic: either all keys present or all absent. Got: {:?}",
        present
    );

    // After a clean shutdown the batch should in fact have landed.
    assert!(
        all_present,
        "All batch keys should be present after clean recovery"
    );
}

// =============================================================================
// 2. FLUSH ORDERING TESTS
// =============================================================================

/// Test that flush writes SSTable before clearing WAL
///
/// Risk: HIGH - If WAL cleared first, crash loses data
/// Expected: After flush + crash, data recoverable from SSTable OR WAL
#[test]
fn test_flush_sstable_before_wal_clear() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    // Write a batch of keys and force a flush, then close the DB.
    {
        let db = DBOptions::default()
            .sync_policy(SyncPolicy::SyncAll)
            .open(&path)
            .unwrap();

        for i in 0..1000 {
            let key = format!("key_{:04}", i);
            db.put(key.as_bytes(), b"value").unwrap();
        }

        db.flush().unwrap();
    }

    // The flush must have produced at least one .sst file on disk.
    let has_sstable = fs::read_dir(&path)
        .unwrap()
        .filter_map(Result::ok)
        .any(|entry| entry.path().extension().and_then(|ext| ext.to_str()) == Some("sst"));
    assert!(has_sstable, "SSTable should exist after flush");

    // Every key must survive a full close/reopen cycle.
    let db = DB::open(&path).unwrap();
    for i in 0..1000 {
        let key = format!("key_{:04}", i);
        assert!(
            db.get(key.as_bytes()).unwrap().is_some(),
            "Key {} should exist after flush + reopen",
            i
        );
    }
}

/// Test concurrent flushes don't corrupt sequence numbers
///
/// Risk: HIGH - Out-of-order completion could cause GC of live data
/// Expected: max_flushed_seq always increases monotonically
///
/// Four writer threads insert prefixed keys into a tiny memtable (forcing
/// frequent flushes), then the store is reopened and the fraction of
/// successfully-written keys that survive recovery is checked.
#[test]
fn test_concurrent_flush_sequence_monotonic() {
    let temp_dir = TempDir::new().unwrap();
    let db_path = temp_dir.path().to_path_buf();

    let db = Arc::new(
        DBOptions::default()
            .memtable_capacity(1024) // Small to trigger frequent flushes
            .background_compaction(false) // Disable to isolate flush behavior
            .open(&db_path)
            .unwrap(),
    );

    let write_count = Arc::new(AtomicUsize::new(0));
    let stop = Arc::new(AtomicBool::new(false));

    // Spawn writer threads; each uses a distinct key prefix so the recovery
    // check below can enumerate exactly what may have been written.
    let mut handles = vec![];
    for t in 0..4 {
        let db = Arc::clone(&db);
        let write_count = Arc::clone(&write_count);
        let stop = Arc::clone(&stop);

        handles.push(thread::spawn(move || {
            let mut i = 0;
            while !stop.load(Ordering::Relaxed) {
                let key = format!("t{}_{:06}", t, i);
                if db.put(key.as_bytes(), b"value").is_ok() {
                    write_count.fetch_add(1, Ordering::Relaxed);
                }
                i += 1;
                // Hard cap so each thread terminates even without `stop`.
                if i > 10000 {
                    break;
                }
            }
        }));
    }

    // Let writes run long enough to force multiple memtable flushes.
    thread::sleep(Duration::from_secs(2));
    stop.store(true, Ordering::Relaxed);

    for h in handles {
        h.join().unwrap();
    }

    let total_writes = write_count.load(Ordering::Relaxed);
    // FIX: guard against zero successful writes. Without this, the
    // recovery-rate division below would be 0/0 = NaN, `NaN > 0.99` is
    // false, and the final assert would fail with a misleading message
    // instead of pointing at the real problem (no writes succeeded).
    assert!(total_writes > 0, "Writer threads made no successful writes");

    // Flush remaining memtable contents, then close the store.
    db.flush().unwrap();
    drop(db);

    // Reopen and verify data integrity.
    let db = DB::open(&db_path).unwrap();

    // Count recovered keys (each thread wrote indices 0..=10000 at most).
    let mut recovered = 0;
    for t in 0..4 {
        for i in 0..10001 {
            if db
                .get(format!("t{}_{:06}", t, i).as_bytes())
                .unwrap()
                .is_some()
            {
                recovered += 1;
            }
        }
    }

    // Should recover all or almost all writes.
    let recovery_rate = (recovered as f64) / (total_writes as f64);
    assert!(
        recovery_rate > 0.99,
        "Should recover >99% of writes, got {:.1}% ({} / {})",
        recovery_rate * 100.0,
        recovered,
        total_writes
    );
}

// =============================================================================
// 3. COMPACTION INTEGRITY TESTS
// =============================================================================

/// Test tombstone shadows older value across LSM levels
///
/// Risk: HIGH - Delete must always win over older put
/// Expected: After compaction, deleted key returns None
#[test]
fn test_tombstone_shadows_across_levels() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    let db = DBOptions::default()
        .memtable_capacity(1024) // Small to trigger flushes
        .background_compaction(true) // Enable to trigger compaction
        .open(&path)
        .unwrap();

    // Older SSTable: the original value.
    db.put(b"shadowed_key", b"original_value").unwrap();
    db.flush().unwrap();

    // Newer SSTable: a tombstone for the same key.
    db.delete(b"shadowed_key").unwrap();
    db.flush().unwrap();

    // Pile up additional L0 tables so compaction has work to do.
    for batch in 0..5 {
        for i in 0..50 {
            let key = format!("filler_{}_{:03}", batch, i);
            db.put(key.as_bytes(), b"filler").unwrap();
        }
        db.flush().unwrap();
    }

    // Give the background compactor time to merge levels.
    thread::sleep(Duration::from_millis(500));

    // The tombstone must win over the older value.
    assert!(
        db.get(b"shadowed_key").unwrap().is_none(),
        "Tombstone should shadow original value after compaction"
    );

    // The deletion must also survive a close/reopen cycle.
    drop(db);
    let db = DBOptions::default()
        .background_compaction(true)
        .open(&path)
        .unwrap();

    assert!(
        db.get(b"shadowed_key").unwrap().is_none(),
        "Tombstone should persist after reopen"
    );
}

/// Test compaction doesn't lose data on crash
///
/// Risk: MEDIUM - Partial compaction output could corrupt data
/// Expected: Original SSTables intact until compaction fully complete
#[test]
fn test_compaction_preserves_all_data() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    let db = DBOptions::default()
        .memtable_capacity(2048)
        .background_compaction(true) // Enable compaction
        .open(&path)
        .unwrap();

    // Build several SSTables whose key ranges interleave.
    for batch in 0..5 {
        for i in 0..100 {
            db.put(
                format!("key_{:02}_{:03}", batch, i).as_bytes(),
                format!("value_{:02}_{:03}", batch, i).as_bytes(),
            )
            .unwrap();
        }
        db.flush().unwrap();
    }

    // Give the background compactor a chance to merge them.
    thread::sleep(Duration::from_millis(500));

    // Every key/value pair must still read back byte-for-byte.
    for batch in 0..5 {
        for i in 0..100 {
            let key = format!("key_{:02}_{:03}", batch, i);
            let want = format!("value_{:02}_{:03}", batch, i);
            let got = db
                .get(key.as_bytes())
                .unwrap()
                .unwrap_or_else(|| panic!("Key {} should exist after compaction", key));
            assert_eq!(
                got.as_ref(),
                want.as_bytes(),
                "Value for {} should be preserved",
                key
            );
        }
    }

    // The same must hold after closing and reopening the store.
    drop(db);
    let db = DB::open(&path).unwrap();

    for batch in 0..5 {
        for i in 0..100 {
            let key = format!("key_{:02}_{:03}", batch, i);
            assert!(
                db.get(key.as_bytes()).unwrap().is_some(),
                "Key {} should exist after reopen",
                key
            );
        }
    }
}

// =============================================================================
// 4. CONCURRENT OPERATION TESTS
// =============================================================================

/// Test reading during memtable swap (flush)
///
/// Risk: MEDIUM - Reader may see inconsistent state
/// Expected: Reads always return correct value or None
///
/// Setup: 100 keys are pre-populated, then a reader thread loops over them
/// while a writer thread inserts new keys fast enough to force memtable
/// swaps (capacity is only 4096 bytes). The reader tallies wrong values and
/// read errors; after ~2 seconds the test asserts that tally is zero.
#[test]
fn test_read_during_memtable_swap() {
    let temp_dir = TempDir::new().unwrap();
    let db_path = temp_dir.path().to_path_buf();

    let db = Arc::new(
        DBOptions::default()
            .memtable_capacity(4096) // Small to trigger swaps
            .open(&db_path)
            .unwrap(),
    );

    // Pre-populate some keys
    for i in 0..100 {
        db.put(format!("pre_{:03}", i).as_bytes(), b"pre_value")
            .unwrap();
    }

    // Shared state: `stop` ends both threads; `error_count` records every
    // wrong value or Err observed by the reader.
    let stop = Arc::new(AtomicBool::new(false));
    let error_count = Arc::new(AtomicUsize::new(0));

    // Reader thread - continuously reads pre-populated keys
    let db_reader = Arc::clone(&db);
    let stop_reader = Arc::clone(&stop);
    let error_count_reader = Arc::clone(&error_count);
    let reader = thread::spawn(move || {
        while !stop_reader.load(Ordering::Relaxed) {
            for i in 0..100 {
                match db_reader.get(format!("pre_{:03}", i).as_bytes()) {
                    Ok(Some(value)) => {
                        // A present key must carry exactly the original bytes.
                        if value.as_ref() != b"pre_value" {
                            error_count_reader.fetch_add(1, Ordering::Relaxed);
                        }
                    }
                    Ok(None) => {
                        // Key missing during swap - could be acceptable depending on isolation level
                    }
                    Err(_) => {
                        error_count_reader.fetch_add(1, Ordering::Relaxed);
                    }
                }
            }
        }
    });

    // Writer thread - triggers memtable swaps
    let db_writer = Arc::clone(&db);
    let stop_writer = Arc::clone(&stop);
    let writer = thread::spawn(move || {
        let mut i = 0;
        // Hard cap of 10000 writes so the thread terminates even if the
        // `stop` flag is never observed.
        while !stop_writer.load(Ordering::Relaxed) && i < 10000 {
            let _ = db_writer.put(format!("write_{:06}", i).as_bytes(), b"write_value");
            i += 1;
        }
    });

    // Run for 2 seconds
    thread::sleep(Duration::from_secs(2));
    stop.store(true, Ordering::Relaxed);

    reader.join().unwrap();
    writer.join().unwrap();

    assert_eq!(
        error_count.load(Ordering::Relaxed),
        0,
        "Should have no read errors during memtable swap"
    );
}

/// Test concurrent put and delete on same key
///
/// Risk: MEDIUM - Race could cause inconsistent state
/// Expected: Final state is either value or deleted, not corrupted
#[test]
fn test_concurrent_put_delete_same_key() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    let db = Arc::new(DB::open(&path).unwrap());

    let iterations = 1000;

    // One thread hammers puts on the contested key...
    let putter = {
        let db = Arc::clone(&db);
        thread::spawn(move || {
            for _ in 0..iterations {
                let _ = db.put(b"contested_key", b"put_value");
            }
        })
    };

    // ...while another hammers deletes on the same key.
    let deleter = {
        let db = Arc::clone(&db);
        thread::spawn(move || {
            for _ in 0..iterations {
                let _ = db.delete(b"contested_key");
            }
        })
    };

    putter.join().unwrap();
    deleter.join().unwrap();

    // Whichever operation won the race, the state must be one of the two
    // valid outcomes: the put value, or absent. Nothing else.
    if let Some(value) = db.get(b"contested_key").unwrap() {
        assert_eq!(
            value.as_ref(),
            b"put_value",
            "If present, value should be 'put_value'"
        );
    }

    // Persist and reopen: the outcome must remain consistent on disk too.
    db.flush().unwrap();
    drop(db);

    let db = DB::open(&path).unwrap();

    if let Some(value) = db.get(b"contested_key").unwrap() {
        assert_eq!(value.as_ref(), b"put_value");
    }
}

// =============================================================================
// 5. EDGE CASE TESTS
// =============================================================================

/// Test empty value throughout lifecycle
///
/// Risk: MEDIUM - Empty value could be confused with tombstone/missing
/// Expected: Empty value preserved through flush, compact, recover
#[test]
fn test_empty_value_full_lifecycle() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    let db = DBOptions::default()
        .memtable_capacity(2048)
        .background_compaction(true) // Enable compaction
        .open(&path)
        .unwrap();

    // Stage 1: empty value readable straight from the memtable.
    db.put(b"empty_value_key", b"").unwrap();

    let got = db.get(b"empty_value_key").unwrap();
    assert!(got.is_some(), "Empty value should be retrievable");
    assert_eq!(got.unwrap().as_ref(), b"", "Value should be empty");

    // Stage 2: still readable after being pushed into an SSTable.
    db.flush().unwrap();

    let got = db.get(b"empty_value_key").unwrap();
    assert!(got.is_some(), "Empty value should exist after flush");
    assert_eq!(got.unwrap().as_ref(), b"", "Value should still be empty");

    // Stage 3: generate extra tables so compaction has something to merge.
    for batch in 0..5 {
        for i in 0..50 {
            let filler_key = format!("filler_{}_{:03}", batch, i);
            db.put(filler_key.as_bytes(), b"filler").unwrap();
        }
        db.flush().unwrap();
    }

    // Give the background compactor time to run.
    thread::sleep(Duration::from_millis(500));

    let got = db.get(b"empty_value_key").unwrap();
    assert!(got.is_some(), "Empty value should exist after compaction");
    assert_eq!(
        got.unwrap().as_ref(),
        b"",
        "Value should be empty after compaction"
    );

    // Stage 4: the empty value must survive a close/reopen cycle.
    drop(db);
    let db = DB::open(&path).unwrap();

    let got = db.get(b"empty_value_key").unwrap();
    assert!(got.is_some(), "Empty value should exist after reopen");
    assert_eq!(
        got.unwrap().as_ref(),
        b"",
        "Value should be empty after reopen"
    );
}

/// Test key with null bytes
///
/// Risk: LOW - Binary keys must be handled correctly
/// Expected: Key with null bytes stored and retrieved correctly
#[test]
fn test_key_with_null_bytes() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    let db = DB::open(&path).unwrap();

    // A fully binary key: embedded NUL bytes must not truncate or split it.
    let key = b"key\x00with\x00nulls";
    let val = b"value_for_null_key";

    db.put(key, val).unwrap();

    // Readable from the memtable...
    assert_eq!(db.get(key).unwrap().unwrap().as_ref(), val);

    // ...from the SSTable after a flush...
    db.flush().unwrap();
    assert_eq!(db.get(key).unwrap().unwrap().as_ref(), val);

    // ...and after a full close/reopen cycle.
    drop(db);
    let db = DB::open(&path).unwrap();
    assert_eq!(db.get(key).unwrap().unwrap().as_ref(), val);
}

/// Test value at vLog threshold boundary
///
/// Risk: LOW - Boundary condition in value separation
/// Expected: Values at exact threshold handled correctly
#[test]
fn test_value_at_vlog_threshold() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    let threshold = 1024; // 1KB threshold

    let db = DBOptions::default()
        .vlog_threshold(Some(threshold))
        .open(&path)
        .unwrap();

    // Three probes straddling the boundary: exactly at the threshold, one
    // byte over (should be separated to the vLog), one byte under (inline).
    let at = vec![b'a'; threshold];
    let over = vec![b'b'; threshold + 1];
    let under = vec![b'c'; threshold - 1];

    db.put(b"key_at", &at).unwrap();
    db.put(b"key_over", &over).unwrap();
    db.put(b"key_under", &under).unwrap();

    // Push everything into an SSTable.
    db.flush().unwrap();

    // All three must read back byte-for-byte after the flush.
    assert_eq!(db.get(b"key_at").unwrap().unwrap().as_ref(), &at[..]);
    assert_eq!(db.get(b"key_over").unwrap().unwrap().as_ref(), &over[..]);
    assert_eq!(db.get(b"key_under").unwrap().unwrap().as_ref(), &under[..]);

    // And again after closing and reopening with the same threshold.
    drop(db);
    let db = DBOptions::default()
        .vlog_threshold(Some(threshold))
        .open(&path)
        .unwrap();

    assert_eq!(db.get(b"key_at").unwrap().unwrap().as_ref(), &at[..]);
    assert_eq!(db.get(b"key_over").unwrap().unwrap().as_ref(), &over[..]);
    assert_eq!(db.get(b"key_under").unwrap().unwrap().as_ref(), &under[..]);
}

/// Test many versions of same key (MVCC stress)
///
/// Risk: MEDIUM - Version chain could be corrupted
/// Expected: Always see latest version
#[test]
fn test_many_versions_same_key() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    let db = DBOptions::default()
        .memtable_capacity(4096)
        .background_compaction(true) // Enable compaction
        .open(&path)
        .unwrap();

    // Overwrite the same key 100 times, flushing after every 25th write so
    // the version history is spread across the memtable and several SSTables.
    // (Kept at 100 versions rather than 1000 for faster CI.)
    for version in 0..100 {
        let payload = format!("version_{:03}", version);
        db.put(b"versioned_key", payload.as_bytes()).unwrap();
        if version % 25 == 24 {
            db.flush().unwrap();
        }
    }

    // Reads must always resolve to the newest version.
    let v = db.get(b"versioned_key").unwrap().unwrap();
    assert_eq!(v.as_ref(), b"version_099", "Should see latest version");

    // Compaction must not resurrect an older version.
    thread::sleep(Duration::from_millis(500));
    let v = db.get(b"versioned_key").unwrap().unwrap();
    assert_eq!(
        v.as_ref(),
        b"version_099",
        "Latest version should survive compaction"
    );

    // Nor must recovery after a close/reopen.
    drop(db);
    let db = DB::open(&path).unwrap();

    let v = db.get(b"versioned_key").unwrap().unwrap();
    assert_eq!(
        v.as_ref(),
        b"version_099",
        "Latest version should survive reopen"
    );
}

// =============================================================================
// 6. SNAPSHOT ISOLATION TESTS
// =============================================================================

/// Test snapshot sees consistent point-in-time view
///
/// Risk: MEDIUM - Snapshot could see partial writes
/// Expected: Snapshot always sees consistent state
#[test]
fn test_snapshot_consistency_during_writes() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    let db = Arc::new(DB::open(&path).unwrap());

    // Seed 100 keys at version v1.
    for i in 0..100 {
        db.put(format!("key_{:03}", i).as_bytes(), b"v1").unwrap();
    }

    // Pin a point-in-time view, then overwrite every key with v2.
    let snapshot = db.snapshot().unwrap();
    for i in 0..100 {
        db.put(format!("key_{:03}", i).as_bytes(), b"v2").unwrap();
    }

    // The snapshot must remain frozen at v1...
    for i in 0..100 {
        let key = format!("key_{:03}", i);
        let old = snapshot.get(key.as_bytes()).unwrap().unwrap();
        assert_eq!(
            old.as_ref(),
            b"v1",
            "Snapshot should see v1 for key {}",
            i
        );
    }

    // ...while live reads observe v2.
    for i in 0..100 {
        let key = format!("key_{:03}", i);
        let new = db.get(key.as_bytes()).unwrap().unwrap();
        assert_eq!(new.as_ref(), b"v2", "DB should see v2 for key {}", i);
    }
}

// =============================================================================
// 7. PERSISTENCE TESTS
// =============================================================================

/// Test that 100 keys persist correctly across reopen
///
/// Risk: CRITICAL - Data loss bug reported
/// Suspected cause: Partitioned memtable flush/recovery issue
#[test]
fn test_persistence_100_keys() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().to_path_buf();

    // Session 1: write 100 keys with all background work disabled, then
    // flush explicitly so everything should land in SSTables.
    {
        let db = DBOptions::default()
            .sync_policy(SyncPolicy::SyncAll)
            .background_compaction(false)
            .background_flush(false)
            .open(&path)
            .unwrap();

        for i in 0..100 {
            let key = format!("v:{}", i);
            let payload = vec![i as u8; 128];
            db.put(key.as_bytes(), &payload).unwrap();
        }
        db.flush().unwrap();
    }

    // Session 2: reopen and collect any keys that failed to come back.
    {
        let db = DBOptions::default()
            .background_compaction(false)
            .background_flush(false)
            .open(&path)
            .unwrap();

        let missing: Vec<_> = (0..100)
            .filter(|i| db.get(format!("v:{}", i).as_bytes()).unwrap().is_none())
            .collect();

        assert!(
            missing.is_empty(),
            "Missing {} keys after reopen: {:?}",
            missing.len(),
            missing
        );
    }
}

/// Test persistence with multiple key prefixes (omendb pattern)
///
/// Risk: CRITICAL - Data loss with mixed prefixes
/// Pattern: v:{id}, m:{id}, i:{string_id} keys interleaved
///
/// Writes 100 items (3 keys each, 300 total) with background work disabled,
/// flushes, then reopens and verifies every key via both a range scan (for
/// diagnostics) and point lookups.
#[test]
fn test_persistence_multiple_prefixes() {
    let temp_dir = TempDir::new().unwrap();
    let path = temp_dir.path().to_path_buf();

    // Write 100 items with 3 keys each (300 total keys)
    {
        let db = DBOptions::default()
            .sync_policy(SyncPolicy::SyncAll)
            .background_compaction(false)
            .background_flush(false)
            .open(&path)
            .unwrap();

        for i in 0..100u64 {
            // Vector data prefix
            let v_key = format!("v:{}", i);
            let v_value = vec![i as u8; 128];
            db.put(v_key.as_bytes(), &v_value).unwrap();

            // Metadata prefix
            let m_key = format!("m:{}", i);
            let m_value = format!(r#"{{"index":{}}}"#, i);
            db.put(m_key.as_bytes(), m_value.as_bytes()).unwrap();

            // ID mapping prefix
            let i_key = format!("i:item{}", i);
            let i_value = i.to_le_bytes();
            db.put(i_key.as_bytes(), &i_value).unwrap();
        }
        db.flush().unwrap();
    }

    // Debug: list SSTable files so a failure log shows what was persisted.
    let sst_files: Vec<_> = fs::read_dir(&path)
        .unwrap()
        .filter_map(|e| e.ok())
        .filter(|e| e.path().extension().map_or(false, |ext| ext == "sst"))
        .collect();
    eprintln!(
        "SSTable files after flush: {:?}",
        sst_files.iter().map(|e| e.path()).collect::<Vec<_>>()
    );

    // Reopen and verify all 300 keys present
    {
        let db = DBOptions::default()
            .background_compaction(false)
            .background_flush(false)
            .open(&path)
            .unwrap();

        // Diagnostic scan over the v: prefix.
        // FIX: the previous `if let Ok((k, _)) = item` silently discarded
        // iterator errors, making a failing scan indistinguishable from
        // "no keys" in the debug output. Errors are now counted, logged,
        // and asserted on.
        let mut scanned_v_keys = Vec::new();
        let mut scan_errors = 0usize;
        for item in db.range(b"v:", Some(b"v:\xff")).unwrap() {
            match item {
                Ok((k, _)) => {
                    if k.starts_with(b"v:") {
                        scanned_v_keys.push(String::from_utf8_lossy(&k).to_string());
                    }
                }
                Err(e) => {
                    scan_errors += 1;
                    eprintln!("range scan error: {}", e);
                }
            }
        }
        eprintln!("Scanned v: keys count: {}", scanned_v_keys.len());
        assert_eq!(scan_errors, 0, "Range scan over v: prefix returned errors");

        // Point-lookup every expected key under all three prefixes.
        let mut missing_v = Vec::new();
        let mut missing_m = Vec::new();
        let mut missing_i = Vec::new();

        for i in 0..100u64 {
            if db.get(format!("v:{}", i).as_bytes()).unwrap().is_none() {
                missing_v.push(i);
            }
            if db.get(format!("m:{}", i).as_bytes()).unwrap().is_none() {
                missing_m.push(i);
            }
            if db.get(format!("i:item{}", i).as_bytes()).unwrap().is_none() {
                missing_i.push(i);
            }
        }

        let total_missing = missing_v.len() + missing_m.len() + missing_i.len();
        assert!(
            total_missing == 0,
            "Missing {} keys after reopen:\n  v: {:?}\n  m: {:?}\n  i: {:?}",
            total_missing,
            missing_v,
            missing_m,
            missing_i
        );
    }
}