edgestore 1.0.10

Local-first embedded KV + vector database in Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
use edgestore::{
    EdgestoreConfig, Engine, TextEngine, VectorEngine,
    text::types::{FacetValue, TextRecord},
};
use tempfile::TempDir;

/// Opens an [`Engine`] whose storage lives under the temporary directory `dir`.
///
/// Panics (failing the calling test) if `Engine::open` does not succeed.
fn open_engine(dir: &TempDir) -> Engine {
    let config = EdgestoreConfig::new(dir.path());
    Engine::open(config).unwrap()
}

/// Indexes three short documents and checks that a two-term query returns
/// the documents that contain both terms.
#[test]
fn test_index_and_search_basic() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    // (doc id, text) pairs, indexed in this order with no facets.
    let corpus = [
        ("doc1", "The quick brown fox"),
        ("doc2", "The lazy dog sleeps"),
        ("doc3", "Quick brown fox jumps"),
    ];
    for (id, text) in corpus {
        engine.index_text(b"ns", id.as_bytes(), text, HashMap::new()).unwrap();
    }

    let hits = engine.search_text(b"ns", "quick brown", 3).unwrap();
    assert!(!hits.is_empty(), "search should return results");
    // doc1 and doc3 each contain "quick" and "brown", so both must be present.
    assert!(hits.iter().any(|r| r.doc_id == b"doc1"), "doc1 should match 'quick brown'");
    assert!(hits.iter().any(|r| r.doc_id == b"doc3"), "doc3 should match 'quick brown'");
}

/// Checks that a document repeating the query term outranks one that
/// mentions it only once.
#[test]
fn test_bm25_ranking() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    // doc1 contains "hello" twice, doc2 only once.
    for (id, text) in [("doc1", "hello hello world"), ("doc2", "hello world")] {
        engine.index_text(b"ns", id.as_bytes(), text, HashMap::new()).unwrap();
    }

    let ranked = engine.search_text(b"ns", "hello", 2).unwrap();
    assert_eq!(ranked.len(), 2);

    let (top, runner_up) = (&ranked[0], &ranked[1]);
    assert_eq!(top.doc_id, b"doc1", "doc with more 'hello' should rank higher");
    assert!(top.score > runner_up.score);
}

/// Searching a namespace that has never been indexed must succeed and
/// return no hits.
#[test]
fn test_search_empty_namespace() {
    let tmp = TempDir::new().unwrap();
    let engine = open_engine(&tmp);

    // Nothing was indexed, so the query has nothing to match.
    let hits = engine.search_text(b"ns", "hello", 5).unwrap();
    assert!(hits.is_empty());
}

/// Queries with no usable terms (the empty string, or only stopwords) must
/// return no hits even when the namespace contains documents.
#[test]
fn test_search_empty_query() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    engine.index_text(b"ns", b"doc1", "hello world", HashMap::new()).unwrap();

    // Empty query string.
    let blank_hits = engine.search_text(b"ns", "", 5).unwrap();
    assert!(blank_hits.is_empty());

    // Query made up solely of stopwords.
    let stopword_hits = engine.search_text(b"ns", "the a an", 5).unwrap();
    assert!(stopword_hits.is_empty(), "stopwords-only query should return empty");
}

/// A document that was searchable must stop appearing in results once it
/// has been deleted.
#[test]
fn test_delete_removes_from_search() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    engine.index_text(b"ns", b"doc1", "hello world", HashMap::new()).unwrap();

    // Sanity check: the document is found before deletion.
    let hits_before = engine.search_text(b"ns", "hello", 5).unwrap();
    assert_eq!(hits_before.len(), 1);

    engine.delete_text(b"ns", b"doc1").unwrap();

    let hits_after = engine.search_text(b"ns", "hello", 5).unwrap();
    assert!(hits_after.is_empty(), "deleted doc should not appear in search");
}

/// Indexes two documents carrying different `category` facets and checks
/// that a plain text query returns only the document containing the term.
///
/// NOTE(review): despite its name, this test never applies a facet filter;
/// it only verifies that documents indexed with facets remain searchable by
/// text. A filtered search should be added once the filter API is exercised
/// here.
#[test]
fn test_facet_filter() {
    use std::collections::HashMap;

    let dir = TempDir::new().unwrap();
    let mut engine = open_engine(&dir);

    let news_facets = HashMap::from([(
        "category".to_string(),
        FacetValue::String("news".to_string()),
    )]);
    engine.index_text(b"ns", b"doc1", "breaking news today", news_facets).unwrap();

    let sports_facets = HashMap::from([(
        "category".to_string(),
        FacetValue::String("sports".to_string()),
    )]);
    engine.index_text(b"ns", b"doc2", "sports update", sports_facets).unwrap();

    // Unfiltered search: only doc1's text contains "news", so exactly one
    // hit is expected regardless of the facets attached to either document.
    let results = engine.search_text(b"ns", "news", 5).unwrap();
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].doc_id, b"doc1");
}

/// Running the same query twice against an unchanged index must yield the
/// same documents in the same order with (near-)identical scores.
#[test]
fn test_search_ranking_stability() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    for (id, text) in [("doc1", "alpha beta gamma"), ("doc2", "beta gamma delta")] {
        engine.index_text(b"ns", id.as_bytes(), text, HashMap::new()).unwrap();
    }

    let first_run = engine.search_text(b"ns", "beta gamma", 5).unwrap();
    let second_run = engine.search_text(b"ns", "beta gamma", 5).unwrap();

    assert_eq!(first_run.len(), second_run.len());
    // Compare the two runs position by position.
    for (lhs, rhs) in first_run.iter().zip(second_run.iter()) {
        assert_eq!(lhs.doc_id, rhs.doc_id);
        let score_delta = (lhs.score - rhs.score).abs();
        assert!(score_delta < 1e-6);
    }
}

/// Indexes a document with string, numeric and boolean facets, then reads
/// the stored record back through the plain KV API and checks that the text
/// and every facet round-trip unchanged.
#[test]
fn test_index_text_record_retrieval() {
    use std::collections::HashMap;

    let dir = TempDir::new().unwrap();
    let mut engine = open_engine(&dir);

    let facets = HashMap::from([
        ("author".to_string(), FacetValue::String("Alice".to_string())),
        ("views".to_string(), FacetValue::Number(42)),
        ("published".to_string(), FacetValue::Bool(true)),
    ]);

    // The map is not needed afterwards (the assertions below compare against
    // fresh literals), so it is moved into the call instead of cloned.
    engine.index_text(b"ns", b"doc1", "hello world", facets).unwrap();

    // Retrieve the raw text record via plain KV get, using the namespace
    // that `text_namespace` derives from the user-facing one.
    let text_ns = edgestore::text_namespace(b"ns");
    let raw = engine.get(&text_ns, b"doc1").unwrap().unwrap();
    let record = edgestore::decode_text_record(&raw).unwrap();

    assert_eq!(record.text, "hello world");
    assert_eq!(record.facets.get("author"), Some(&FacetValue::String("Alice".to_string())));
    assert_eq!(record.facets.get("views"), Some(&FacetValue::Number(42)));
    assert_eq!(record.facets.get("published"), Some(&FacetValue::Bool(true)));
}

/// Re-indexing an existing doc id with new text must replace its old terms:
/// the old term stops matching and the new term starts matching.
#[test]
fn test_reindex_updates_merged_index() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    // First version of doc1.
    engine.index_text(b"ns", b"doc1", "hello world", HashMap::new()).unwrap();
    let initial_hits = engine.search_text(b"ns", "hello", 5).unwrap();
    assert_eq!(initial_hits.len(), 1);
    assert_eq!(initial_hits[0].doc_id, b"doc1");

    // Second version of doc1 under the same id, with entirely different text.
    engine.index_text(b"ns", b"doc1", "foo bar", HashMap::new()).unwrap();

    let stale_hits = engine.search_text(b"ns", "hello", 5).unwrap();
    assert!(stale_hits.is_empty(), "old term 'hello' should not find re-indexed doc");

    let fresh_hits = engine.search_text(b"ns", "foo", 5).unwrap();
    assert_eq!(fresh_hits.len(), 1);
    assert_eq!(fresh_hits[0].doc_id, b"doc1");
}

/// Indexes 100 documents one at a time, deletes the even-numbered half, and
/// checks the hit count before and after the deletions.
#[test]
fn test_incremental_index_many_docs() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    // Every document shares the phrase "quick brown fox".
    for i in 0..100 {
        let key = format!("doc{i:04}");
        let text = format!("document number {i} contains quick brown fox");
        engine.index_text(b"ns", key.as_bytes(), &text, HashMap::new()).unwrap();
    }

    // k = 200 exceeds the corpus size, so every match can be returned.
    let all_hits = engine.search_text(b"ns", "quick brown", 200).unwrap();
    assert_eq!(all_hits.len(), 100, "all 100 docs should match 'quick brown'");

    // Remove the even-numbered documents (0, 2, ..., 98).
    for i in (0..100).filter(|i| i % 2 == 0) {
        let key = format!("doc{i:04}");
        engine.delete_text(b"ns", key.as_bytes()).unwrap();
    }

    let remaining_hits = engine.search_text(b"ns", "quick brown", 200).unwrap();
    assert_eq!(remaining_hits.len(), 50, "50 docs should remain after deletion");
}

/// Documents indexed under one namespace must not be visible to searches
/// in another, even when they share a doc id.
#[test]
fn test_namespace_isolation() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    // Same doc id, different namespaces, disjoint vocabularies.
    engine.index_text(b"ns1", b"doc1", "hello world", HashMap::new()).unwrap();
    engine.index_text(b"ns2", b"doc1", "foo bar", HashMap::new()).unwrap();

    let hello_in_ns1 = engine.search_text(b"ns1", "hello", 5).unwrap();
    assert_eq!(hello_in_ns1.len(), 1);

    let hello_in_ns2 = engine.search_text(b"ns2", "hello", 5).unwrap();
    assert!(hello_in_ns2.is_empty(), "ns2 should not find ns1 terms");

    let foo_in_ns2 = engine.search_text(b"ns2", "foo", 5).unwrap();
    assert_eq!(foo_in_ns2.len(), 1);
}

/// After every document in a namespace is deleted, the `__index__` entry in
/// the derived text namespace must no longer exist.
#[test]
fn test_delete_all_docs_removes_index() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    for id in ["doc1", "doc2"] {
        engine.index_text(b"ns", id.as_bytes(), "hello world", HashMap::new()).unwrap();
    }
    for id in ["doc1", "doc2"] {
        engine.delete_text(b"ns", id.as_bytes()).unwrap();
    }

    // Look the merged index up directly through the KV API.
    let text_ns = edgestore::text_namespace(b"ns");
    let stored_index = engine.get(&text_ns, b"__index__").unwrap();
    assert!(stored_index.is_none(), "merged index should be deleted when all docs removed");
}

/// Timing smoke test: indexes 10,000 documents, runs 100 identical searches
/// and asserts that the average latency stays under a build-profile
/// dependent threshold.
#[test]
fn test_search_performance_at_scale() {
    use std::collections::HashMap;
    use std::time::Instant;

    // Number of timed searches; also the divisor for the average.
    const SEARCH_ROUNDS: u32 = 100;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    // Build a 10,000-document corpus.
    let n = 10_000;
    for i in 0..n {
        let key = format!("doc{:08}", i);
        let text = format!("document number {} contains quick brown fox jumps over lazy dog", i);
        engine.index_text(b"ns", key.as_bytes(), &text, HashMap::new()).unwrap();
    }

    // Time the searches only; indexing above is excluded from the measurement.
    let timer = Instant::now();
    for _ in 0..SEARCH_ROUNDS {
        let hits = engine.search_text(b"ns", "quick brown fox", 10).unwrap();
        assert!(!hits.is_empty());
    }
    let elapsed = timer.elapsed();
    let avg_us = elapsed.as_micros() as f64 / f64::from(SEARCH_ROUNDS);

    // Threshold depends on build profile:
    //   release: 5 ms  (BENCHMARKS.md claims ~3.2 ms at 10K docs)
    //   debug:   50 ms (headroom for unoptimized builds)
    let threshold_us = if cfg!(debug_assertions) { 50_000.0 } else { 5_000.0 };
    assert!(
        avg_us < threshold_us,
        "search too slow: {:.1} µs at {} docs (threshold: {:.0} µs)",
        avg_us, n, threshold_us
    );
}

/// Documents indexed and flushed by one engine instance must be searchable
/// from a freshly opened instance on the same directory.
#[test]
fn test_cold_cache_search() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();

    // Phase 1: index and flush with a first engine, then drop it.
    let mut writer = open_engine(&tmp);
    for (id, text) in [("doc1", "hello world"), ("doc2", "hello foo")] {
        writer.index_text(b"ns", id.as_bytes(), text, HashMap::new()).unwrap();
    }
    writer.flush().unwrap();
    drop(writer);

    // Phase 2: a brand-new engine has nothing cached in memory from phase 1,
    // so the search has to be served from what was persisted.
    let reader = open_engine(&tmp);
    let hits = reader.search_text(b"ns", "hello", 5).unwrap();
    assert_eq!(hits.len(), 2, "cold-cache search should find both docs via disk fallback");
}

/// With `typo_tolerance` enabled, a query for "hello" must return both the
/// document containing "hello" and the one containing the near-miss "helo".
#[test]
fn test_typo_tolerance() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    for (id, text) in [("doc1", "hello world"), ("doc2", "helo there")] {
        engine.index_text(b"ns", id.as_bytes(), text, HashMap::new()).unwrap();
    }

    // Typo-tolerant search: doc1 matches "hello" exactly, while doc2 only
    // contains "helo", which is one edit away from the query term.
    let options = edgestore::SearchOptions { k: 5, typo_tolerance: true, ..Default::default() };
    let fuzzy_hits = engine.search_text_with_options(b"ns", "hello", &options).unwrap();

    assert!(
        fuzzy_hits.iter().any(|r| r.doc_id == b"doc1"),
        "exact match doc1 should be found"
    );
    assert!(
        fuzzy_hits.iter().any(|r| r.doc_id == b"doc2"),
        "typo-tolerant match doc2 ('helo' ~ 'hello') should be found"
    );
}

/// Deleting a document through a freshly opened engine (one that did not
/// index it) must still remove it from search results.
#[test]
fn test_delete_fallback_cache_miss() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();

    // First engine: index, flush, and go away.
    let mut writer = open_engine(&tmp);
    writer.index_text(b"ns", b"doc1", "hello world", HashMap::new()).unwrap();
    writer.flush().unwrap();
    drop(writer);

    // Second engine: starts without any in-memory state from the first one.
    let mut engine = open_engine(&tmp);
    let hits_before = engine.search_text(b"ns", "hello", 5).unwrap();
    assert_eq!(hits_before.len(), 1);

    engine.delete_text(b"ns", b"doc1").unwrap();

    let hits_after = engine.search_text(b"ns", "hello", 5).unwrap();
    assert!(hits_after.is_empty(), "delete from cold cache should remove doc");
}

/// Re-indexing a doc id with new text and new facets must replace both: old
/// terms stop matching, new terms match, and the stored record carries the
/// new facet value.
#[test]
fn test_reindex_with_facets() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();
    let mut engine = open_engine(&tmp);

    // Version 1: category = "news".
    let original_facets = HashMap::from([(
        "category".to_string(),
        FacetValue::String("news".to_string()),
    )]);
    engine.index_text(b"ns", b"doc1", "breaking news today", original_facets).unwrap();

    // Version 2 (same doc id): category = "sports", different text.
    let updated_facets = HashMap::from([(
        "category".to_string(),
        FacetValue::String("sports".to_string()),
    )]);
    engine.index_text(b"ns", b"doc1", "sports update today", updated_facets).unwrap();

    // "breaking" appeared only in the replaced text.
    let stale_hits = engine.search_text(b"ns", "breaking", 5).unwrap();
    assert!(stale_hits.is_empty(), "old text 'breaking' should not match after re-index");

    // "sports" appears only in the replacement text.
    let fresh_hits = engine.search_text(b"ns", "sports", 5).unwrap();
    assert_eq!(fresh_hits.len(), 1);
    assert_eq!(fresh_hits[0].doc_id, b"doc1");

    // The stored record, read through the KV API, must hold the new facet.
    let text_ns = edgestore::text_namespace(b"ns");
    let raw = engine.get(&text_ns, b"doc1").unwrap().unwrap();
    let record = edgestore::decode_text_record(&raw).unwrap();
    let expected_category = FacetValue::String("sports".to_string());
    assert_eq!(record.facets.get("category"), Some(&expected_category));
}

/// Simulates a crash between an index write and the next flush, then checks
/// that reopening the engine makes both the flushed and the unflushed
/// document searchable.
#[test]
fn test_crash_recovery_rebuilds_stale_sidecar() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();

    // Phase 1: index doc1 and flush, so the persisted index covers doc1.
    let mut first = open_engine(&tmp);
    first.index_text(b"ns", b"doc1", "hello world", HashMap::new()).unwrap();
    first.flush().unwrap();
    drop(first);

    // Phase 2: reopen, index doc2, and drop WITHOUT flushing. This stands in
    // for a crash: doc2 was written after the last flushed index state.
    let mut second = open_engine(&tmp);
    second.index_text(b"ns", b"doc2", "hello foo", HashMap::new()).unwrap();
    drop(second);

    // Phase 3: reopen after the "crash". The engine is expected to notice
    // that the persisted index is behind the stored records and rebuild it,
    // so both documents must be found.
    let recovered = open_engine(&tmp);
    let hits = recovered.search_text(b"ns", "hello", 5).unwrap();
    assert_eq!(hits.len(), 2, "crash recovery must rebuild stale sidecar so both docs are searchable");
    assert!(hits.iter().any(|r| r.doc_id == b"doc1"));
    assert!(hits.iter().any(|r| r.doc_id == b"doc2"));
}

/// When every index write was followed by a flush, reopening the engine
/// must find all documents immediately.
#[test]
fn test_no_rebuild_when_sidecar_fresh() {
    use std::collections::HashMap;

    let tmp = TempDir::new().unwrap();

    // Phase 1: each document is indexed and then flushed before moving on,
    // so the persisted index is up to date when the engine is dropped.
    let mut writer = open_engine(&tmp);
    for (id, text) in [("doc1", "hello world"), ("doc2", "hello foo")] {
        writer.index_text(b"ns", id.as_bytes(), text, HashMap::new()).unwrap();
        writer.flush().unwrap();
    }
    drop(writer);

    // Phase 2: reopen. Nothing was left unflushed, so both documents should
    // be searchable straight away.
    let reader = open_engine(&tmp);
    let hits = reader.search_text(b"ns", "hello", 5).unwrap();
    assert_eq!(hits.len(), 2, "fresh sidecar should contain both docs without rebuild");
}