1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
//! Sketch track extensions for `Memvid`.
//!
//! This module provides methods for fast candidate generation using
//! per-frame sketches (`SimHash` + term filters). Sketches enable sub-millisecond
//! candidate filtering before expensive BM25/vector reranking.
//!
//! Key operations:
//! - `insert_sketch`: Add a sketch for a frame
//! - `build_all_sketches`: Generate sketches for all existing frames
//! - `find_sketch_candidates`: Find candidate frames matching a query
//! - `sketch_stats`: Get statistics about the sketch track
use crate::memvid::lifecycle::Memvid;
use crate::types::{
DEFAULT_HAMMING_THRESHOLD, FrameId, QuerySketch, SketchEntry, SketchTrack, SketchTrackStats,
SketchVariant, generate_sketch,
};
/// Result of a sketch candidate search.
#[derive(Debug, Clone)]
pub struct SketchCandidate {
/// Frame ID.
pub frame_id: FrameId,
/// Sketch score (higher is better, 0.0-1.0).
pub score: f32,
/// Hamming distance from query `SimHash`.
pub hamming_distance: u32,
/// Number of matching top terms.
pub matching_top_terms: usize,
}
/// Options for sketch candidate search.
#[derive(Debug, Clone)]
pub struct SketchSearchOptions {
/// Maximum Hamming distance to consider (default: 10).
pub hamming_threshold: u32,
/// Maximum number of candidates to return (default: 2000).
pub max_candidates: usize,
/// Minimum score threshold (default: 0.0).
pub min_score: f32,
}
impl Default for SketchSearchOptions {
fn default() -> Self {
Self {
hamming_threshold: DEFAULT_HAMMING_THRESHOLD,
max_candidates: 2000,
min_score: 0.0,
}
}
}
/// Detailed statistics from a sketch search.
#[derive(Debug, Clone)]
pub struct SketchSearchStats {
/// Total frames scanned.
pub frames_scanned: usize,
/// Frames passing term filter.
pub term_filter_hits: usize,
/// Frames passing `SimHash` threshold.
pub simhash_hits: usize,
/// Final candidates returned.
pub candidates_returned: usize,
/// Scan time in microseconds.
pub scan_us: u64,
}
impl Memvid {
/// Get an immutable reference to the sketch track.
#[must_use]
pub fn sketches(&self) -> &SketchTrack {
&self.sketch_track
}
/// Get a mutable reference to the sketch track.
pub fn sketches_mut(&mut self) -> &mut SketchTrack {
self.dirty = true;
&mut self.sketch_track
}
/// Check if the sketch track has any entries.
#[must_use]
pub fn has_sketches(&self) -> bool {
!self.sketch_track.is_empty()
}
/// Get statistics about the sketch track.
#[must_use]
pub fn sketch_stats(&self) -> SketchTrackStats {
self.sketch_track.stats()
}
/// Insert a sketch for a frame.
///
/// # Arguments
/// * `frame_id` - The frame ID to create a sketch for
/// * `text` - The text content to sketch (`search_text` or payload)
/// * `variant` - The sketch variant to use (Small recommended)
///
/// # Returns
/// The generated sketch entry.
pub fn insert_sketch(
&mut self,
frame_id: FrameId,
text: &str,
variant: SketchVariant,
) -> SketchEntry {
let entry = generate_sketch(frame_id, text, variant, None);
self.sketch_track.insert(entry.clone());
self.dirty = true;
entry
}
/// Build sketches for all frames that don't have one yet.
///
/// This scans all active frames and generates sketches using each frame's
/// `search_text` field.
///
/// # Arguments
/// * `variant` - The sketch variant to use for new sketches
///
/// # Returns
/// Number of new sketches generated.
pub fn build_all_sketches(&mut self, variant: SketchVariant) -> usize {
let mut count = 0;
// Collect frames that need sketches
let frames_to_sketch: Vec<(FrameId, String)> = self
.toc
.frames
.iter()
.filter(|f| f.status == crate::types::FrameStatus::Active)
.filter(|f| self.sketch_track.get(f.id).is_none())
.filter_map(|f| {
f.search_text
.clone()
.filter(|t| !t.is_empty())
.map(|text| (f.id, text))
})
.collect();
for (frame_id, text) in frames_to_sketch {
self.insert_sketch(frame_id, &text, variant);
count += 1;
}
if count > 0 {
self.dirty = true;
}
count
}
/// Find candidate frames matching a query using sketch filtering.
///
/// This performs a fast two-stage filter:
/// 1. Term filter: reject frames that can't possibly contain query terms
/// 2. `SimHash`: reject frames with large Hamming distance from query
///
/// Candidates should be reranked with BM25 or vector similarity for final results.
///
/// # Arguments
/// * `query` - The query text to search for
/// * `options` - Search options (or use default)
///
/// # Returns
/// List of candidate frames with scores, sorted by score descending.
#[must_use]
pub fn find_sketch_candidates(
&self,
query: &str,
options: Option<SketchSearchOptions>,
) -> Vec<SketchCandidate> {
let opts = options.unwrap_or_default();
// Build query sketch using same variant as track
let query_sketch = QuerySketch::from_query(query, self.sketch_track.variant);
// Find candidates
let raw_candidates = self.sketch_track.find_candidates(
&query_sketch,
opts.hamming_threshold,
opts.max_candidates,
);
// Convert to SketchCandidate with additional details
raw_candidates
.into_iter()
.filter(|(_, score)| *score >= opts.min_score)
.map(|(frame_id, score)| {
let entry = self.sketch_track.get(frame_id);
let hamming_distance =
entry.map_or(64, |e| e.hamming_distance(query_sketch.simhash));
let matching_top_terms =
entry.map_or(0, |e| e.count_matching_top_terms(&query_sketch.top_terms));
SketchCandidate {
frame_id,
score,
hamming_distance,
matching_top_terms,
}
})
.collect()
}
/// Find candidates with detailed statistics for debugging/explain mode.
#[must_use]
pub fn find_sketch_candidates_with_stats(
&self,
query: &str,
options: Option<SketchSearchOptions>,
) -> (Vec<SketchCandidate>, SketchSearchStats) {
let start = std::time::Instant::now();
let opts = options.unwrap_or_default();
let query_sketch = QuerySketch::from_query(query, self.sketch_track.variant);
let frames_scanned = self.sketch_track.len();
let mut term_filter_hits = 0usize;
let mut simhash_hits = 0usize;
// Manual scan for stats collection
let mut candidates: Vec<(FrameId, f32)> = Vec::new();
for entry in self.sketch_track.iter() {
// Term filter check
if !entry.term_filter_maybe_overlaps(&query_sketch.term_filter) {
continue;
}
term_filter_hits += 1;
// SimHash check
let hamming = entry.hamming_distance(query_sketch.simhash);
if hamming > opts.hamming_threshold {
continue;
}
simhash_hits += 1;
// Score
if let Some(score) = query_sketch.score_entry(entry, opts.hamming_threshold) {
if score >= opts.min_score {
candidates.push((entry.frame_id, score));
}
}
}
// Sort by score descending
candidates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
candidates.truncate(opts.max_candidates);
let candidates_returned = candidates.len();
let result: Vec<SketchCandidate> = candidates
.into_iter()
.map(|(frame_id, score)| {
let entry = self.sketch_track.get(frame_id);
let hamming_distance =
entry.map_or(64, |e| e.hamming_distance(query_sketch.simhash));
let matching_top_terms =
entry.map_or(0, |e| e.count_matching_top_terms(&query_sketch.top_terms));
SketchCandidate {
frame_id,
score,
hamming_distance,
matching_top_terms,
}
})
.collect();
let stats = SketchSearchStats {
frames_scanned,
term_filter_hits,
simhash_hits,
candidates_returned,
scan_us: u64::try_from(start.elapsed().as_micros()).unwrap_or(u64::MAX),
};
(result, stats)
}
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::NamedTempFile;
#[test]
fn test_sketch_insert_and_query() {
let tmp = NamedTempFile::new().expect("tempfile");
let path = tmp.path();
// Create a new memory
let mut mem = Memvid::create(path).expect("create");
// Insert some test sketches
mem.insert_sketch(0, "cats are wonderful pets", SketchVariant::Small);
mem.insert_sketch(1, "dogs are loyal companions", SketchVariant::Small);
mem.insert_sketch(2, "programming in rust language", SketchVariant::Small);
// Query for cats
let candidates = mem.find_sketch_candidates("cats pets", None);
// Should find some candidates
assert!(!candidates.is_empty() || !mem.sketches().is_empty());
}
#[test]
fn test_sketch_candidate_speed() {
let tmp = NamedTempFile::new().expect("tempfile");
let path = tmp.path();
let mut mem = Memvid::create(path).expect("create");
// Insert many sketches for a realistic benchmark
let docs = [
"machine learning artificial intelligence neural networks deep learning",
"rust programming language memory safety systems programming",
"cloud computing kubernetes containers docker orchestration",
"database optimization indexing query planning performance",
"web development react vue javascript frontend frameworks",
];
for i in 0..1000 {
let doc = docs[i % docs.len()];
mem.insert_sketch(i as u64, doc, SketchVariant::Small);
}
// Benchmark: run query with stats
let query = "machine learning neural networks";
// Use relaxed threshold (32 instead of default 10) for better recall
let options = Some(SketchSearchOptions {
hamming_threshold: 32, // Half of 64 bits
max_candidates: 100,
min_score: 0.0,
});
let (candidates, stats) = mem.find_sketch_candidates_with_stats(query, options);
// Print benchmark results
println!("\n=== Sketch Candidate Benchmark ===");
println!(" Documents: 1000");
println!(" Query: \"{}\"", query);
println!(" Scanned: {} entries", stats.frames_scanned);
println!(
" Term hits: {} ({:.1}%)",
stats.term_filter_hits,
stats.term_filter_hits as f64 / stats.frames_scanned as f64 * 100.0
);
println!(
" SimHash hits: {} ({:.1}%)",
stats.simhash_hits,
stats.simhash_hits as f64 / stats.frames_scanned as f64 * 100.0
);
println!(" Candidates: {}", stats.candidates_returned);
println!(
" Time: {} µs ({:.3} ms)",
stats.scan_us,
stats.scan_us as f64 / 1000.0
);
println!(
" Rate: {:.0} docs/sec",
stats.frames_scanned as f64 / (stats.scan_us as f64 / 1_000_000.0)
);
assert!(
stats.scan_us < 10_000,
"Sketch scan should complete in <10ms for 1000 docs"
);
assert!(
candidates.len() <= stats.simhash_hits,
"Candidates should be <= simhash hits"
);
}
}