1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
//! Overlay: in-memory view of dirty files for incremental updates.
//!
//! The overlay provides read-your-writes freshness with atomic batch commits
//! and snapshot isolation. Pending edits are invisible until `commit_batch()`.
//!
//! Design: single merged query view (research.md section 7). Each
//! `commit_batch()` incrementally rebuilds the overlay, reusing docs from
//! the previous generation for unchanged files and reading only the delta
//! from disk.
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use crate::tokenizer::build_all;
use crate::IndexError;
/// Kind of file change buffered by `notify_change` / `notify_delete`.
#[derive(Clone, Debug)]
pub enum EditKind {
    /// The file was created or its content was modified.
    Changed,
    /// The file was removed from the repository.
    Deleted,
}
/// A buffered file edit not yet committed to the index snapshot.
#[derive(Clone, Debug)]
pub struct FileEdit {
    /// Path of the edited file, relative to the repository root.
    pub path: PathBuf,
    /// Whether the file changed or was deleted.
    pub kind: EditKind,
}
/// A dirty file tracked by the overlay with its current content and grams.
#[derive(Clone, Debug)]
pub struct OverlayDoc {
    /// Doc id in overlay space (disjoint from the base segment id range).
    pub doc_id: u32,
    /// Path relative to the repository root.
    pub path: PathBuf,
    /// Current bytes of the file, kept for verification during search.
    /// Shared via `Arc` so snapshot generations carry it forward without
    /// copying the data.
    pub content: Arc<[u8]>,
    /// Cached gram hashes for this document; reused when the doc survives
    /// into the next overlay generation so it is not re-tokenized.
    pub grams: Vec<u64>,
}
/// Single merged in-memory gram index for all dirty files.
///
/// Rebuilt from scratch on each `commit_batch()`. Query execution always
/// does two lookups: base segments + this single overlay.
///
/// `Default` yields the same value as [`OverlayView::empty`]: no docs, no
/// grams, and both id counters at 0.
#[derive(Debug, Clone, Default)]
pub struct OverlayView {
    /// Map from gram hash to sorted overlay doc_ids that contain it.
    pub gram_index: HashMap<u64, Vec<u32>>,
    /// All dirty files with current content.
    pub docs: Vec<OverlayDoc>,
    /// doc_id -> index into `docs` for O(1) lookup.
    doc_id_map: HashMap<u32, usize>,
    /// Next overlay-space doc_id (starts after base range).
    pub next_doc_id: u32,
    /// The base_doc_count at which this overlay was built.
    /// Used to detect whether a segment flush occurred between commits.
    pub base_doc_count: u32,
}
impl OverlayView {
/// Create an empty overlay view.
///
/// All collections start empty; `next_doc_id` and `base_doc_count` are 0.
pub fn empty() -> Self {
OverlayView {
gram_index: HashMap::new(),
docs: Vec::new(),
doc_id_map: HashMap::new(),
next_doc_id: 0,
base_doc_count: 0,
}
}
/// Build an overlay from a set of dirty files.
///
/// # Caller contract: content must be pre-normalized
///
/// `dirty_files` content is stored as-is and returned by the resolver for
/// verification. Callers MUST call `normalize_encoding` before building the
/// overlay so that UTF-16 / BOM content is transcoded to UTF-8. `commit_batch`
/// in `index/mod.rs` satisfies this contract. Direct callers (e.g., tests)
/// using raw byte literals are exempt because ASCII source is already valid UTF-8.
///
/// `base_doc_count` is the total doc count across all base segments.
/// Overlay doc_ids start at `base_doc_count` to stay disjoint.
/// `dirty_files` maps repo-relative path to file content.
///
/// # Errors
///
/// Returns `IndexError::DocIdOverflow` if allocating an overlay doc_id
/// would overflow `u32` (see `next_doc_id`).
pub fn build(
base_doc_count: u32,
dirty_files: Vec<(PathBuf, Arc<[u8]>)>,
) -> Result<Self, IndexError> {
let mut gram_index: HashMap<u64, Vec<u32>> = HashMap::new();
// Recorded up front for `DocIdOverflow` error context.
let overlay_docs = dirty_files.len();
let mut docs = Vec::with_capacity(dirty_files.len());
let mut next_id = base_doc_count;
for (path, content) in dirty_files {
let doc_id = Self::next_doc_id(&mut next_id, base_doc_count, overlay_docs)?;
let grams = build_all(&content);
// A gram repeated within one doc pushes the same doc_id more than once;
// the dedup pass below removes those duplicates.
for &gram_hash in &grams {
gram_index.entry(gram_hash).or_default().push(doc_id);
}
docs.push(OverlayDoc {
doc_id,
path,
content,
grams,
});
}
// Sort posting lists for consistent intersection behavior.
for ids in gram_index.values_mut() {
ids.sort_unstable();
ids.dedup();
}
let doc_id_map = docs
.iter()
.enumerate()
.map(|(i, d)| (d.doc_id, i))
.collect();
Ok(OverlayView {
gram_index,
docs,
doc_id_map,
next_doc_id: next_id,
base_doc_count,
})
}
/// Build an overlay incrementally, reusing docs from the previous overlay
/// for files that haven't changed since the last commit.
///
/// Only `new_files` are read from disk and tokenized fresh. Unchanged docs
/// from `old_overlay` are carried forward with `Arc::clone` on content
/// (refcount bump, no data copy). All docs are re-indexed with new doc_ids
/// since the ID space starts at `base_doc_count`.
///
/// Fast path: when `base_doc_count == old_overlay.base_doc_count`, overlay
/// doc_ids for unchanged files are stable and the delta path is used instead
/// of a full rebuild.
///
/// # Errors
///
/// Returns `IndexError::DocIdOverflow` if doc_id allocation overflows `u32`.
pub fn build_incremental(
base_doc_count: u32,
old_overlay: &OverlayView,
new_files: Vec<(PathBuf, Arc<[u8]>)>,
newly_changed: &HashSet<PathBuf>,
removed_paths: &HashSet<PathBuf>,
) -> Result<Self, IndexError> {
// Fast path: base has not grown since the last commit.
// Overlay doc_ids for unchanged files are stable; use delta update.
if base_doc_count == old_overlay.base_doc_count {
return Self::build_incremental_delta(
base_doc_count,
old_overlay,
new_files,
newly_changed,
removed_paths,
);
}
let mut gram_index: HashMap<u64, Vec<u32>> = HashMap::new();
// Expected final doc count (old survivors + fresh files), used only as
// `DocIdOverflow` error context.
let overlay_docs =
(old_overlay.docs.len() + new_files.len()).saturating_sub(newly_changed.len());
let mut docs = Vec::new();
let mut next_id = base_doc_count;
// Reuse old overlay docs that are unchanged.
for old_doc in &old_overlay.docs {
if removed_paths.contains(&old_doc.path) {
continue;
}
if newly_changed.contains(&old_doc.path) {
continue; // replaced by new version below
}
let doc_id = Self::next_doc_id(&mut next_id, base_doc_count, overlay_docs)?;
// Reuse cached grams instead of re-tokenizing.
for &gram_hash in &old_doc.grams {
gram_index.entry(gram_hash).or_default().push(doc_id);
}
docs.push(OverlayDoc {
doc_id,
path: old_doc.path.clone(),
content: Arc::clone(&old_doc.content),
grams: old_doc.grams.clone(),
});
}
// Add newly changed/added files (freshly read from disk).
for (path, content) in new_files {
let doc_id = Self::next_doc_id(&mut next_id, base_doc_count, overlay_docs)?;
let grams = build_all(&content);
for &gram_hash in &grams {
gram_index.entry(gram_hash).or_default().push(doc_id);
}
docs.push(OverlayDoc {
doc_id,
path,
content,
grams,
});
}
// Sort + dedup posting lists, same as `build`.
for ids in gram_index.values_mut() {
ids.sort_unstable();
ids.dedup();
}
let doc_id_map = docs
.iter()
.enumerate()
.map(|(i, d)| (d.doc_id, i))
.collect();
Ok(OverlayView {
gram_index,
docs,
doc_id_map,
next_doc_id: next_id,
base_doc_count,
})
}
/// Allocate the next overlay doc_id, advancing `next_id` by 1.
///
/// Returns `IndexError::DocIdOverflow` (carrying `base_doc_count` and the
/// overlay doc count for diagnostics) when the increment would wrap `u32`.
#[inline]
fn next_doc_id(
next_id: &mut u32,
base_doc_count: u32,
overlay_docs: usize,
) -> Result<u32, IndexError> {
let id = *next_id;
*next_id = next_id.checked_add(1).ok_or(IndexError::DocIdOverflow {
base_doc_count,
overlay_docs,
})?;
Ok(id)
}
/// Delta path: base_doc_count is unchanged, so overlay doc_ids for unchanged
/// files are stable. Clone the old gram_index, remove stale entries for
/// changed/deleted files using their cached grams, append new doc_ids for
/// new/changed files. New doc_ids are always > all existing ids so posting
/// lists remain sorted after push.
fn build_incremental_delta(
base_doc_count: u32,
old_overlay: &OverlayView,
new_files: Vec<(PathBuf, Arc<[u8]>)>,
newly_changed: &HashSet<PathBuf>,
removed_paths: &HashSet<PathBuf>,
) -> Result<Self, IndexError> {
// Clone old gram_index; remove stale entries for changed/deleted files.
//
// Cost: O(gram_index.len()), NOT O(changed files). For an overlay with
// 1000 dirty files x ~120 grams = 120K entries, this is ~1 MB of
// HashMap clone per commit_batch call regardless of how many files changed.
// The delta path was optimised for the single-file edit case: the clone
// is fast in practice (< 5 ms for typical overlay sizes), but grows
// linearly with overlay size and eventually dominates the delta advantage.
//
// In-place mutation is NOT possible: the old OverlayView is embedded in
// Arc<IndexSnapshot> (snapshot.rs), held by ArcSwap + in-flight readers.
// Arc::get_mut requires refcount == 1, which never holds during normal
// operation. Wrapping gram_index in Arc<HashMap> doesn't help either:
// the old overlay still holds its Arc ref through the snapshot, so
// Arc::make_mut would clone anyway.
//
// Note: the .cloned() carry-forward of unchanged OverlayDocs (below)
// has comparable allocation cost (clones grams: Vec<u64> and path per
// doc). Both costs should be addressed together.
//
// Mitigation (v2): use a persistent/CoW map (e.g., `im::HashMap`) so
// only the changed entries are copied, or HashMap<u64, Arc<Vec<u32>>>
// to share posting list data. See ARCHITECTURE.md "Overlay compaction".
let mut gram_index = old_overlay.gram_index.clone();
// Expected final doc count; only used as `DocIdOverflow` error context.
let overlay_docs =
(old_overlay.docs.len() + new_files.len()).saturating_sub(newly_changed.len());
// Strip the stale doc_ids of changed/deleted docs from their posting
// lists, using each doc's cached grams to find the affected lists.
for old_doc in &old_overlay.docs {
if removed_paths.contains(&old_doc.path) || newly_changed.contains(&old_doc.path) {
for &gram_hash in &old_doc.grams {
if let Some(list) = gram_index.get_mut(&gram_hash) {
list.retain(|&id| id != old_doc.doc_id);
}
}
}
}
// Drop posting lists that became empty after removal.
gram_index.retain(|_, list| !list.is_empty());
// Carry forward unchanged docs with their existing (stable) doc_ids.
// Cost: .cloned() deep-copies grams (Vec<u64>, ~120 elements) and path
// (PathBuf) per unchanged doc. Content is Arc<[u8]> (refcount bump only).
// For single-file edits in a 1000-doc overlay, this is ~999 Vec clones,
// comparable to the gram_index clone above.
let mut docs: Vec<OverlayDoc> = old_overlay
.docs
.iter()
.filter(|d| !removed_paths.contains(&d.path) && !newly_changed.contains(&d.path))
.cloned()
.collect();
// New/changed files get fresh doc_ids starting from next_doc_id.
// Since next_doc_id > all existing doc_ids, push keeps posting lists sorted.
let mut next_id = old_overlay.next_doc_id;
for (path, content) in new_files {
let doc_id = Self::next_doc_id(&mut next_id, base_doc_count, overlay_docs)?;
let grams = build_all(&content);
for &gram_hash in &grams {
gram_index.entry(gram_hash).or_default().push(doc_id);
}
docs.push(OverlayDoc {
doc_id,
path,
content,
grams,
});
}
// Sorted-order invariant: new doc_ids > all existing, so push() keeps lists sorted.
// This is structurally guaranteed by monotonic next_doc_id, but we enforce it
// defensively in release builds: if a future refactor breaks monotonicity,
// unsorted posting lists cause silent intersection misses.
// (`>=` rather than `>`: equal neighbors mean duplicate ids, which the
// dedup below must also clean up.)
for ids in gram_index.values_mut() {
if ids.windows(2).any(|w| w[0] >= w[1]) {
ids.sort_unstable();
ids.dedup();
}
}
let doc_id_map = docs
.iter()
.enumerate()
.map(|(i, d)| (d.doc_id, i))
.collect();
Ok(OverlayView {
gram_index,
docs,
doc_id_map,
next_doc_id: next_id,
base_doc_count,
})
}
/// Look up an overlay doc by its global doc_id. O(1) via HashMap.
pub fn get_doc(&self, global_id: u32) -> Option<&OverlayDoc> {
self.doc_id_map.get(&global_id).map(|&idx| &self.docs[idx])
}
/// Look up an overlay doc by path.
///
/// Linear scan over `docs` (O(n)); `doc_id_map` indexes only by doc_id.
pub fn get_doc_by_path(&self, path: &Path) -> Option<&OverlayDoc> {
self.docs.iter().find(|d| d.path == path)
}
}
// Re-export pending types so callers using `crate::index::overlay::*` continue to compile.
pub use crate::index::pending::{compute_delete_set, PendingEdits, TakeResult};
#[cfg(test)]
#[path = "overlay_tests.rs"]
mod tests;