1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
//! Manifest: tracks active segments and overlay state.
//!
//! Persisted as `manifest.json` using atomic write-then-rename.
//! Reads on startup load segment metadata; GC removes orphan files.
// io::Error::new(ErrorKind::Other, ...) is used instead of io::Error::other()
// for Rust < 1.74 compatibility (Windows CI constraint).
#![allow(clippy::io_other_error)]
use std::io;
use std::io::Write;
use std::path::Path;
use std::time::{SystemTime, UNIX_EPOCH};
use serde::{Deserialize, Serialize};
use crate::index::segment::SegmentMeta;
use crate::IndexError;
/// Reference to an active base segment stored in the manifest.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentRef {
/// UUID string that uniquely identifies this segment across rebuilds.
pub segment_id: String,
/// First global doc_id stored in this segment, if explicitly recorded.
#[serde(default)]
pub base_doc_id: Option<u32>,
/// Legacy combined filename (`<uuid>.seg`). Empty for v3 segments.
#[serde(default)]
pub filename: String,
/// Dictionary filename (`<uuid>.dict`) for v3 segments. Empty for v2.
#[serde(default)]
pub dict_filename: String,
/// Postings filename (`<uuid>.post`) for v3 segments. Empty for v2.
#[serde(default)]
pub post_filename: String,
/// Number of documents (files) indexed in this segment.
pub doc_count: u32,
/// Number of distinct n-gram hashes stored in this segment's dictionary.
pub gram_count: u32,
}
impl From<SegmentMeta> for SegmentRef {
fn from(m: SegmentMeta) -> Self {
SegmentRef {
segment_id: m.segment_id.to_string(),
base_doc_id: None,
filename: m.filename,
dict_filename: m.dict_filename,
post_filename: m.post_filename,
doc_count: m.doc_count,
gram_count: m.gram_count,
}
}
}
/// On-disk manifest describing the current index state.
///
/// Persisted as `manifest.json` via atomic write-then-rename so readers never
/// see a partially-written file. The `opstamp` field is reserved for
/// ordering concurrent writers in future phases.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Manifest {
/// Format version; must equal `FORMAT_VERSION` (currently 1).
pub version: u32,
/// Git commit SHA the base segments were built from, if recorded.
pub base_commit: Option<String>,
/// All active base segments; GC removes any `.seg` files not listed here.
pub segments: Vec<SegmentRef>,
/// Generation counter for overlay crash recovery. Reserved for future use;
/// always written as 0 by the current implementation. On-disk generation
/// files described in research.md §12 are not yet written (deferred to a
/// later milestone). Do not remove: older manifest files include this field
/// and removing it would silently corrupt them during deserialization.
pub overlay_gen: u64,
/// Filename of the on-disk overlay gram index, if present.
pub overlay_file: Option<String>,
/// Filename of the on-disk overlay deletes list, if present.
pub overlay_deletes_file: Option<String>,
/// Total files successfully indexed across all segments.
pub total_files_indexed: u32,
/// Unix timestamp (seconds) when this manifest was first created.
pub created_at: u64,
/// Operation stamp for future concurrent-write ordering (currently 0).
pub opstamp: u64,
/// Calibrated index vs. full-scan crossover fraction (0.0–1.0).
/// `None` means this index was built without calibration; callers should
/// fall back to the compile-time default of 0.10.
#[serde(default)]
pub scan_threshold_fraction: Option<f64>,
/// xxh64 checksum of this manifest serialized with this field omitted.
/// `None` means the manifest predates checksum support.
#[serde(skip_serializing_if = "Option::is_none", default)]
pub checksum: Option<u64>,
}
impl Manifest {
const FILENAME: &'static str = "manifest.json";
const FORMAT_VERSION: u32 = 1;
/// Create a new manifest from a list of completed segments.
pub fn new(segments: Vec<SegmentRef>, total_files_indexed: u32) -> Self {
let created_at = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_secs())
.unwrap_or(0);
Manifest {
version: Self::FORMAT_VERSION,
base_commit: None,
segments,
overlay_gen: 0,
overlay_file: None,
overlay_deletes_file: None,
total_files_indexed,
created_at,
opstamp: 0,
scan_threshold_fraction: None, // populated by Index::build() after calibration
checksum: None,
}
}
/// Maximum manifest file size (10 MB). A normal manifest for a 100K-file
/// repo is under 1 MB. Anything larger is likely corrupt or adversarial.
const MAX_MANIFEST_SIZE: u64 = 10 * 1024 * 1024;
/// Load the manifest from `index_dir/manifest.json`.
///
/// Security audit (deserialization): `serde_json::from_str` performs pure data
/// parsing with no code execution, no gadget chains, and no polymorphic type
/// resolution. The 10 MB size cap (`MAX_MANIFEST_SIZE`) bounds memory before
/// parsing begins. Post-parse, `MAX_TOTAL_DOCS` (50M, in `index/mod.rs`)
/// caps the doc count derived from segment refs, preventing unbounded
/// allocations from a crafted manifest with inflated `doc_count` values.
pub fn load(index_dir: &Path) -> Result<Self, IndexError> {
let path = index_dir.join(Self::FILENAME);
let meta = std::fs::metadata(&path)?;
if meta.len() > Self::MAX_MANIFEST_SIZE {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"manifest too large ({} bytes, max {})",
meta.len(),
Self::MAX_MANIFEST_SIZE
),
)
.into());
}
let data = std::fs::read_to_string(&path)?;
let manifest: Self = serde_json::from_str(&data)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
if let Some(stored_checksum) = manifest.checksum {
let mut canonical_manifest = manifest.clone();
canonical_manifest.checksum = None;
let canonical_json = serde_json::to_string(&canonical_manifest)
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
let computed_checksum = xxhash_rust::xxh64::xxh64(canonical_json.as_bytes(), 0);
if computed_checksum != stored_checksum {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"manifest checksum mismatch: stored {stored_checksum:#x}, computed {computed_checksum:#x}"
),
)
.into());
}
} else {
eprintln!(
"syntext: warning: manifest at '{}' has no checksum; \
re-run `st index` to add one",
path.display(),
);
}
// Validate that each segment_id is a well-formed UUID. The field is not
// currently used in filesystem paths, but future code (GC, logging,
// metrics) could join it to a path. Reject non-UUID values proactively.
for seg in &manifest.segments {
if uuid::Uuid::parse_str(&seg.segment_id).is_err() {
return Err(IndexError::CorruptIndex(format!(
"segment_id is not a valid UUID: {:?}",
seg.segment_id
)));
}
}
Ok(manifest)
}
/// Atomically persist the manifest to `index_dir/manifest.json`.
pub fn save(&self, index_dir: &Path) -> io::Result<()> {
// Use a random UUID for the temporary file to prevent TOCTOU (Time-of-Check to Time-of-Use)
// symlink attacks where an attacker could pre-create `manifest.json.tmp` as a symlink
// leading to arbitrary file overwrite.
let tmp = index_dir.join(format!("manifest-{}.tmp", uuid::Uuid::new_v4()));
let final_path = index_dir.join(Self::FILENAME);
let mut canonical_manifest = self.clone();
canonical_manifest.checksum = None;
let canonical_json = serde_json::to_string(&canonical_manifest)
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
let checksum = xxhash_rust::xxh64::xxh64(canonical_json.as_bytes(), 0);
let mut persisted_manifest = self.clone();
persisted_manifest.checksum = Some(checksum);
let json = serde_json::to_string_pretty(&persisted_manifest)
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
{
let mut file = std::fs::File::create(&tmp)?;
file.write_all(json.as_bytes())?;
file.sync_all()?;
}
std::fs::rename(&tmp, &final_path)?;
#[cfg(not(windows))]
std::fs::File::open(index_dir)?.sync_all()?;
Ok(())
}
/// Remove segment files that exist on disk but are not referenced by this
/// manifest. Called during build and after compaction to clean up orphans.
///
/// **Unix safety:** `MmapSegment` holds an open `File` handle (`_file` field).
/// On Linux and macOS, `unlink(2)` removes the directory entry but the inode
/// (and mmap) remain valid until all file descriptors are closed. A reader
/// that opened a segment before GC runs will continue to operate correctly.
///
/// **Windows caveat:** Windows does not allow deleting a file that is open
/// by another process/handle. `fs::remove_file` will return an error for
/// any segment still mmap'd by an open `Index`. This is a known v1 limitation.
pub fn gc_orphan_segments(&self, index_dir: &Path) -> io::Result<()> {
let mut known: std::collections::HashSet<&str> = std::collections::HashSet::new();
for s in &self.segments {
if !s.filename.is_empty() {
known.insert(s.filename.as_str());
}
if !s.dict_filename.is_empty() {
known.insert(s.dict_filename.as_str());
}
if !s.post_filename.is_empty() {
known.insert(s.post_filename.as_str());
}
}
for entry in std::fs::read_dir(index_dir)? {
let entry = entry?;
let name = entry.file_name();
let name_str = name.to_string_lossy();
let is_segment_file = name_str.ends_with(".seg")
|| name_str.ends_with(".dict")
|| name_str.ends_with(".post");
if is_segment_file && !known.contains(name_str.as_ref()) {
if let Err(e) = std::fs::remove_file(entry.path()) {
eprintln!("syntext: gc: could not remove {}: {e}", name_str);
}
}
if name_str.ends_with(".tmp") {
let _ = std::fs::remove_file(entry.path());
}
}
Ok(())
}
/// Total number of documents across all segments.
pub fn total_docs(&self) -> u32 {
self.segments.iter().map(|s| s.doc_count).sum()
}
}
#[cfg(test)]
#[path = "manifest_tests.rs"]
mod tests;