// ix/builder.rs
//! Index builder — the complete pipeline from files to .ix shard.
//!
//! Phase 1: Discovery (walk directory, respect .gitignore)
//! Phase 2: Scan (mmap, check binary, extract trigrams, bloom filter)
//! Phase 3: Serialize (write sections, compute CRCs, atomic rename)

use std::collections::{BinaryHeap, HashMap};
use std::fs::{self, File};
use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::time::{Instant, SystemTime, UNIX_EPOCH};

use ignore::WalkBuilder;
use libc;
use llmosafe::{
    llmosafe_kernel::{KernelError, ValidatedSynapse},
    sift_perceptions, ResourceGuard, Synapse, WorkingMemory,
};
use memmap2::Mmap;

use crate::bloom::BloomFilter;
use crate::decompress::maybe_decompress;
use crate::error::Result;
use crate::format::*;
use crate::posting::{PostingEntry, PostingList};
use crate::trigram::{Extractor, Trigram};

/// Index builder — walks a source tree and produces a single `.ix` shard.
///
/// The file table, bloom filters, and string pool are streamed straight to
/// temporary files, so memory stays flat in the number of files; postings
/// are batched in memory and spilled to sorted "run" files for an external
/// merge sort.
pub struct Builder {
    root: PathBuf,
    ix_dir: PathBuf,
    // Next file id to assign; also the number of files indexed so far.
    file_count: u32,

    // O(1) memory streaming writers for temporary file table and blooms
    files_writer: BufWriter<File>,
    blooms_writer: BufWriter<File>,
    strings_writer: BufWriter<File>,

    // Postings batching for external sort
    postings: HashMap<Trigram, Vec<PostingEntry>>,
    // Approximate in-memory cost of `postings`; drives run flushing.
    postings_count: usize,
    // Sorted run files on disk awaiting the final k-way merge.
    temp_runs: Vec<PathBuf>,

    extractor: Extractor,
    stats: BuildStats,
    // When true, archive contents are decompressed before scanning.
    decompress: bool,
    // Optional formal memory guard consulted periodically during build().
    resource_guard: Option<ResourceGuard>,
    // LLMOSafe perceptual-sifting state used to accept/reject file samples.
    cognitive_memory: WorkingMemory<128>,
    // Paths skipped after a backtrack signal during the walk.
    dead_ends: Vec<PathBuf>,
}

/// Counters accumulated during a build and reported when the shard completes.
#[derive(Default, Debug)]
pub struct BuildStats {
    // Files whose trigrams were indexed.
    pub files_scanned: u64,
    // Files rejected by the binary-content sniff.
    pub files_skipped_binary: u64,
    // Files rejected for exceeding the size cap.
    pub files_skipped_size: u64,
    // Total bytes of indexed content.
    pub bytes_scanned: u64,
    // Distinct trigrams across the whole shard (set during serialize).
    pub unique_trigrams: u64,
}

/// Sequential reader over one sorted temporary "run" file.
struct RunIterator {
    file: BufReader<File>,
}

62impl RunIterator {
63    fn new(path: &Path) -> Result<Self> {
64        let f = File::open(path)?;
65        Ok(Self {
66            file: BufReader::new(f),
67        })
68    }
69
70    fn next_trigram(&mut self) -> Result<Option<(Trigram, Vec<PostingEntry>)>> {
71        let mut tri_buf = [0u8; 4];
72        if let Err(e) = self.file.read_exact(&mut tri_buf) {
73            if e.kind() == std::io::ErrorKind::UnexpectedEof {
74                return Ok(None);
75            }
76            return Err(e.into());
77        }
78        let tri = u32::from_le_bytes(tri_buf);
79
80        let mut len_buf = [0u8; 4];
81        self.file.read_exact(&mut len_buf)?;
82        let entries_len = u32::from_le_bytes(len_buf) as usize;
83
84        let mut entries = Vec::with_capacity(entries_len);
85        for _ in 0..entries_len {
86            self.file.read_exact(&mut len_buf)?;
87            let file_id = u32::from_le_bytes(len_buf);
88
89            self.file.read_exact(&mut len_buf)?;
90            let offsets_len = u32::from_le_bytes(len_buf) as usize;
91
92            let mut offsets = Vec::with_capacity(offsets_len);
93            for _ in 0..offsets_len {
94                self.file.read_exact(&mut len_buf)?;
95                offsets.push(u32::from_le_bytes(len_buf));
96            }
97            entries.push(PostingEntry { file_id, offsets });
98        }
99
100        Ok(Some((tri, entries)))
101    }
102}
103
104#[derive(Eq, PartialEq)]
105struct MergeItem {
106    tri: Trigram,
107    run_idx: usize,
108}
109
110impl PartialOrd for MergeItem {
111    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
112        Some(self.cmp(other))
113    }
114}
115
116impl Ord for MergeItem {
117    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
118        other.tri.cmp(&self.tri) // Min-heap
119    }
120}
121
122impl Builder {
123    pub fn new(root: &Path) -> Result<Self> {
124        let ix_dir = root.join(".ix");
125        fs::create_dir_all(&ix_dir)?;
126        
127        let files_tmp = ix_dir.join("shard.ix.tmp.files");
128        let blooms_tmp = ix_dir.join("shard.ix.tmp.blooms");
129        let strings_tmp = ix_dir.join("shard.ix.tmp.strings");
130        
131        let files_writer = BufWriter::new(File::create(&files_tmp)?);
132        let blooms_writer = BufWriter::new(File::create(&blooms_tmp)?);
133        let mut strings_writer = BufWriter::new(File::create(&strings_tmp)?);
134        
135        strings_writer.write_all(&1u32.to_le_bytes())?;
136        strings_writer.write_all(&0u16.to_le_bytes())?;
137        strings_writer.write_all(&0u16.to_le_bytes())?;
138        strings_writer.write_all(&[0u8; 2])?;
139
140        Ok(Self {
141            root: root.to_owned(),
142            ix_dir,
143            file_count: 0,
144            files_writer,
145            blooms_writer,
146            strings_writer,
147            postings: HashMap::new(),
148            postings_count: 0,
149            temp_runs: Vec::new(),
150            extractor: Extractor::new(),
151            stats: BuildStats::default(),
152            decompress: false,
153            resource_guard: None,
154            cognitive_memory: WorkingMemory::new(1000), // Standard surprise threshold
155            dead_ends: Vec::new(),
156        })
157    }
158
    /// Attaches a formal resource guard; `build` consults it periodically
    /// and flushes intermediate state when the guard reports pressure.
    pub fn with_resource_guard(mut self, guard: ResourceGuard) -> Self {
        self.resource_guard = Some(guard);
        self
    }

    /// Enables or disables transparent decompression of archive contents
    /// before scanning (see `process_file`).
    pub fn set_decompress(&mut self, decompress: bool) {
        self.decompress = decompress;
    }

168    fn flush_run(&mut self) -> Result<()> {
169        if self.postings.is_empty() {
170            return Ok(());
171        }
172        let old_postings = std::mem::take(&mut self.postings);
173        let mut sorted: Vec<_> = old_postings.into_iter().collect();
174        sorted.sort_unstable_by_key(|(t, _)| *t);
175
176        let run_path = self.ix_dir.join(format!("shard.ix.run.{}", self.temp_runs.len()));
177        let mut f = BufWriter::new(File::create(&run_path)?);
178
179        for (tri, entries) in sorted {
180            f.write_all(&tri.to_le_bytes())?;
181            f.write_all(&(entries.len() as u32).to_le_bytes())?;
182            for entry in entries {
183                f.write_all(&entry.file_id.to_le_bytes())?;
184                f.write_all(&(entry.offsets.len() as u32).to_le_bytes())?;
185                for off in entry.offsets {
186                    f.write_all(&off.to_le_bytes())?;
187                }
188            }
189        }
190        f.flush()?;
191        
192        self.temp_runs.push(run_path);
193        self.postings_count = 0;
194        Ok(())
195    }
196
197    pub fn build(&mut self) -> Result<PathBuf> {
198        let start = Instant::now();
199        let root = self.root.clone();
200        
201        // LLMOSafe Formal Law: Sensitive filesystem traversal (Root)
202        if root.to_string_lossy() == "/" {
203            tracing::warn!("LLMOSafe Advisory: Indexing root filesystem. Ensure adequate resource guards are in place.");
204        }
205
206        let walker = WalkBuilder::new(&root)
207            .hidden(false)
208            .git_ignore(true)
209            .require_git(false)
210            .add_custom_ignore_filename(".ixignore")
211            .filter_entry(move |entry| {
212                let path = entry.path();
213                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
214                
215                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
216                    && (name == "lost+found" || name == ".git" || name == "node_modules" || 
217                       name == "target" || name == "__pycache__" || name == ".tox" || 
218                       name == ".venv" || name == "venv" || name == ".ix") 
219                {
220                    return false;
221                }
222
223                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
224                    if let Ok(metadata) = entry.metadata()
225                        && metadata.len() > 10 * 1024 * 1024
226                    {
227                        return false;
228                    }
229                    if name == "Cargo.lock" || name == "package-lock.json" || name == "pnpm-lock.yaml" || 
230                       name == "shard.ix" || name == "shard.ix.tmp" || name.starts_with("shard.ix.") 
231                    {
232                        return false;
233                    }
234                }
235
236                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
237                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
238                    match ext {
239                        "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
240                        "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
241                        "zip" | "7z" | "rar" |
242                        "sqlite" | "db" | "bin" => return false,
243                        _ => {}
244                    }
245                    if name.ends_with(".tar.gz") {
246                        return false;
247                    }
248                }
249                true
250            })
251            .build();
252
253        let mut files_processed = 0u64;
254        for entry_res in walker {
255            let entry = match entry_res {
256                Ok(e) => e,
257                Err(e) => {
258                    // Handle KernelError::BacktrackSignaled (-7) during the walk
259                    let backtrack_path = match &e {
260                        ignore::Error::Io(io_err) if io_err.raw_os_error() == Some(-7) => Some(None),
261                        ignore::Error::WithPath { path, err } => {
262                            if let ignore::Error::Io(io_err) = err.as_ref() {
263                                if io_err.raw_os_error() == Some(-7) {
264                                    Some(Some(path.clone()))
265                                } else {
266                                    None
267                                }
268                            } else {
269                                None
270                            }
271                        }
272                        _ => None,
273                    };
274
275                    if let Some(path_opt) = backtrack_path {
276                        tracing::warn!("Immune Memory Triggered: Skipping path due to backtrack signal.");
277                        if let Some(path) = path_opt {
278                            self.dead_ends.push(path);
279                        }
280                    }
281                    continue;
282                }
283            };
284
285            if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
286                self.process_file(entry.path().to_owned())?;
287                files_processed += 1;
288
289                // Resource Guard Check: check every 250 files to prevent OOM
290                if files_processed.is_multiple_of(250) {
291                    if let Some(guard) = &self.resource_guard {
292                        if guard.check().map(|_s: ::llmosafe::Synapse| ()).is_err() {
293                            let _err = guard.check().unwrap_err();
294                            eprintln!("ixd: memory ceiling reached... flushing intermediate chunk ({} files processed)", files_processed);
295                            self.flush_run()?;
296                            continue;
297                        }
298                    } else {
299                        // Fallback to manual RSS limit if no formal guard provided
300                        if let Ok(rss) = Self::current_rss_bytes() 
301                            && rss > 512 * 1024 * 1024 
302                        {
303                            eprintln!("ixd: RSS ceiling reached ({} MB) after {} files — flushing intermediate chunk",
304                                rss / 1024 / 1024, files_processed);
305                            self.flush_run()?;
306                            continue;
307                        }
308                    }
309                }
310            }
311        }
312
313        let output_path = self.serialize()?;
314        tracing::info!("Build completed in {:?}: {:?}", start.elapsed(), self.stats);
315        Ok(output_path)
316    }
317
    /// Incremental update entry point. Currently rebuilds the entire shard;
    /// `_changed_files` is accepted for API compatibility but not yet used.
    pub fn update(&mut self, _changed_files: &[PathBuf]) -> Result<PathBuf> {
        self.build()
    }

    /// Number of files indexed so far.
    pub fn files_len(&self) -> usize {
        self.file_count as usize
    }

    /// Number of unique trigrams in the shard (valid after `serialize` runs).
    pub fn trigrams_len(&self) -> usize {
        self.stats.unique_trigrams as usize
    }

330    /// Returns current process RSS in bytes by reading /proc/self/status.
331    fn current_rss_bytes() -> std::io::Result<u64> {
332        let status = std::fs::read_to_string("/proc/self/status")?;
333        for line in status.lines() {
334            if let Some(rest) = line.strip_prefix("VmRSS:") {
335                let kb: u64 = rest.split_whitespace().next()
336                    .and_then(|s| s.parse().ok())
337                    .unwrap_or(0);
338                return Ok(kb * 1024);
339            }
340        }
341        Ok(0)
342    }
343
344    /// Returns free bytes available on the filesystem containing `path`.
345    fn free_bytes_at(path: &Path) -> std::io::Result<u64> {
346        use std::os::unix::ffi::OsStrExt;
347        let path_c = std::ffi::CString::new(path.as_os_str().as_bytes())
348            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))?;
349        let mut stat: libc::statvfs = unsafe { std::mem::zeroed() };
350        let ret = unsafe { libc::statvfs(path_c.as_ptr(), &mut stat) };
351        if ret != 0 {
352            return Err(std::io::Error::last_os_error());
353        }
354        Ok(stat.f_bavail * stat.f_frsize)
355    }
356
    /// Scans a single file: extracts trigrams, writes its bloom filter,
    /// string-pool path entry and file-table record, and buffers postings.
    ///
    /// Returns `Ok(false)` when the file was skipped (vanished, unreadable,
    /// too large, binary, or rejected by the cognitive guard) and `Ok(true)`
    /// when it was indexed.
    fn process_file(&mut self, path: PathBuf) -> Result<bool> {
        // TOCTOU guard: file may have been deleted between walk and open
        let metadata = match fs::metadata(&path) {
            Ok(m) => m,
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(false),
            Err(e) => return Err(e.into()),
        };
        let size = metadata.len();
        // Modification time in nanoseconds since the epoch; pre-epoch times map to 0.
        let mtime = metadata.modified()?.duration_since(UNIX_EPOCH).map(|d| d.as_nanos() as u64).unwrap_or(0);

        if size > 10 * 1024 * 1024 {
            self.stats.files_skipped_size += 1;
            return Ok(false);
        }

        // NotFound / PermissionDenied are expected races or conditions — skip quietly.
        let file = match File::open(&path) {
            Ok(f) => f,
            Err(e) if e.kind() == std::io::ErrorKind::NotFound
                   || e.kind() == std::io::ErrorKind::PermissionDenied => return Ok(false),
            Err(e) => return Err(e.into()),
        };
        // NOTE(review): mmap of a file that another process truncates can fault;
        // presumably acceptable for this tool — confirm upstream assumptions.
        let mmap = unsafe { Mmap::map(&file)? };

        // Optionally decompress (output capped at 10 MB); otherwise borrow the mapping.
        let raw_data = if self.decompress {
            if let Some(mut reader) = maybe_decompress(&path, &mmap)? {
                let mut buf = Vec::new();
                use std::io::Read;
                reader.by_ref().take(10 * 1024 * 1024).read_to_end(&mut buf)?;
                std::borrow::Cow::Owned(buf)
            } else {
                std::borrow::Cow::Borrowed(&mmap[..])
            }
        } else {
            std::borrow::Cow::Borrowed(&mmap[..])
        };

        let data = &raw_data[..];
        if is_binary(data) {
            self.stats.files_skipped_binary += 1;
            return Ok(false);
        }

        // LLMOSafe Tier 3: Perceptual Sifting (Cognitive Layer)
        // Evaluate file utility and bias (Halo signal)
        let sample_len = data.len().min(2048);
        let sample = String::from_utf8_lossy(&data[..sample_len]);
        let objective = "High-signal source code for semantic indexing";
        let sifted = sift_perceptions(&[sample.as_ref()], objective);

        if let Err(e) = self.cognitive_memory.update(sifted) {
            tracing::warn!("LLMOSafe Cognitive Guard rejection for {}: {:?}", path.display(), e);
            // Skip files that don't pass the safety/utility check (e.g., high bias/halo or high surprise)
            return Ok(false);
        }

        let content_hash = xxhash_rust::xxh64::xxh64(data, 0);
        let pairs = self.extractor.extract_with_offsets(data);

        let file_id = self.file_count;
        self.file_count += 1;

        // Append the path to the string pool: u16 reserved field, u16 length, bytes.
        let path_str = path.to_string_lossy();
        let path_bytes = path_str.as_bytes();
        let path_off = (self.strings_writer.stream_position()?) as u32;
        let path_len = path_bytes.len() as u16;

        self.strings_writer.write_all(&0u16.to_le_bytes())?;
        self.strings_writer.write_all(&path_len.to_le_bytes())?;
        self.strings_writer.write_all(path_bytes)?;

        let mut bloom = BloomFilter::new(256, 5);
        let mut trigram_count = 0u32;

        // `pairs` appears grouped by trigram: walk each equal-trigram group,
        // capping at 20k distinct trigrams per file and 10k offsets per trigram.
        let mut i = 0;
        while i < pairs.len() && trigram_count < 20_000 {
            let tri = pairs[i].0;
            let mut j = i + 1;
            while j < pairs.len() && pairs[j].0 == tri {
                j += 1;
            }

            let take_count = (j - i).min(10_000);
            let offsets: Vec<u32> = pairs[i..i + take_count].iter().map(|p| p.1).collect();

            bloom.insert(tri);
            self.postings.entry(tri).or_default().push(PostingEntry {
                file_id,
                offsets,
            });
            // Rough per-group memory estimate: offsets plus fixed overhead.
            self.postings_count += take_count + 8;

            trigram_count += 1;
            i = j;
        }

        bloom.serialize(&mut self.blooms_writer)?;

        // Fixed-size file-table record. The 260-byte stride assumes
        // BloomFilter::serialize emits exactly 260 bytes per filter — TODO confirm.
        let bloom_offset = file_id * 260;
        self.files_writer.write_all(&file_id.to_le_bytes())?;
        self.files_writer.write_all(&path_off.to_le_bytes())?;
        self.files_writer.write_all(&path_len.to_le_bytes())?;
        self.files_writer.write_all(&[FileStatus::Fresh as u8])?;
        self.files_writer.write_all(&[0u8])?; // reserved/padding byte
        self.files_writer.write_all(&mtime.to_le_bytes())?;
        self.files_writer.write_all(&size.to_le_bytes())?;
        self.files_writer.write_all(&content_hash.to_le_bytes())?;
        self.files_writer.write_all(&trigram_count.to_le_bytes())?;
        self.files_writer.write_all(&bloom_offset.to_le_bytes())?;
        self.files_writer.write_all(&[0u8; 4])?; // reserved tail

        self.stats.files_scanned += 1;
        self.stats.bytes_scanned += size;

        // Flush every 500k entries (~8MB peak RAM) to prevent unbounded HashMap growth.
        // This was the RAM DDOS root cause in v0.1.1 — threshold was 5M (far too high).
        if self.postings_count >= 500_000 {
            self.flush_run()?;
        }

        Ok(true)
    }

    /// Merges all run files and the streamed temp sections into the final
    /// `shard.ix`: header placeholder, file table, posting data, trigram
    /// table, bloom filters, string pool — then the CRC'd header is written
    /// at offset 0 and the file is renamed into place atomically.
    fn serialize(&mut self) -> Result<PathBuf> {
        // Disk space guard: abort if < 100MB free to avoid partial shard writes
        if let Ok(free) = Self::free_bytes_at(&self.ix_dir) {
            const MIN_FREE: u64 = 100 * 1024 * 1024; // 100 MB
            if free < MIN_FREE {
                return Err(crate::error::Error::Io(std::io::Error::other(
                    format!(
                        "insufficient disk space: {} MB free, need ≥100 MB (path: {})",
                        free / 1024 / 1024,
                        self.ix_dir.display()
                    ),
                )));
            }
        }
        // Spill any still-buffered postings so every posting lives in a run file.
        self.flush_run()?;

        self.files_writer.flush()?;
        self.blooms_writer.flush()?;
        self.strings_writer.flush()?;

        // Hierarchical Merge to stay under ulimit
        while self.temp_runs.len() > 128 {
            let mut next_generation = Vec::new();
            for chunk in self.temp_runs.chunks(128) {
                let out_path = self.ix_dir.join(format!("shard.ix.merged.{}.{}", next_generation.len(), SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_micros()));
                self.merge_to_run(chunk, &out_path)?;
                next_generation.push(out_path);
                for p in chunk { let _ = fs::remove_file(p); }
            }
            self.temp_runs = next_generation;
        }

        let tmp_path = self.ix_dir.join("shard.ix.tmp");
        let final_path = self.ix_dir.join("shard.ix");
        let temp_trigrams_path = self.ix_dir.join("shard.ix.tmp.trigrams");

        // Reserve header space up front; the real header is seek-written last.
        let mut f = BufWriter::new(File::create(&tmp_path)?);
        f.write_all(&[0u8; HEADER_SIZE])?;

        // Section: file table (copied from the streamed temp file).
        let file_table_offset = self.align_to_8(&mut f)?;
        let mut files_reader = File::open(self.ix_dir.join("shard.ix.tmp.files"))?;
        std::io::copy(&mut files_reader, &mut f)?;
        let file_table_size = f.stream_position()? - file_table_offset;

        // Section: posting data, produced by a k-way merge of all runs.
        self.align_to_8(&mut f)?;
        let posting_data_offset = f.stream_position()?;

        // Trigram-table rows are buffered in a temp file while posting data
        // is written, then copied in as their own section afterwards.
        let mut trigram_table_writer = BufWriter::new(File::create(&temp_trigrams_path)?);
        let mut global_trigram_count = 0u32;

        let mut runs = Vec::new();
        for path in &self.temp_runs {
            runs.push(RunIterator::new(path)?);
        }

        // Seed the min-heap with the head trigram of every run.
        let mut heap = BinaryHeap::new();
        let mut current_items = vec![None; runs.len()];

        for (i, run) in runs.iter_mut().enumerate() {
            if let Some(item) = run.next_trigram()? {
                heap.push(MergeItem { tri: item.0, run_idx: i });
                current_items[i] = Some(item);
            }
        }

        let mut current_tri: Option<Trigram> = None;
        let mut merged_entries: Vec<PostingEntry> = Vec::new();

        // Pop trigrams in ascending order; equal trigrams from different runs
        // accumulate in `merged_entries` and are written once per trigram.
        while let Some(MergeItem { tri, run_idx }) = heap.pop() {
            if Some(tri) != current_tri {
                if let Some(t) = current_tri {
                    self.write_merged_posting(&mut f, &mut trigram_table_writer, t, posting_data_offset, &mut merged_entries)?;
                    global_trigram_count += 1;
                    merged_entries.clear();
                }
                current_tri = Some(tri);
            }

            let item = current_items[run_idx].take().unwrap();
            merged_entries.extend(item.1);

            // Refill the heap from the run that just yielded an item.
            if let Some(next_item) = runs[run_idx].next_trigram()? {
                heap.push(MergeItem { tri: next_item.0, run_idx });
                current_items[run_idx] = Some(next_item);
            }
        }

        // Flush the final accumulated trigram group.
        if let Some(t) = current_tri {
            self.write_merged_posting(&mut f, &mut trigram_table_writer, t, posting_data_offset, &mut merged_entries)?;
            global_trigram_count += 1;
        }

        self.stats.unique_trigrams = global_trigram_count as u64;
        let posting_data_size = f.stream_position()? - posting_data_offset;

        // Section: trigram table.
        self.align_to_8(&mut f)?;
        let trigram_table_offset = f.stream_position()?;
        trigram_table_writer.flush()?;
        drop(trigram_table_writer);

        let mut trigram_table_file = File::open(&temp_trigrams_path)?;
        std::io::copy(&mut trigram_table_file, &mut f)?;
        let trigram_table_size = f.stream_position()? - trigram_table_offset;

        // Section: bloom filters.
        self.align_to_8(&mut f)?;
        let bloom_offset = f.stream_position()?;
        let mut blooms_reader = File::open(self.ix_dir.join("shard.ix.tmp.blooms"))?;
        std::io::copy(&mut blooms_reader, &mut f)?;
        let bloom_size = f.stream_position()? - bloom_offset;

        // Section: string pool (file paths).
        self.align_to_8(&mut f)?;
        let string_pool_offset = f.stream_position()?;
        let mut strings_reader = File::open(self.ix_dir.join("shard.ix.tmp.strings"))?;
        std::io::copy(&mut strings_reader, &mut f)?;
        let string_pool_size = f.stream_position()? - string_pool_offset;

        // Name index is not emitted yet: record an empty trailing section.
        let name_index_offset = f.stream_position()?;
        let name_index_size = 0u64;

        // Assemble the fixed-size header, CRC the body, write it at offset 0.
        let created_at = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_micros() as u64;
        let mut header_bytes = [0u8; HEADER_SIZE];
        header_bytes[0..4].copy_from_slice(&MAGIC);
        header_bytes[0x04..0x06].copy_from_slice(&VERSION_MAJOR.to_le_bytes());
        header_bytes[0x06..0x08].copy_from_slice(&VERSION_MINOR.to_le_bytes());
        header_bytes[0x08..0x10].copy_from_slice(&(flags::HAS_BLOOM_FILTERS | flags::HAS_CONTENT_HASHES | flags::POSTING_LISTS_CHECKSUMMED).to_le_bytes());
        header_bytes[0x10..0x18].copy_from_slice(&created_at.to_le_bytes());
        header_bytes[0x18..0x20].copy_from_slice(&self.stats.bytes_scanned.to_le_bytes());
        header_bytes[0x20..0x24].copy_from_slice(&self.file_count.to_le_bytes());
        header_bytes[0x24..0x28].copy_from_slice(&(global_trigram_count).to_le_bytes());
        header_bytes[0x28..0x30].copy_from_slice(&file_table_offset.to_le_bytes());
        header_bytes[0x30..0x38].copy_from_slice(&file_table_size.to_le_bytes());
        header_bytes[0x38..0x40].copy_from_slice(&trigram_table_offset.to_le_bytes());
        header_bytes[0x40..0x48].copy_from_slice(&trigram_table_size.to_le_bytes());
        header_bytes[0x48..0x50].copy_from_slice(&posting_data_offset.to_le_bytes());
        header_bytes[0x50..0x58].copy_from_slice(&posting_data_size.to_le_bytes());
        header_bytes[0x58..0x60].copy_from_slice(&bloom_offset.to_le_bytes());
        header_bytes[0x60..0x68].copy_from_slice(&bloom_size.to_le_bytes());
        header_bytes[0x68..0x70].copy_from_slice(&string_pool_offset.to_le_bytes());
        header_bytes[0x70..0x78].copy_from_slice(&string_pool_size.to_le_bytes());
        header_bytes[0x78..0x80].copy_from_slice(&name_index_offset.to_le_bytes());
        header_bytes[0x80..0x88].copy_from_slice(&name_index_size.to_le_bytes());

        // CRC covers the header body up to (not including) the CRC field.
        let crc = crc32c::crc32c(&header_bytes[0..0xF8]);
        header_bytes[0xF8..0xFC].copy_from_slice(&crc.to_le_bytes());

        f.seek(SeekFrom::Start(0))?;
        f.write_all(&header_bytes)?;
        f.flush()?;
        drop(f);

        // Atomic publish: readers only ever see a complete shard.
        fs::rename(&tmp_path, &final_path)?;

        let _ = fs::remove_file(self.ix_dir.join("shard.ix.tmp.files"));
        let _ = fs::remove_file(self.ix_dir.join("shard.ix.tmp.blooms"));
        let _ = fs::remove_file(self.ix_dir.join("shard.ix.tmp.strings"));
        let _ = fs::remove_file(&temp_trigrams_path);
        for path in &self.temp_runs { let _ = fs::remove_file(path); }
        self.temp_runs.clear();

        Ok(final_path)
    }

    /// Merges several run files into a single run at `out_path`, using the
    /// same k-way heap merge as `serialize` but emitting the intermediate
    /// run format (via `write_run_entry`) instead of final shard sections.
    fn merge_to_run(&self, run_paths: &[PathBuf], out_path: &Path) -> Result<()> {
        let mut runs = Vec::new();
        for path in run_paths { runs.push(RunIterator::new(path)?); }
        // Seed the min-heap with the head trigram of every input run.
        let mut heap = BinaryHeap::new();
        let mut current_items = vec![None; runs.len()];
        for (i, run) in runs.iter_mut().enumerate() {
            if let Some(item) = run.next_trigram()? {
                heap.push(MergeItem { tri: item.0, run_idx: i });
                current_items[i] = Some(item);
            }
        }
        let mut out = BufWriter::new(File::create(out_path)?);
        let mut current_tri: Option<Trigram> = None;
        let mut merged_entries: Vec<PostingEntry> = Vec::new();
        // Accumulate entries for equal trigrams; write each group once when
        // the next distinct trigram appears.
        while let Some(MergeItem { tri, run_idx }) = heap.pop() {
            if Some(tri) != current_tri {
                if let Some(t) = current_tri {
                    self.write_run_entry(&mut out, t, &mut merged_entries)?;
                    merged_entries.clear();
                }
                current_tri = Some(tri);
            }
            let item = current_items[run_idx].take().unwrap();
            merged_entries.extend(item.1);
            // Refill from the run that just yielded an item.
            if let Some(next_item) = runs[run_idx].next_trigram()? {
                heap.push(MergeItem { tri: next_item.0, run_idx });
                current_items[run_idx] = Some(next_item);
            }
        }
        // Flush the trailing group.
        if let Some(t) = current_tri { self.write_run_entry(&mut out, t, &mut merged_entries)?; }
        out.flush()?;
        Ok(())
    }

675    fn write_run_entry<W: Write>(&self, w: &mut W, tri: Trigram, entries: &mut [PostingEntry]) -> Result<()> {
676        entries.sort_by_key(|e| e.file_id);
677        w.write_all(&tri.to_le_bytes())?;
678        w.write_all(&(entries.len() as u32).to_le_bytes())?;
679        for entry in entries {
680            w.write_all(&entry.file_id.to_le_bytes())?;
681            w.write_all(&(entry.offsets.len() as u32).to_le_bytes())?;
682            for off in &entry.offsets { w.write_all(&off.to_le_bytes())?; }
683        }
684        Ok(())
685    }
686
687    fn write_merged_posting<W: Write + Seek>(&self, f: &mut W, table: &mut W, tri: Trigram, base_off: u64, entries: &mut [PostingEntry]) -> Result<()> {
688        entries.sort_by_key(|e| e.file_id);
689        let count = entries.len() as u32;
690        let list = PostingList { entries: entries.to_vec() };
691        let encoded = list.encode();
692        let offset = f.stream_position()? - base_off;
693        f.write_all(&encoded)?;
694        let abs_off = base_off + offset;
695        table.write_all(&tri.to_le_bytes())?;
696        table.write_all(&abs_off.to_le_bytes()[..6])?;
697        table.write_all(&(encoded.len() as u32).to_le_bytes())?;
698        table.write_all(&count.to_le_bytes())?;
699        table.write_all(&[0u8; 2])?;
700        Ok(())
701    }
702
703    fn align_to_8<W: Write + Seek>(&self, mut w: W) -> std::io::Result<u64> {
704        let pos = w.stream_position()?;
705        let padding = (8 - (pos % 8)) % 8;
706        if padding > 0 { w.write_all(&vec![0u8; padding as usize])?; }
707        w.stream_position()
708    }
709}