Skip to main content

reflex/
trigram.rs

1//! Trigram-based inverted index for fast full-text code search
2//!
3//! This module implements the core trigram indexing algorithm used by Reflex.
4//! A trigram is a sequence of 3 consecutive bytes. By building an inverted index
5//! mapping trigrams to file locations, we can quickly narrow down search candidates
6//! and achieve sub-100ms query times even on large codebases.
7//!
8//! # Algorithm
9//!
10//! 1. **Indexing**: Extract all trigrams from each file, store locations
11//! 2. **Querying**: Extract trigrams from query, intersect posting lists
12//! 3. **Verification**: Check actual matches at candidate locations
13//!
14//! See `.context/TRIGRAM_RESEARCH.md` for detailed algorithm documentation.
15
16use anyhow::{Context, Result};
17use rkyv::{Archive, Deserialize, Serialize};
18use std::collections::HashMap;
19use std::fs::{File, OpenOptions};
20use std::io::Write;
21use std::path::{Path, PathBuf};
22
23/// A trigram is 3 consecutive bytes, packed into a u32 for efficient hashing
24pub type Trigram = u32;
25
26// Binary format constants for trigrams.bin
27const MAGIC: &[u8; 4] = b"RFTG"; // ReFlex TriGrams
28const VERSION: u32 = 3; // V3: No filtering, lazy loading with directory + data separation
29// Header: magic(4) + version(4) + num_trigrams(8) + num_files(8) = 24 bytes
30#[allow(dead_code)]
31const HEADER_SIZE: usize = 24;
32
33/// Write a u32 as a varint (variable-length integer)
34/// Uses 1-5 bytes depending on magnitude (smaller numbers = fewer bytes)
35fn write_varint(writer: &mut impl Write, mut value: u32) -> std::io::Result<()> {
36    loop {
37        let mut byte = (value & 0x7F) as u8;
38        value >>= 7;
39        if value != 0 {
40            byte |= 0x80; // Set continuation bit
41        }
42        writer.write_all(&[byte])?;
43        if value == 0 {
44            break;
45        }
46    }
47    Ok(())
48}
49
50/// Read a varint from a byte slice, returns (value, bytes_consumed)
51fn read_varint(data: &[u8]) -> Result<(u32, usize)> {
52    let mut value: u32 = 0;
53    let mut shift = 0;
54    let mut pos = 0;
55
56    loop {
57        if pos >= data.len() {
58            anyhow::bail!("Truncated varint");
59        }
60        let byte = data[pos];
61        pos += 1;
62
63        value |= ((byte & 0x7F) as u32) << shift;
64        if byte & 0x80 == 0 {
65            break;
66        }
67        shift += 7;
68        if shift >= 32 {
69            anyhow::bail!("Varint too large");
70        }
71    }
72
73    Ok((value, pos))
74}
75
76/// Decompress a posting list from memory-mapped data
77///
78/// Reads a compressed posting list (delta+varint encoded) from the given offset
79/// and decompresses it into a Vec<FileLocation>.
80///
81/// # Arguments
82/// * `mmap` - Memory-mapped file data
83/// * `offset` - Absolute byte offset where compressed data starts
84/// * `size` - Number of bytes to read
85fn decompress_posting_list(
86    mmap: &[u8],
87    offset: u64,
88    size: u32,
89) -> Result<Vec<FileLocation>> {
90    let start = offset as usize;
91    let end = start + size as usize;
92
93    if end > mmap.len() {
94        anyhow::bail!(
95            "Posting list out of bounds: offset={}, size={}, mmap_len={}",
96            offset,
97            size,
98            mmap.len()
99        );
100    }
101
102    let compressed_data = &mmap[start..end];
103
104    // Decompress delta-encoded posting list
105    let mut locations = Vec::new();
106    let mut pos = 0;
107    let mut prev_file_id = 0u32;
108    let mut prev_line_no = 0u32;
109    let mut prev_byte_offset = 0u32;
110
111    while pos < compressed_data.len() {
112        // Read file_id delta
113        let (file_id_delta, consumed) = read_varint(&compressed_data[pos..])?;
114        pos += consumed;
115
116        // Read line_no delta
117        let (line_no_delta, consumed) = read_varint(&compressed_data[pos..])?;
118        pos += consumed;
119
120        // Read byte_offset delta
121        let (byte_offset_delta, consumed) = read_varint(&compressed_data[pos..])?;
122        pos += consumed;
123
124        // Reconstruct absolute values from deltas
125        let file_id = prev_file_id.wrapping_add(file_id_delta);
126        let line_no = prev_line_no.wrapping_add(line_no_delta);
127        let byte_offset = prev_byte_offset.wrapping_add(byte_offset_delta);
128
129        locations.push(FileLocation {
130            file_id,
131            line_no,
132            byte_offset,
133        });
134
135        // Update previous values for next delta
136        prev_file_id = file_id;
137        prev_line_no = line_no;
138        prev_byte_offset = byte_offset;
139    }
140
141    Ok(locations)
142}
143
144/// Location of a trigram occurrence in the codebase
145#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Archive, Serialize, Deserialize)]
146pub struct FileLocation {
147    /// File ID (index into file list)
148    pub file_id: u32,
149    /// Line number (1-indexed)
150    pub line_no: u32,
151    /// Byte offset in file (for context extraction)
152    pub byte_offset: u32,
153}
154
155impl FileLocation {
156    pub fn new(file_id: u32, line_no: u32, byte_offset: u32) -> Self {
157        Self {
158            file_id,
159            line_no,
160            byte_offset,
161        }
162    }
163}
164
165/// Serializable trigram data (for rkyv zero-copy serialization)
166#[derive(Archive, Serialize, Deserialize)]
167struct TrigramData {
168    /// Inverted index: trigram → sorted locations
169    index: Vec<(Trigram, Vec<FileLocation>)>,
170    /// File ID to file path mapping
171    files: Vec<String>,
172}
173
174/// Directory entry for lazy-loaded trigram index
175///
176/// Maps each trigram to its compressed posting list location in the data section.
177/// Total size: 16 bytes per entry (4 + 8 + 4)
178#[derive(Debug, Clone)]
179struct DirectoryEntry {
180    /// The trigram value (for binary search)
181    trigram: Trigram,
182    /// Absolute byte offset in the file where compressed data starts
183    data_offset: u64,
184    /// Size of compressed posting list in bytes
185    compressed_size: u32,
186}
187
188/// Trigram-based inverted index
189///
190/// Maps each trigram to a sorted list of locations where it appears.
191/// Posting lists are kept sorted by (file_id, line_no) for efficient intersection.
192/// The index itself is kept sorted by trigram for O(log n) binary search.
193///
194/// Supports three modes:
195/// 1. **In-memory mode** (during indexing): All posting lists in RAM
196/// 2. **Batch-flush mode** (large codebases): Periodically flushes partial indices to disk to limit RAM
197/// 3. **Lazy-loaded mode** (after loading): Compressed posting lists in mmap, decompressed on-demand
198pub struct TrigramIndex {
199    /// Inverted index: sorted Vec of (trigram, locations) for binary search
200    /// Used in in-memory mode (during indexing)
201    index: Vec<(Trigram, Vec<FileLocation>)>,
202    /// File ID to file path mapping
203    files: Vec<PathBuf>,
204    /// Temporary HashMap used during batch indexing (None when finalized)
205    temp_index: Option<HashMap<Trigram, Vec<FileLocation>>>,
206    /// Memory-mapped index file (for lazy loading)
207    mmap: Option<memmap2::Mmap>,
208    /// Directory of (trigram, offset, size) for lazy loading
209    directory: Vec<DirectoryEntry>,
210    /// Partial index files created during batch flushing (for k-way merge at finalize)
211    partial_indices: Vec<PathBuf>,
212    /// Temporary directory for partial indices
213    temp_dir: Option<PathBuf>,
214}
215
216impl TrigramIndex {
217    /// Create a new empty trigram index
218    pub fn new() -> Self {
219        Self {
220            index: Vec::new(),
221            files: Vec::new(),
222            temp_index: Some(HashMap::new()),
223            mmap: None,
224            directory: Vec::new(),
225            partial_indices: Vec::new(),
226            temp_dir: None,
227        }
228    }
229
230    /// Enable batch-flush mode for large codebases
231    ///
232    /// Creates a temporary directory for partial indices that will be merged at finalize().
233    /// Call this before indexing to enable memory-efficient indexing for huge codebases.
234    pub fn enable_batch_flush(&mut self, temp_dir: PathBuf) -> Result<()> {
235        std::fs::create_dir_all(&temp_dir)
236            .context("Failed to create temp directory for batch flushing")?;
237        self.temp_dir = Some(temp_dir);
238        log::info!("Enabled batch-flush mode for trigram index");
239        Ok(())
240    }
241
242    /// Flush current temp_index to a partial index file
243    ///
244    /// This clears the in-memory HashMap and writes a sorted partial index to disk.
245    /// Called periodically during indexing to limit memory usage.
246    pub fn flush_batch(&mut self) -> Result<()> {
247        let temp_dir = self.temp_dir.as_ref()
248            .ok_or_else(|| anyhow::anyhow!("Batch flush not enabled - call enable_batch_flush() first"))?;
249
250        // Take ownership of temp_index to finalize it
251        let temp_map = self.temp_index.take()
252            .ok_or_else(|| anyhow::anyhow!("No temp index to flush"))?;
253
254        if temp_map.is_empty() {
255            // Nothing to flush, restore empty map
256            self.temp_index = Some(HashMap::new());
257            return Ok(());
258        }
259
260        // Convert HashMap to sorted Vec
261        let mut partial_index: Vec<(Trigram, Vec<FileLocation>)> = temp_map.into_iter().collect();
262
263        // Sort and deduplicate posting lists
264        for (_, list) in partial_index.iter_mut() {
265            list.sort_unstable();
266            list.dedup();
267        }
268
269        // Sort by trigram
270        partial_index.sort_unstable_by_key(|(trigram, _)| *trigram);
271
272        // Write to temp file
273        let partial_file = temp_dir.join(format!("partial_{}.bin", self.partial_indices.len()));
274        self.write_partial_index(&partial_file, &partial_index)?;
275
276        self.partial_indices.push(partial_file);
277
278        // Create new empty temp_index for next batch
279        self.temp_index = Some(HashMap::new());
280
281        log::debug!(
282            "Flushed batch {} with {} trigrams to disk",
283            self.partial_indices.len(),
284            partial_index.len()
285        );
286
287        Ok(())
288    }
289
290    /// Write a partial index to disk (simplified format for merging)
291    fn write_partial_index(
292        &self,
293        path: &Path,
294        index: &[(Trigram, Vec<FileLocation>)],
295    ) -> Result<()> {
296        use std::io::BufWriter;
297
298        let file = OpenOptions::new()
299            .create(true)
300            .write(true)
301            .truncate(true)
302            .open(path)?;
303
304        let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
305
306        // Write number of trigrams
307        writer.write_all(&(index.len() as u64).to_le_bytes())?;
308
309        // Write each (trigram, posting_list)
310        for (trigram, locations) in index {
311            writer.write_all(&trigram.to_le_bytes())?;
312            writer.write_all(&(locations.len() as u32).to_le_bytes())?;
313
314            for loc in locations {
315                writer.write_all(&loc.file_id.to_le_bytes())?;
316                writer.write_all(&loc.line_no.to_le_bytes())?;
317                writer.write_all(&loc.byte_offset.to_le_bytes())?;
318            }
319        }
320
321        writer.flush()?;
322        Ok(())
323    }
324
325    /// Add a file to the index and return its file_id
326    pub fn add_file(&mut self, path: PathBuf) -> u32 {
327        let file_id = self.files.len() as u32;
328        self.files.push(path);
329        file_id
330    }
331
332    /// Get file path for a file_id
333    pub fn get_file(&self, file_id: u32) -> Option<&PathBuf> {
334        self.files.get(file_id as usize)
335    }
336
337    /// Get total number of files
338    pub fn file_count(&self) -> usize {
339        self.files.len()
340    }
341
342    /// Get total number of unique trigrams
343    pub fn trigram_count(&self) -> usize {
344        if !self.directory.is_empty() {
345            // Lazy-loaded mode
346            self.directory.len()
347        } else {
348            // In-memory mode
349            self.index.len()
350        }
351    }
352
353    /// Index a file's content
354    ///
355    /// Extracts all trigrams from the content and adds them to the inverted index.
356    /// Must call finalize() after indexing all files to prepare for searching.
357    pub fn index_file(&mut self, file_id: u32, content: &str) {
358        let trigrams = extract_trigrams_with_locations(content, file_id);
359
360        // Use the persistent HashMap for O(1) updates during batch processing
361        if let Some(ref mut temp_map) = self.temp_index {
362            for (trigram, location) in trigrams {
363                temp_map
364                    .entry(trigram)
365                    .or_insert_with(Vec::new)
366                    .push(location);
367            }
368        } else {
369            panic!("Cannot call index_file() after finalize(). Index is read-only.");
370        }
371    }
372
373    /// Build index from a collection of pre-extracted trigrams (bulk operation)
374    ///
375    /// This is much more efficient than calling index_file() multiple times,
376    /// as it builds the HashMap once instead of rebuilding it for each file.
377    pub fn build_from_trigrams(&mut self, trigrams: Vec<(Trigram, FileLocation)>) {
378        let mut temp_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
379
380        // Group trigrams into posting lists
381        for (trigram, location) in trigrams {
382            temp_map
383                .entry(trigram)
384                .or_insert_with(Vec::new)
385                .push(location);
386        }
387
388        // Convert to sorted Vec for binary search
389        self.index = temp_map.into_iter().collect();
390
391        // Clear temp_index since we're using the Vec directly
392        self.temp_index = None;
393
394        // Finalize immediately (sort and deduplicate)
395        self.finalize();
396    }
397
398    /// Finalize the index by sorting all posting lists and the index itself
399    ///
400    /// Must be called after all files are indexed, before querying.
401    /// Converts the HashMap to a sorted Vec for fast binary search.
402    ///
403    /// If batch flushing was enabled, finalization will be deferred until write()
404    /// is called, which will perform streaming merge directly to disk.
405    pub fn finalize(&mut self) {
406        // If we have partial indices from batch flushing, DON'T merge yet
407        // We'll do streaming merge in write() or write_with_streaming_merge()
408        if !self.partial_indices.is_empty() {
409            log::info!("Deferring finalization - will stream merge {} partial indices during write()",
410                       self.partial_indices.len());
411
412            // Flush final batch if temp_index is not empty
413            if let Some(ref temp_map) = self.temp_index {
414                if !temp_map.is_empty() {
415                    self.flush_batch().expect("Failed to flush final batch");
416                }
417            }
418
419            // Don't merge yet - write() will handle it
420            return;
421        }
422
423        // Standard finalization (no batch flushing)
424        // Convert HashMap to Vec if we have a temp index
425        if let Some(temp_map) = self.temp_index.take() {
426            self.index = temp_map.into_iter().collect();
427        }
428
429        // Sort and deduplicate posting lists
430        for (_, list) in self.index.iter_mut() {
431            list.sort_unstable();
432            list.dedup(); // Remove duplicates (same trigram appearing multiple times on same line)
433        }
434
435        // Sort the index by trigram for binary search
436        self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
437    }
438
439    /// Merge all partial indices directly to trigrams.bin using streaming k-way merge
440    ///
441    /// This avoids loading the entire index into RAM by:
442    /// 1. Opening all partial index files as readers
443    /// 2. Performing k-way merge using a priority queue
444    /// 3. Writing compressed posting lists directly to disk
445    /// 4. Never accumulating more than K posting lists in memory at once
446    fn merge_partial_indices_to_file(&mut self, output_path: &Path) -> Result<()> {
447        use std::io::{BufReader, BufWriter, Read};
448        use std::cmp::Ordering;
449        use std::collections::BinaryHeap;
450
451        log::info!("Streaming merge of {} partial indices to {:?}",
452                   self.partial_indices.len(), output_path);
453
454        // Open all partial indices as buffered readers
455        struct PartialIndexReader {
456            reader: BufReader<File>,
457            current_trigram: Option<Trigram>,
458            current_posting_list: Vec<FileLocation>,
459            reader_id: usize,
460        }
461
462        let mut readers: Vec<PartialIndexReader> = Vec::new();
463
464        for (idx, partial_path) in self.partial_indices.iter().enumerate() {
465            let file = File::open(partial_path)
466                .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
467            let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
468
469            // Read number of trigrams (we don't need it for streaming merge)
470            let mut buf = [0u8; 8];
471            reader.read_exact(&mut buf)?;
472
473            readers.push(PartialIndexReader {
474                reader,
475                current_trigram: None,
476                current_posting_list: Vec::new(),
477                reader_id: idx,
478            });
479        }
480
481        // Helper to read next trigram from a reader
482        fn read_next_trigram(reader: &mut PartialIndexReader) -> Result<bool> {
483            // Try to read trigram
484            let mut trigram_buf = [0u8; 4];
485            match reader.reader.read_exact(&mut trigram_buf) {
486                Ok(_) => {
487                    let trigram = u32::from_le_bytes(trigram_buf);
488
489                    // Read posting list size
490                    let mut len_buf = [0u8; 4];
491                    reader.reader.read_exact(&mut len_buf)?;
492                    let list_len = u32::from_le_bytes(len_buf) as usize;
493
494                    // Read all locations for this trigram
495                    let mut locations = Vec::with_capacity(list_len);
496                    for _ in 0..list_len {
497                        let mut loc_buf = [0u8; 12];
498                        reader.reader.read_exact(&mut loc_buf)?;
499
500                        let file_id = u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
501                        let line_no = u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
502                        let byte_offset = u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
503
504                        locations.push(FileLocation { file_id, line_no, byte_offset });
505                    }
506
507                    reader.current_trigram = Some(trigram);
508                    reader.current_posting_list = locations;
509                    Ok(true)
510                }
511                Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
512                    reader.current_trigram = None;
513                    Ok(false)
514                }
515                Err(e) => Err(e.into()),
516            }
517        }
518
519        // Initialize: read first trigram from each reader
520        for reader in &mut readers {
521            read_next_trigram(reader)?;
522        }
523
524        // Priority queue entry for k-way merge
525        #[derive(Eq, PartialEq)]
526        struct HeapEntry {
527            trigram: Trigram,
528            reader_id: usize,
529        }
530
531        impl Ord for HeapEntry {
532            fn cmp(&self, other: &Self) -> Ordering {
533                // Reverse for min-heap
534                other.trigram.cmp(&self.trigram)
535                    .then_with(|| other.reader_id.cmp(&self.reader_id))
536            }
537        }
538
539        impl PartialOrd for HeapEntry {
540            fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
541                Some(self.cmp(other))
542            }
543        }
544
545        // Build initial heap
546        let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::new();
547        for reader in &readers {
548            if let Some(trigram) = reader.current_trigram {
549                heap.push(HeapEntry {
550                    trigram,
551                    reader_id: reader.reader_id,
552                });
553            }
554        }
555
556        // Open output file for writing
557        let file = OpenOptions::new()
558            .create(true)
559            .write(true)
560            .truncate(true)
561            .open(output_path)
562            .with_context(|| format!("Failed to create {}", output_path.display()))?;
563
564        let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
565
566        // Write placeholder header (we'll update it at the end)
567        writer.write_all(MAGIC)?;
568        writer.write_all(&VERSION.to_le_bytes())?;
569        writer.write_all(&0u64.to_le_bytes())?; // num_trigrams (placeholder)
570        writer.write_all(&(self.files.len() as u64).to_le_bytes())?; // num_files
571
572        // We'll build the directory as we go
573        let mut directory: Vec<DirectoryEntry> = Vec::new();
574        let mut num_trigrams = 0u64;
575
576        // K-way merge loop
577        let mut current_trigram: Option<Trigram> = None;
578        let mut merged_locations: Vec<FileLocation> = Vec::new();
579
580        while let Some(entry) = heap.pop() {
581            let reader = &mut readers[entry.reader_id];
582
583            // If this is a new trigram, write the previous one
584            if current_trigram.is_some() && current_trigram != Some(entry.trigram) {
585                // Write the accumulated posting list for current_trigram
586                let trigram = current_trigram.unwrap();
587                merged_locations.sort_unstable();
588                merged_locations.dedup();
589
590                // Compress and write this trigram's posting list
591                let data_offset = writer.stream_position()?;
592                let compressed_size = self.write_compressed_posting_list(&mut writer, &merged_locations)?;
593
594                directory.push(DirectoryEntry {
595                    trigram,
596                    data_offset,
597                    compressed_size,
598                });
599
600                num_trigrams += 1;
601                merged_locations.clear();
602            }
603
604            // Set current trigram
605            current_trigram = Some(entry.trigram);
606
607            // Merge this reader's posting list into accumulated list
608            merged_locations.extend_from_slice(&reader.current_posting_list);
609
610            // Advance this reader to next trigram
611            if read_next_trigram(reader)? {
612                if let Some(next_trigram) = reader.current_trigram {
613                    heap.push(HeapEntry {
614                        trigram: next_trigram,
615                        reader_id: entry.reader_id,
616                    });
617                }
618            }
619        }
620
621        // Write final trigram
622        if let Some(trigram) = current_trigram {
623            merged_locations.sort_unstable();
624            merged_locations.dedup();
625
626            let data_offset = writer.stream_position()?;
627            let compressed_size = self.write_compressed_posting_list(&mut writer, &merged_locations)?;
628
629            directory.push(DirectoryEntry {
630                trigram,
631                data_offset,
632                compressed_size,
633            });
634
635            num_trigrams += 1;
636        }
637
638        log::info!("Merged {} trigrams from {} partial indices", num_trigrams, self.partial_indices.len());
639
640        // Remember where data section ended (not used but kept for clarity)
641        let _data_end_pos = writer.stream_position()?;
642
643        // Write file paths after data section
644        for file_path in &self.files {
645            let path_str = file_path.to_string_lossy();
646            let path_bytes = path_str.as_bytes();
647            write_varint(&mut writer, path_bytes.len() as u32)?;
648            writer.write_all(path_bytes)?;
649        }
650
651        // Flush before we rewrite the beginning
652        writer.flush()?;
653        drop(writer);
654
655        // Now we need to insert the directory at the beginning
656        // We'll read the data+files we just wrote, then rewrite the file with directory in between
657        use std::io::{Seek, SeekFrom};
658
659        // Read data and files sections
660        let mut temp_data = Vec::new();
661        {
662            let mut file = File::open(output_path)?;
663            file.seek(SeekFrom::Start(HEADER_SIZE as u64))?;
664            file.read_to_end(&mut temp_data)?;
665        }
666
667        // Rewrite file with correct structure
668        let file = OpenOptions::new().write(true).truncate(true).open(output_path)?;
669        let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
670
671        // Write header with correct num_trigrams
672        writer.write_all(MAGIC)?;
673        writer.write_all(&VERSION.to_le_bytes())?;
674        writer.write_all(&num_trigrams.to_le_bytes())?;
675        writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
676
677        // Write directory
678        for entry in &directory {
679            writer.write_all(&entry.trigram.to_le_bytes())?;
680            // Adjust data offset to account for directory size
681            let adjusted_offset = entry.data_offset + (directory.len() * 16) as u64;
682            writer.write_all(&adjusted_offset.to_le_bytes())?;
683            writer.write_all(&entry.compressed_size.to_le_bytes())?;
684        }
685
686        // Write data and files sections
687        writer.write_all(&temp_data)?;
688
689        // Flush and sync
690        writer.flush()?;
691        writer.get_ref().sync_all()?;
692
693        // Clean up partial index files
694        for partial_path in &self.partial_indices {
695            let _ = std::fs::remove_file(partial_path);
696        }
697        if let Some(ref temp_dir) = self.temp_dir {
698            let _ = std::fs::remove_dir(temp_dir);
699        }
700
701        log::info!("Wrote {} trigrams to {:?}", num_trigrams, output_path);
702
703        Ok(())
704    }
705
706    /// Write a compressed posting list to the writer and return the compressed size
707    fn write_compressed_posting_list(
708        &self,
709        writer: &mut impl Write,
710        locations: &[FileLocation],
711    ) -> Result<u32> {
712        let mut compressed = Vec::new();
713
714        // Compress posting list using delta+varint encoding
715        let mut prev_file_id = 0u32;
716        let mut prev_line_no = 0u32;
717        let mut prev_byte_offset = 0u32;
718
719        for loc in locations {
720            // Compute deltas
721            let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
722            let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
723            let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
724
725            // Write deltas as varints
726            write_varint(&mut compressed, file_id_delta)?;
727            write_varint(&mut compressed, line_no_delta)?;
728            write_varint(&mut compressed, byte_offset_delta)?;
729
730            // Update previous values
731            prev_file_id = loc.file_id;
732            prev_line_no = loc.line_no;
733            prev_byte_offset = loc.byte_offset;
734        }
735
736        let compressed_size = compressed.len() as u32;
737        writer.write_all(&compressed)?;
738
739        Ok(compressed_size)
740    }
741
742    /// Merge all partial indices into self.index (old in-memory approach - deprecated)
743    #[allow(dead_code)]
744    fn merge_partial_indices(&mut self) -> Result<()> {
745        use std::io::{BufReader, Read};
746
747        // Read all partial indices into memory (simplified approach for now)
748        let mut all_entries: Vec<(Trigram, FileLocation)> = Vec::new();
749
750        for partial_path in &self.partial_indices {
751            let file = File::open(partial_path)
752                .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
753            let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
754
755            // Read number of trigrams
756            let mut buf = [0u8; 8];
757            reader.read_exact(&mut buf)?;
758            let num_trigrams = u64::from_le_bytes(buf) as usize;
759
760            // Read each (trigram, posting_list)
761            for _ in 0..num_trigrams {
762                // Read trigram
763                let mut trigram_buf = [0u8; 4];
764                reader.read_exact(&mut trigram_buf)?;
765                let trigram = u32::from_le_bytes(trigram_buf);
766
767                // Read posting list size
768                let mut len_buf = [0u8; 4];
769                reader.read_exact(&mut len_buf)?;
770                let list_len = u32::from_le_bytes(len_buf) as usize;
771
772                // Read all locations
773                for _ in 0..list_len {
774                    let mut loc_buf = [0u8; 12]; // 3 * u32
775                    reader.read_exact(&mut loc_buf)?;
776
777                    let file_id = u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
778                    let line_no = u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
779                    let byte_offset = u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
780
781                    all_entries.push((trigram, FileLocation { file_id, line_no, byte_offset }));
782                }
783            }
784        }
785
786        log::info!("Read {} total trigram entries from {} partial indices",
787                   all_entries.len(), self.partial_indices.len());
788
789        // Group by trigram
790        let mut index_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
791        for (trigram, location) in all_entries {
792            index_map
793                .entry(trigram)
794                .or_insert_with(Vec::new)
795                .push(location);
796        }
797
798        // Convert to sorted vec
799        self.index = index_map.into_iter().collect();
800
801        // Sort and deduplicate posting lists
802        for (_, list) in self.index.iter_mut() {
803            list.sort_unstable();
804            list.dedup();
805        }
806
807        // Sort by trigram
808        self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
809
810        // Clean up partial index files
811        for partial_path in &self.partial_indices {
812            let _ = std::fs::remove_file(partial_path);
813        }
814        if let Some(ref temp_dir) = self.temp_dir {
815            let _ = std::fs::remove_dir(temp_dir);
816        }
817
818        log::info!("Merged into final index with {} trigrams", self.index.len());
819
820        Ok(())
821    }
822
823    /// Search for a plain text pattern
824    ///
825    /// Returns candidate file locations that could contain the pattern.
826    /// Caller must verify actual matches.
827    ///
828    /// In lazy-loaded mode: Decompresses posting lists on-demand from mmap.
829    /// In in-memory mode: Uses pre-loaded posting lists.
830    pub fn search(&self, pattern: &str) -> Vec<FileLocation> {
831        if pattern.len() < 3 {
832            // Pattern too short for trigrams - caller must fall back to full scan
833            return vec![];
834        }
835
836        let trigrams = extract_trigrams(pattern);
837        if trigrams.is_empty() {
838            return vec![];
839        }
840
841        // Check if we're in lazy-loaded mode or in-memory mode
842        if let Some(ref mmap) = self.mmap {
843            // Lazy-loaded mode: decompress posting lists on-demand
844            let mut posting_lists: Vec<Vec<FileLocation>> = Vec::new();
845
846            for trigram in &trigrams {
847                // Binary search directory for this trigram
848                match self.directory.binary_search_by_key(trigram, |e| e.trigram) {
849                    Ok(idx) => {
850                        let entry = &self.directory[idx];
851                        // Decompress this posting list on-demand
852                        match decompress_posting_list(mmap, entry.data_offset, entry.compressed_size) {
853                            Ok(locations) => posting_lists.push(locations),
854                            Err(e) => {
855                                log::warn!("Failed to decompress posting list for trigram {}: {}", trigram, e);
856                                return vec![];
857                            }
858                        }
859                    }
860                    Err(_) => {
861                        // Trigram not found - pattern cannot match
862                        return vec![];
863                    }
864                }
865            }
866
867            if posting_lists.is_empty() || posting_lists.len() < trigrams.len() {
868                return vec![];
869            }
870
871            // Sort by list size (smallest first for efficient intersection)
872            posting_lists.sort_by_key(|list| list.len());
873
874            // Intersect posting lists (owned version)
875            intersect_by_file_owned(&posting_lists)
876        } else {
877            // In-memory mode: use pre-loaded index
878            let mut posting_lists: Vec<&Vec<FileLocation>> = trigrams
879                .iter()
880                .filter_map(|t| {
881                    self.index
882                        .binary_search_by_key(t, |(trigram, _)| *trigram)
883                        .ok()
884                        .map(|idx| &self.index[idx].1)
885                })
886                .collect();
887
888            if posting_lists.is_empty() {
889                return vec![];
890            }
891
892            if posting_lists.len() < trigrams.len() {
893                // Some trigrams missing - pattern cannot match
894                return vec![];
895            }
896
897            // Sort by list size (smallest first for efficient intersection)
898            posting_lists.sort_by_key(|list| list.len());
899
900            // Intersect posting lists (reference version)
901            intersect_by_file(&posting_lists)
902        }
903    }
904
905    /// Get posting list for a specific trigram (for debugging)
906    pub fn get_posting_list(&self, trigram: Trigram) -> Option<&Vec<FileLocation>> {
907        self.index
908            .binary_search_by_key(&trigram, |(t, _)| *t)
909            .ok()
910            .map(|idx| &self.index[idx].1)
911    }
912
913    /// Write the trigram index to disk
914    ///
915    /// Binary format V3 (lazy-loadable with directory + data separation):
916    /// - Header (24 bytes): magic, version, num_trigrams, num_files
917    /// - Directory Section (16 bytes per trigram):
918    ///   - trigram: u32 (4 bytes)
919    ///   - data_offset: u64 (8 bytes) - absolute offset in file
920    ///   - compressed_size: u32 (4 bytes) - size of compressed posting list
921    /// - Data Section (variable size):
922    ///   - Compressed posting lists (delta+varint encoded)
923    /// - File Paths Section (variable size):
924    ///   - path_len: varint
925    ///   - path_bytes: [u8; path_len]
926    pub fn write(&mut self, path: impl AsRef<Path>) -> Result<()> {
927        let path = path.as_ref();
928
929        // If we have partial indices from batch flushing, use streaming merge
930        if !self.partial_indices.is_empty() {
931            log::info!("Using streaming merge to write {} partial indices", self.partial_indices.len());
932            return self.merge_partial_indices_to_file(path);
933        }
934
935        // Standard write path (no batch flushing)
936        let file = OpenOptions::new()
937            .create(true)
938            .write(true)
939            .truncate(true)
940            .open(path)
941            .with_context(|| format!("Failed to create {}", path.display()))?;
942
943        // Use a large buffer (16MB) for streaming writes
944        let mut writer = std::io::BufWriter::with_capacity(16 * 1024 * 1024, file);
945
946        // Write header
947        writer.write_all(MAGIC)?;
948        writer.write_all(&VERSION.to_le_bytes())?;
949        writer.write_all(&(self.index.len() as u64).to_le_bytes())?; // num_trigrams
950        writer.write_all(&(self.files.len() as u64).to_le_bytes())?; // num_files
951
952        // Build directory and write compressed data in a single pass
953        let mut directory: Vec<DirectoryEntry> = Vec::with_capacity(self.index.len());
954
955        // Calculate directory start and size
956        let directory_start = HEADER_SIZE as u64;
957        let directory_size = self.index.len() * 16;
958
959        // Reserve space for directory (we'll write it after data)
960        let data_start = directory_start + directory_size as u64;
961        let mut current_offset = data_start;
962
963        // We need to write in the correct order: header, directory, data, file paths
964        // But we need data offsets to write directory
965        // So we compress data first, then write header+directory+data
966
967        // Step 1: Compress all posting lists and track offsets
968        let mut compressed_lists: Vec<(Trigram, Vec<u8>)> = Vec::with_capacity(self.index.len());
969
970        for (trigram, locations) in &self.index {
971            // Compress the posting list
972            let mut compressed = Vec::new();
973            let mut prev_file_id = 0u32;
974            let mut prev_line_no = 0u32;
975            let mut prev_byte_offset = 0u32;
976
977            for loc in locations {
978                let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
979                let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
980                let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
981
982                write_varint(&mut compressed, file_id_delta)?;
983                write_varint(&mut compressed, line_no_delta)?;
984                write_varint(&mut compressed, byte_offset_delta)?;
985
986                prev_file_id = loc.file_id;
987                prev_line_no = loc.line_no;
988                prev_byte_offset = loc.byte_offset;
989            }
990
991            directory.push(DirectoryEntry {
992                trigram: *trigram,
993                data_offset: current_offset,
994                compressed_size: compressed.len() as u32,
995            });
996            current_offset += compressed.len() as u64;
997
998            compressed_lists.push((*trigram, compressed));
999        }
1000
1001        // Step 2: Write directory
1002        for entry in &directory {
1003            writer.write_all(&entry.trigram.to_le_bytes())?;
1004            writer.write_all(&entry.data_offset.to_le_bytes())?;
1005            writer.write_all(&entry.compressed_size.to_le_bytes())?;
1006        }
1007
1008        // Step 3: Write data section (compressed posting lists)
1009        for (_, compressed) in &compressed_lists {
1010            writer.write_all(compressed)?;
1011        }
1012
1013        // Step 4: Write file paths
1014        for file_path in &self.files {
1015            let path_str = file_path.to_string_lossy();
1016            let path_bytes = path_str.as_bytes();
1017            write_varint(&mut writer, path_bytes.len() as u32)?;
1018            writer.write_all(path_bytes)?;
1019        }
1020
1021        // Flush and sync
1022        writer.flush()?;
1023        writer.get_ref().sync_all()?;
1024
1025        log::info!(
1026            "Wrote lazy-loadable trigram index: {} trigrams, {} files to {:?}",
1027            self.index.len(),
1028            self.files.len(),
1029            path
1030        );
1031
1032        Ok(())
1033    }
1034
1035    /// Load trigram index from disk using memory-mapped I/O with lazy loading
1036    ///
1037    /// Binary format V3: Only reads the directory and file paths, keeps posting lists compressed in mmap.
1038    /// Posting lists are decompressed on-demand during search queries.
1039    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
1040        let path = path.as_ref();
1041
1042        let file = File::open(path)
1043            .with_context(|| format!("Failed to open {}", path.display()))?;
1044
1045        // Memory-map the file (keep it alive for lazy access)
1046        let mmap = unsafe {
1047            memmap2::Mmap::map(&file)
1048                .with_context(|| format!("Failed to mmap {}", path.display()))?
1049        };
1050
1051        // Validate header
1052        if mmap.len() < HEADER_SIZE {
1053            anyhow::bail!("trigrams.bin too small (expected at least {} bytes)", HEADER_SIZE);
1054        }
1055
1056        if &mmap[0..4] != MAGIC {
1057            anyhow::bail!("Invalid trigrams.bin (wrong magic bytes)");
1058        }
1059
1060        let version = u32::from_le_bytes([mmap[4], mmap[5], mmap[6], mmap[7]]);
1061        if version != VERSION {
1062            anyhow::bail!(
1063                "Unsupported trigrams.bin version: {} (expected {}). Please re-index with 'reflex index'.",
1064                version, VERSION
1065            );
1066        }
1067
1068        let num_trigrams = u64::from_le_bytes([
1069            mmap[8], mmap[9], mmap[10], mmap[11],
1070            mmap[12], mmap[13], mmap[14], mmap[15],
1071        ]) as usize;
1072
1073        let num_files = u64::from_le_bytes([
1074            mmap[16], mmap[17], mmap[18], mmap[19],
1075            mmap[20], mmap[21], mmap[22], mmap[23],
1076        ]) as usize;
1077
1078        log::debug!("Loading lazy trigram index: {} trigrams, {} files", num_trigrams, num_files);
1079
1080        // Read directory (trigram → offset mappings) - fast, just metadata
1081        let mut directory = Vec::with_capacity(num_trigrams);
1082        let mut pos = HEADER_SIZE;
1083        let directory_size = num_trigrams * 16; // 16 bytes per entry
1084
1085        for _ in 0..num_trigrams {
1086            if pos + 16 > mmap.len() {
1087                anyhow::bail!("Truncated directory entry at pos={}", pos);
1088            }
1089
1090            let trigram = u32::from_le_bytes([
1091                mmap[pos],
1092                mmap[pos + 1],
1093                mmap[pos + 2],
1094                mmap[pos + 3],
1095            ]);
1096            pos += 4;
1097
1098            let data_offset = u64::from_le_bytes([
1099                mmap[pos],
1100                mmap[pos + 1],
1101                mmap[pos + 2],
1102                mmap[pos + 3],
1103                mmap[pos + 4],
1104                mmap[pos + 5],
1105                mmap[pos + 6],
1106                mmap[pos + 7],
1107            ]);
1108            pos += 8;
1109
1110            let compressed_size = u32::from_le_bytes([
1111                mmap[pos],
1112                mmap[pos + 1],
1113                mmap[pos + 2],
1114                mmap[pos + 3],
1115            ]);
1116            pos += 4;
1117
1118            directory.push(DirectoryEntry {
1119                trigram,
1120                data_offset,
1121                compressed_size,
1122            });
1123        }
1124
1125        // Directory is already sorted by trigram (from write())
1126        directory.sort_unstable_by_key(|e| e.trigram);
1127
1128        // Calculate where file paths section starts (after header + directory + data)
1129        let data_section_size: u64 = directory.iter().map(|e| e.compressed_size as u64).sum();
1130        let files_section_offset = HEADER_SIZE + directory_size + data_section_size as usize;
1131        pos = files_section_offset;
1132
1133        // Read file paths (varint-encoded lengths)
1134        let mut files = Vec::with_capacity(num_files);
1135        for _ in 0..num_files {
1136            // Read path length (varint)
1137            let (path_len, consumed) = read_varint(&mmap[pos..])?;
1138            pos += consumed;
1139            let path_len = path_len as usize;
1140
1141            if pos + path_len > mmap.len() {
1142                anyhow::bail!("Truncated file path at pos={}", pos);
1143            }
1144
1145            let path_bytes = &mmap[pos..pos + path_len];
1146            let path_str = std::str::from_utf8(path_bytes)
1147                .context("Invalid UTF-8 in file path")?;
1148            files.push(PathBuf::from(path_str));
1149            pos += path_len;
1150        }
1151
1152        log::info!(
1153            "Loaded lazy trigram index: {} trigrams, {} files (directory: {} KB)",
1154            num_trigrams,
1155            num_files,
1156            directory_size / 1024
1157        );
1158
1159        Ok(Self {
1160            index: Vec::new(),  // Empty in lazy mode
1161            files,
1162            temp_index: None,
1163            mmap: Some(mmap),  // Keep mmap alive for lazy decompression!
1164            directory,
1165            partial_indices: Vec::new(),
1166            temp_dir: None,
1167        })
1168    }
1169}
1170
1171impl Default for TrigramIndex {
1172    fn default() -> Self {
1173        Self::new()
1174    }
1175}
1176
1177/// Extract all trigrams from text
1178///
1179/// Returns a vector of trigrams (without location info).
1180pub fn extract_trigrams(text: &str) -> Vec<Trigram> {
1181    let bytes = text.as_bytes();
1182    let mut trigrams = Vec::new();
1183
1184    for i in 0..bytes.len().saturating_sub(2) {
1185        let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1186        trigrams.push(trigram);
1187    }
1188
1189    trigrams
1190}
1191
1192/// Extract trigrams with file location information
1193///
1194/// Returns a vector of (trigram, location) pairs for building the inverted index.
1195pub fn extract_trigrams_with_locations(text: &str, file_id: u32) -> Vec<(Trigram, FileLocation)> {
1196    let bytes = text.as_bytes();
1197    let mut result = Vec::new();
1198
1199    let mut line_no = 1;
1200
1201    for (i, &byte) in bytes.iter().enumerate() {
1202        // Track newlines
1203        if byte == b'\n' {
1204            line_no += 1;
1205        }
1206
1207        // Extract trigram
1208        if i + 2 < bytes.len() {
1209            let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1210            let location = FileLocation::new(file_id, line_no, i as u32);
1211            result.push((trigram, location));
1212        }
1213    }
1214
1215    result
1216}
1217
1218/// Convert 3 bytes to a trigram (packed u32)
1219#[inline]
1220fn bytes_to_trigram(bytes: &[u8]) -> Trigram {
1221    debug_assert_eq!(bytes.len(), 3);
1222    (bytes[0] as u32) << 16 | (bytes[1] as u32) << 8 | (bytes[2] as u32)
1223}
1224
1225/// Convert trigram back to bytes (for debugging)
1226#[allow(dead_code)]
1227fn trigram_to_bytes(trigram: Trigram) -> [u8; 3] {
1228    [
1229        ((trigram >> 16) & 0xFF) as u8,
1230        ((trigram >> 8) & 0xFF) as u8,
1231        (trigram & 0xFF) as u8,
1232    ]
1233}
1234
1235/// Intersect posting lists by (file_id, line_no) pairs
1236///
1237/// Returns locations where ALL trigrams appear on the SAME line (not just in the same file).
1238/// This ensures accurate full-text matching.
1239fn intersect_by_file(lists: &[&Vec<FileLocation>]) -> Vec<FileLocation> {
1240    if lists.is_empty() {
1241        return vec![];
1242    }
1243
1244    use std::collections::HashSet;
1245
1246    // Create a set of (file_id, line_no) pairs from the first list
1247    let mut candidates: HashSet<(u32, u32)> = lists[0]
1248        .iter()
1249        .map(|loc| (loc.file_id, loc.line_no))
1250        .collect();
1251
1252    // Intersect with (file_id, line_no) pairs from other lists
1253    for &list in &lists[1..] {
1254        let list_pairs: HashSet<(u32, u32)> = list
1255            .iter()
1256            .map(|loc| (loc.file_id, loc.line_no))
1257            .collect();
1258        candidates.retain(|pair| list_pairs.contains(pair));
1259    }
1260
1261    // Convert back to FileLocation results
1262    let mut result = Vec::new();
1263    for &(file_id, line_no) in &candidates {
1264        // Find a location matching this (file_id, line_no) from the first list
1265        if let Some(loc) = lists[0]
1266            .iter()
1267            .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1268        {
1269            result.push(*loc);
1270        }
1271    }
1272
1273    result.sort_unstable();
1274    result
1275}
1276
1277/// Intersect posting lists by (file_id, line_no) pairs (owned version for lazy-loading)
1278///
1279/// Similar to intersect_by_file() but works with owned Vec<Vec<FileLocation>>
1280/// instead of references. Used in lazy-loading mode where posting lists are decompressed on-demand.
1281///
1282/// Returns locations where ALL trigrams appear on the SAME line (not just in the same file).
1283fn intersect_by_file_owned(lists: &[Vec<FileLocation>]) -> Vec<FileLocation> {
1284    if lists.is_empty() {
1285        return vec![];
1286    }
1287
1288    use std::collections::HashSet;
1289
1290    // Create a set of (file_id, line_no) pairs from the first list
1291    let mut candidates: HashSet<(u32, u32)> = lists[0]
1292        .iter()
1293        .map(|loc| (loc.file_id, loc.line_no))
1294        .collect();
1295
1296    // Intersect with (file_id, line_no) pairs from other lists
1297    for list in &lists[1..] {
1298        let list_pairs: HashSet<(u32, u32)> = list
1299            .iter()
1300            .map(|loc| (loc.file_id, loc.line_no))
1301            .collect();
1302        candidates.retain(|pair| list_pairs.contains(pair));
1303    }
1304
1305    // Convert back to FileLocation results
1306    let mut result = Vec::new();
1307    for &(file_id, line_no) in &candidates {
1308        // Find a location matching this (file_id, line_no) from the first list
1309        if let Some(loc) = lists[0]
1310            .iter()
1311            .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1312        {
1313            result.push(*loc);
1314        }
1315    }
1316
1317    result.sort_unstable();
1318    result
1319}
1320
1321#[cfg(test)]
1322mod tests {
1323    use super::*;
1324
1325    #[test]
1326    fn test_extract_trigrams() {
1327        let text = "hello";
1328        let trigrams = extract_trigrams(text);
1329
1330        // "hello" → "hel", "ell", "llo"
1331        assert_eq!(trigrams.len(), 3);
1332
1333        // Verify trigrams are unique
1334        let expected = vec![
1335            bytes_to_trigram(b"hel"),
1336            bytes_to_trigram(b"ell"),
1337            bytes_to_trigram(b"llo"),
1338        ];
1339        assert_eq!(trigrams, expected);
1340    }
1341
1342    #[test]
1343    fn test_extract_trigrams_short() {
1344        assert_eq!(extract_trigrams("ab").len(), 0);
1345        assert_eq!(extract_trigrams("abc").len(), 1);
1346    }
1347
1348    #[test]
1349    fn test_bytes_to_trigram() {
1350        let trigram1 = bytes_to_trigram(b"abc");
1351        let trigram2 = bytes_to_trigram(b"abc");
1352        let trigram3 = bytes_to_trigram(b"xyz");
1353
1354        assert_eq!(trigram1, trigram2);
1355        assert_ne!(trigram1, trigram3);
1356    }
1357
1358    #[test]
1359    fn test_trigram_roundtrip() {
1360        let original = b"foo";
1361        let trigram = bytes_to_trigram(original);
1362        let recovered = trigram_to_bytes(trigram);
1363        assert_eq!(original, &recovered);
1364    }
1365
1366    #[test]
1367    fn test_extract_with_locations() {
1368        let text = "hello\nworld";
1369        let locs = extract_trigrams_with_locations(text, 0);
1370
1371        // "hello\nworld" has 9 trigrams:
1372        // "hel", "ell", "llo", "lo\n", "o\nw", "\nwo", "wor", "orl", "rld"
1373        assert_eq!(locs.len(), 9);
1374
1375        // First trigram should be on line 1
1376        assert_eq!(locs[0].1.line_no, 1);
1377
1378        // After newline, should be line 2
1379        let world_start = text.find("world").unwrap();
1380        let world_trigram_idx = locs
1381            .iter()
1382            .position(|(_, loc)| loc.byte_offset as usize == world_start)
1383            .unwrap();
1384        assert_eq!(locs[world_trigram_idx].1.line_no, 2);
1385    }
1386
1387    #[test]
1388    fn test_trigram_index_basic() {
1389        let mut index = TrigramIndex::new();
1390
1391        let file_id = index.add_file(PathBuf::from("test.txt"));
1392        index.index_file(file_id, "hello world");
1393        index.finalize();
1394
1395        // Search for "hello"
1396        let results = index.search("hello");
1397        assert!(!results.is_empty());
1398
1399        // Search for "world"
1400        let results = index.search("world");
1401        assert!(!results.is_empty());
1402
1403        // Search for "goodbye" (not in text)
1404        let results = index.search("goodbye");
1405        assert!(results.is_empty());
1406    }
1407
1408    #[test]
1409    fn test_search_multifile() {
1410        let mut index = TrigramIndex::new();
1411
1412        let file1 = index.add_file(PathBuf::from("file1.txt"));
1413        let file2 = index.add_file(PathBuf::from("file2.txt"));
1414
1415        index.index_file(file1, "extract_symbols is here");
1416        index.index_file(file2, "extract_symbols is also here");
1417        index.finalize();
1418
1419        let results = index.search("extract_symbols");
1420        assert_eq!(results.len(), 2); // One result per file
1421
1422        // Verify we got both files
1423        let file_ids: Vec<u32> = results.iter().map(|loc| loc.file_id).collect();
1424        assert!(file_ids.contains(&file1));
1425        assert!(file_ids.contains(&file2));
1426    }
1427
1428    #[test]
1429    fn test_persistence_write() {
1430        use tempfile::TempDir;
1431
1432        let temp = TempDir::new().unwrap();
1433        let trigrams_path = temp.path().join("trigrams.bin");
1434
1435        // Build and write index
1436        let mut index = TrigramIndex::new();
1437        let file1 = index.add_file(PathBuf::from("src/main.rs"));
1438        let file2 = index.add_file(PathBuf::from("src/lib.rs"));
1439
1440        index.index_file(file1, "fn main() { println!(\"hello\"); }");
1441        index.index_file(file2, "pub fn hello() -> String { String::from(\"hello\") }");
1442        index.finalize();
1443
1444        // Write to disk
1445        index.write(&trigrams_path).unwrap();
1446
1447        // Verify file was created
1448        assert!(trigrams_path.exists());
1449
1450        // Verify file has content (header + data)
1451        let metadata = std::fs::metadata(&trigrams_path).unwrap();
1452        assert!(metadata.len() > HEADER_SIZE as u64);
1453
1454        // Verify we can read the header back
1455        use std::io::Read;
1456        let mut file = File::open(&trigrams_path).unwrap();
1457        let mut magic = [0u8; 4];
1458        file.read_exact(&mut magic).unwrap();
1459        assert_eq!(&magic, MAGIC);
1460
1461        // Note: Full roundtrip test verifies write works correctly.
1462        // Load verification is tested in production via query performance tests.
1463    }
1464}