Skip to main content

reflex/
trigram.rs

1//! Trigram-based inverted index for fast full-text code search
2//!
3//! This module implements the core trigram indexing algorithm used by Reflex.
4//! A trigram is a sequence of 3 consecutive bytes. By building an inverted index
5//! mapping trigrams to file locations, we can quickly narrow down search candidates
6//! and achieve sub-100ms query times even on large codebases.
7//!
8//! # Algorithm
9//!
10//! 1. **Indexing**: Extract all trigrams from each file, store locations
11//! 2. **Querying**: Extract trigrams from query, intersect posting lists
12//! 3. **Verification**: Check actual matches at candidate locations
13//!
14//! See `.context/TRIGRAM_RESEARCH.md` for detailed algorithm documentation.
15
16use anyhow::{Context, Result};
17use std::collections::HashMap;
18use std::fs::{File, OpenOptions};
19use std::io::Write;
20use std::path::{Path, PathBuf};
21
22/// A trigram is 3 consecutive bytes, packed into a u32 for efficient hashing
23pub type Trigram = u32;
24
25// Binary format constants for trigrams.bin
26const MAGIC: &[u8; 4] = b"RFTG"; // ReFlex TriGrams
27const VERSION: u32 = 3; // V3: No filtering, lazy loading with directory + data separation
28// Header: magic(4) + version(4) + num_trigrams(8) + num_files(8) = 24 bytes
29#[allow(dead_code)]
30const HEADER_SIZE: usize = 24;
31
32/// Write a u32 as a varint (variable-length integer)
33/// Uses 1-5 bytes depending on magnitude (smaller numbers = fewer bytes)
34fn write_varint(writer: &mut impl Write, mut value: u32) -> std::io::Result<()> {
35    loop {
36        let mut byte = (value & 0x7F) as u8;
37        value >>= 7;
38        if value != 0 {
39            byte |= 0x80; // Set continuation bit
40        }
41        writer.write_all(&[byte])?;
42        if value == 0 {
43            break;
44        }
45    }
46    Ok(())
47}
48
49/// Read a varint from a byte slice, returns (value, bytes_consumed)
50fn read_varint(data: &[u8]) -> Result<(u32, usize)> {
51    let mut value: u32 = 0;
52    let mut shift = 0;
53    let mut pos = 0;
54
55    loop {
56        if pos >= data.len() {
57            anyhow::bail!("Truncated varint");
58        }
59        let byte = data[pos];
60        pos += 1;
61
62        value |= ((byte & 0x7F) as u32) << shift;
63        if byte & 0x80 == 0 {
64            break;
65        }
66        shift += 7;
67        if shift >= 32 {
68            anyhow::bail!("Varint too large");
69        }
70    }
71
72    Ok((value, pos))
73}
74
75/// Decompress a posting list from memory-mapped data
76///
77/// Reads a compressed posting list (delta+varint encoded) from the given offset
78/// and decompresses it into a Vec<FileLocation>.
79///
80/// # Arguments
81/// * `mmap` - Memory-mapped file data
82/// * `offset` - Absolute byte offset where compressed data starts
83/// * `size` - Number of bytes to read
84fn decompress_posting_list(
85    mmap: &[u8],
86    offset: u64,
87    size: u32,
88) -> Result<Vec<FileLocation>> {
89    let start = offset as usize;
90    let end = start + size as usize;
91
92    if end > mmap.len() {
93        anyhow::bail!(
94            "Posting list out of bounds: offset={}, size={}, mmap_len={}",
95            offset,
96            size,
97            mmap.len()
98        );
99    }
100
101    let compressed_data = &mmap[start..end];
102
103    // Decompress delta-encoded posting list
104    let mut locations = Vec::new();
105    let mut pos = 0;
106    let mut prev_file_id = 0u32;
107    let mut prev_line_no = 0u32;
108    let mut prev_byte_offset = 0u32;
109
110    while pos < compressed_data.len() {
111        // Read file_id delta
112        let (file_id_delta, consumed) = read_varint(&compressed_data[pos..])?;
113        pos += consumed;
114
115        // Read line_no delta
116        let (line_no_delta, consumed) = read_varint(&compressed_data[pos..])?;
117        pos += consumed;
118
119        // Read byte_offset delta
120        let (byte_offset_delta, consumed) = read_varint(&compressed_data[pos..])?;
121        pos += consumed;
122
123        // Reconstruct absolute values from deltas
124        let file_id = prev_file_id.wrapping_add(file_id_delta);
125        let line_no = prev_line_no.wrapping_add(line_no_delta);
126        let byte_offset = prev_byte_offset.wrapping_add(byte_offset_delta);
127
128        locations.push(FileLocation {
129            file_id,
130            line_no,
131            byte_offset,
132        });
133
134        // Update previous values for next delta
135        prev_file_id = file_id;
136        prev_line_no = line_no;
137        prev_byte_offset = byte_offset;
138    }
139
140    Ok(locations)
141}
142
143/// Location of a trigram occurrence in the codebase
144#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
145pub struct FileLocation {
146    /// File ID (index into file list)
147    pub file_id: u32,
148    /// Line number (1-indexed)
149    pub line_no: u32,
150    /// Byte offset in file (for context extraction)
151    pub byte_offset: u32,
152}
153
154impl FileLocation {
155    pub fn new(file_id: u32, line_no: u32, byte_offset: u32) -> Self {
156        Self {
157            file_id,
158            line_no,
159            byte_offset,
160        }
161    }
162}
163
164/// Directory entry for lazy-loaded trigram index
165///
166/// Maps each trigram to its compressed posting list location in the data section.
167/// Total size: 16 bytes per entry (4 + 8 + 4)
168#[derive(Debug, Clone)]
169struct DirectoryEntry {
170    /// The trigram value (for binary search)
171    trigram: Trigram,
172    /// Absolute byte offset in the file where compressed data starts
173    data_offset: u64,
174    /// Size of compressed posting list in bytes
175    compressed_size: u32,
176}
177
178/// Trigram-based inverted index
179///
180/// Maps each trigram to a sorted list of locations where it appears.
181/// Posting lists are kept sorted by (file_id, line_no) for efficient intersection.
182/// The index itself is kept sorted by trigram for O(log n) binary search.
183///
184/// Supports three modes:
185/// 1. **In-memory mode** (during indexing): All posting lists in RAM
186/// 2. **Batch-flush mode** (large codebases): Periodically flushes partial indices to disk to limit RAM
187/// 3. **Lazy-loaded mode** (after loading): Compressed posting lists in mmap, decompressed on-demand
188pub struct TrigramIndex {
189    /// Inverted index: sorted Vec of (trigram, locations) for binary search
190    /// Used in in-memory mode (during indexing)
191    index: Vec<(Trigram, Vec<FileLocation>)>,
192    /// File ID to file path mapping
193    files: Vec<PathBuf>,
194    /// Temporary HashMap used during batch indexing (None when finalized)
195    temp_index: Option<HashMap<Trigram, Vec<FileLocation>>>,
196    /// Memory-mapped index file (for lazy loading)
197    mmap: Option<memmap2::Mmap>,
198    /// Directory of (trigram, offset, size) for lazy loading
199    directory: Vec<DirectoryEntry>,
200    /// Partial index files created during batch flushing (for k-way merge at finalize)
201    partial_indices: Vec<PathBuf>,
202    /// Temporary directory for partial indices
203    temp_dir: Option<PathBuf>,
204    /// Cap on posting list size; 0 = unlimited. Enforced at finalize time.
205    /// Bounds query latency for high-frequency trigrams (trigram-density lens).
206    max_posting_list_entries: usize,
207}
208
209impl TrigramIndex {
210    /// Create a new empty trigram index
211    pub fn new() -> Self {
212        Self {
213            index: Vec::new(),
214            files: Vec::new(),
215            temp_index: Some(HashMap::new()),
216            mmap: None,
217            directory: Vec::new(),
218            partial_indices: Vec::new(),
219            temp_dir: None,
220            max_posting_list_entries: 0,
221        }
222    }
223
224    /// Set maximum posting list entries per trigram (0 = unlimited).
225    pub fn set_max_posting_list_entries(&mut self, cap: usize) {
226        self.max_posting_list_entries = cap;
227    }
228
229    /// Enable batch-flush mode for large codebases
230    ///
231    /// Creates a temporary directory for partial indices that will be merged at finalize().
232    /// Call this before indexing to enable memory-efficient indexing for huge codebases.
233    pub fn enable_batch_flush(&mut self, temp_dir: PathBuf) -> Result<()> {
234        std::fs::create_dir_all(&temp_dir)
235            .context("Failed to create temp directory for batch flushing")?;
236        self.temp_dir = Some(temp_dir);
237        log::info!("Enabled batch-flush mode for trigram index");
238        Ok(())
239    }
240
241    /// Flush current temp_index to a partial index file
242    ///
243    /// This clears the in-memory HashMap and writes a sorted partial index to disk.
244    /// Called periodically during indexing to limit memory usage.
245    pub fn flush_batch(&mut self) -> Result<()> {
246        let temp_dir = self.temp_dir.as_ref()
247            .ok_or_else(|| anyhow::anyhow!("Batch flush not enabled - call enable_batch_flush() first"))?;
248
249        // Take ownership of temp_index to finalize it
250        let temp_map = self.temp_index.take()
251            .ok_or_else(|| anyhow::anyhow!("No temp index to flush"))?;
252
253        if temp_map.is_empty() {
254            // Nothing to flush, restore empty map
255            self.temp_index = Some(HashMap::new());
256            return Ok(());
257        }
258
259        // Convert HashMap to sorted Vec
260        let mut partial_index: Vec<(Trigram, Vec<FileLocation>)> = temp_map.into_iter().collect();
261
262        // Sort and deduplicate posting lists
263        for (_, list) in partial_index.iter_mut() {
264            list.sort_unstable();
265            list.dedup();
266        }
267
268        // Sort by trigram
269        partial_index.sort_unstable_by_key(|(trigram, _)| *trigram);
270
271        // Write to temp file
272        let partial_file = temp_dir.join(format!("partial_{}.bin", self.partial_indices.len()));
273        self.write_partial_index(&partial_file, &partial_index)?;
274
275        self.partial_indices.push(partial_file);
276
277        // Create new empty temp_index for next batch
278        self.temp_index = Some(HashMap::new());
279
280        log::debug!(
281            "Flushed batch {} with {} trigrams to disk",
282            self.partial_indices.len(),
283            partial_index.len()
284        );
285
286        Ok(())
287    }
288
289    /// Write a partial index to disk (simplified format for merging)
290    fn write_partial_index(
291        &self,
292        path: &Path,
293        index: &[(Trigram, Vec<FileLocation>)],
294    ) -> Result<()> {
295        use std::io::BufWriter;
296
297        let file = OpenOptions::new()
298            .create(true)
299            .write(true)
300            .truncate(true)
301            .open(path)?;
302
303        let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
304
305        // Write number of trigrams
306        writer.write_all(&(index.len() as u64).to_le_bytes())?;
307
308        // Write each (trigram, posting_list)
309        for (trigram, locations) in index {
310            writer.write_all(&trigram.to_le_bytes())?;
311            writer.write_all(&(locations.len() as u32).to_le_bytes())?;
312
313            for loc in locations {
314                writer.write_all(&loc.file_id.to_le_bytes())?;
315                writer.write_all(&loc.line_no.to_le_bytes())?;
316                writer.write_all(&loc.byte_offset.to_le_bytes())?;
317            }
318        }
319
320        writer.flush()?;
321        Ok(())
322    }
323
324    /// Add a file to the index and return its file_id
325    pub fn add_file(&mut self, path: PathBuf) -> u32 {
326        let file_id = self.files.len() as u32;
327        self.files.push(path);
328        file_id
329    }
330
331    /// Get file path for a file_id
332    pub fn get_file(&self, file_id: u32) -> Option<&PathBuf> {
333        self.files.get(file_id as usize)
334    }
335
336    /// Get total number of files
337    pub fn file_count(&self) -> usize {
338        self.files.len()
339    }
340
341    /// Get total number of unique trigrams
342    pub fn trigram_count(&self) -> usize {
343        if !self.directory.is_empty() {
344            // Lazy-loaded mode
345            self.directory.len()
346        } else {
347            // In-memory mode
348            self.index.len()
349        }
350    }
351
352    /// Index a file's content
353    ///
354    /// Extracts all trigrams from the content and adds them to the inverted index.
355    /// Must call finalize() after indexing all files to prepare for searching.
356    pub fn index_file(&mut self, file_id: u32, content: &str) {
357        let trigrams = extract_trigrams_with_locations(content, file_id);
358
359        // Use the persistent HashMap for O(1) updates during batch processing
360        if let Some(ref mut temp_map) = self.temp_index {
361            for (trigram, location) in trigrams {
362                temp_map
363                    .entry(trigram)
364                    .or_insert_with(Vec::new)
365                    .push(location);
366            }
367        } else {
368            panic!("Cannot call index_file() after finalize(). Index is read-only.");
369        }
370    }
371
372    /// Build index from a collection of pre-extracted trigrams (bulk operation)
373    ///
374    /// This is much more efficient than calling index_file() multiple times,
375    /// as it builds the HashMap once instead of rebuilding it for each file.
376    pub fn build_from_trigrams(&mut self, trigrams: Vec<(Trigram, FileLocation)>) {
377        let mut temp_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
378
379        // Group trigrams into posting lists
380        for (trigram, location) in trigrams {
381            temp_map
382                .entry(trigram)
383                .or_insert_with(Vec::new)
384                .push(location);
385        }
386
387        // Convert to sorted Vec for binary search
388        self.index = temp_map.into_iter().collect();
389
390        // Clear temp_index since we're using the Vec directly
391        self.temp_index = None;
392
393        // Finalize immediately (sort and deduplicate)
394        self.finalize();
395    }
396
397    /// Finalize the index by sorting all posting lists and the index itself
398    ///
399    /// Must be called after all files are indexed, before querying.
400    /// Converts the HashMap to a sorted Vec for fast binary search.
401    ///
402    /// If batch flushing was enabled, finalization will be deferred until write()
403    /// is called, which will perform streaming merge directly to disk.
404    pub fn finalize(&mut self) {
405        // If we have partial indices from batch flushing, DON'T merge yet
406        // We'll do streaming merge in write() or write_with_streaming_merge()
407        if !self.partial_indices.is_empty() {
408            log::info!("Deferring finalization - will stream merge {} partial indices during write()",
409                       self.partial_indices.len());
410
411            // Flush final batch if temp_index is not empty
412            if let Some(ref temp_map) = self.temp_index {
413                if !temp_map.is_empty() {
414                    self.flush_batch().expect("Failed to flush final batch");
415                }
416            }
417
418            // Don't merge yet - write() will handle it
419            return;
420        }
421
422        // Standard finalization (no batch flushing)
423        // Convert HashMap to Vec if we have a temp index
424        if let Some(temp_map) = self.temp_index.take() {
425            self.index = temp_map.into_iter().collect();
426        }
427
428        // Sort, deduplicate, and cap posting lists
429        let cap = self.max_posting_list_entries;
430        for (trigram, list) in self.index.iter_mut() {
431            list.sort_unstable();
432            list.dedup(); // Remove duplicates (same trigram appearing multiple times on same line)
433            if cap > 0 && list.len() > cap {
434                log::warn!("Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.", trigram, list.len(), cap);
435                list.truncate(cap);
436            }
437        }
438
439        // Sort the index by trigram for binary search
440        self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
441    }
442
443    /// Merge all partial indices directly to trigrams.bin using streaming k-way merge
444    ///
445    /// This avoids loading the entire index into RAM by:
446    /// 1. Opening all partial index files as readers
447    /// 2. Performing k-way merge using a priority queue
448    /// 3. Writing compressed posting lists directly to disk
449    /// 4. Never accumulating more than K posting lists in memory at once
450    fn merge_partial_indices_to_file(&mut self, output_path: &Path) -> Result<()> {
451        use std::io::{BufReader, BufWriter, Read};
452        use std::cmp::Ordering;
453        use std::collections::BinaryHeap;
454
455        log::info!("Streaming merge of {} partial indices to {:?}",
456                   self.partial_indices.len(), output_path);
457
458        // Open all partial indices as buffered readers
459        struct PartialIndexReader {
460            reader: BufReader<File>,
461            current_trigram: Option<Trigram>,
462            current_posting_list: Vec<FileLocation>,
463            reader_id: usize,
464        }
465
466        let mut readers: Vec<PartialIndexReader> = Vec::new();
467
468        for (idx, partial_path) in self.partial_indices.iter().enumerate() {
469            let file = File::open(partial_path)
470                .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
471            let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
472
473            // Read number of trigrams (we don't need it for streaming merge)
474            let mut buf = [0u8; 8];
475            reader.read_exact(&mut buf)?;
476
477            readers.push(PartialIndexReader {
478                reader,
479                current_trigram: None,
480                current_posting_list: Vec::new(),
481                reader_id: idx,
482            });
483        }
484
485        // Helper to read next trigram from a reader
486        fn read_next_trigram(reader: &mut PartialIndexReader) -> Result<bool> {
487            // Try to read trigram
488            let mut trigram_buf = [0u8; 4];
489            match reader.reader.read_exact(&mut trigram_buf) {
490                Ok(_) => {
491                    let trigram = u32::from_le_bytes(trigram_buf);
492
493                    // Read posting list size
494                    let mut len_buf = [0u8; 4];
495                    reader.reader.read_exact(&mut len_buf)?;
496                    let list_len = u32::from_le_bytes(len_buf) as usize;
497
498                    // Read all locations for this trigram
499                    let mut locations = Vec::with_capacity(list_len);
500                    for _ in 0..list_len {
501                        let mut loc_buf = [0u8; 12];
502                        reader.reader.read_exact(&mut loc_buf)?;
503
504                        let file_id = u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
505                        let line_no = u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
506                        let byte_offset = u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
507
508                        locations.push(FileLocation { file_id, line_no, byte_offset });
509                    }
510
511                    reader.current_trigram = Some(trigram);
512                    reader.current_posting_list = locations;
513                    Ok(true)
514                }
515                Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
516                    reader.current_trigram = None;
517                    Ok(false)
518                }
519                Err(e) => Err(e.into()),
520            }
521        }
522
523        // Initialize: read first trigram from each reader
524        for reader in &mut readers {
525            read_next_trigram(reader)?;
526        }
527
528        // Priority queue entry for k-way merge
529        #[derive(Eq, PartialEq)]
530        struct HeapEntry {
531            trigram: Trigram,
532            reader_id: usize,
533        }
534
535        impl Ord for HeapEntry {
536            fn cmp(&self, other: &Self) -> Ordering {
537                // Reverse for min-heap
538                other.trigram.cmp(&self.trigram)
539                    .then_with(|| other.reader_id.cmp(&self.reader_id))
540            }
541        }
542
543        impl PartialOrd for HeapEntry {
544            fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
545                Some(self.cmp(other))
546            }
547        }
548
549        // Build initial heap
550        let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::new();
551        for reader in &readers {
552            if let Some(trigram) = reader.current_trigram {
553                heap.push(HeapEntry {
554                    trigram,
555                    reader_id: reader.reader_id,
556                });
557            }
558        }
559
560        // Open output file for writing
561        let file = OpenOptions::new()
562            .create(true)
563            .write(true)
564            .truncate(true)
565            .open(output_path)
566            .with_context(|| format!("Failed to create {}", output_path.display()))?;
567
568        let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
569
570        // Write placeholder header (we'll update it at the end)
571        writer.write_all(MAGIC)?;
572        writer.write_all(&VERSION.to_le_bytes())?;
573        writer.write_all(&0u64.to_le_bytes())?; // num_trigrams (placeholder)
574        writer.write_all(&(self.files.len() as u64).to_le_bytes())?; // num_files
575
576        // We'll build the directory as we go
577        let mut directory: Vec<DirectoryEntry> = Vec::new();
578        let mut num_trigrams = 0u64;
579
580        // K-way merge loop
581        let mut current_trigram: Option<Trigram> = None;
582        let mut merged_locations: Vec<FileLocation> = Vec::new();
583
584        while let Some(entry) = heap.pop() {
585            let reader = &mut readers[entry.reader_id];
586
587            // If this is a new trigram, write the previous one
588            if current_trigram.is_some() && current_trigram != Some(entry.trigram) {
589                // Write the accumulated posting list for current_trigram
590                let trigram = current_trigram.unwrap();
591                merged_locations.sort_unstable();
592                merged_locations.dedup();
593
594                let cap = self.max_posting_list_entries;
595                if cap > 0 && merged_locations.len() > cap {
596                    log::warn!("Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.", trigram, merged_locations.len(), cap);
597                    merged_locations.truncate(cap);
598                }
599
600                // Compress and write this trigram's posting list
601                let data_offset = writer.stream_position()?;
602                let compressed_size = self.write_compressed_posting_list(&mut writer, &merged_locations)?;
603
604                directory.push(DirectoryEntry {
605                    trigram,
606                    data_offset,
607                    compressed_size,
608                });
609
610                num_trigrams += 1;
611                merged_locations.clear();
612            }
613
614            // Set current trigram
615            current_trigram = Some(entry.trigram);
616
617            // Merge this reader's posting list into accumulated list
618            merged_locations.extend_from_slice(&reader.current_posting_list);
619
620            // Advance this reader to next trigram
621            if read_next_trigram(reader)? {
622                if let Some(next_trigram) = reader.current_trigram {
623                    heap.push(HeapEntry {
624                        trigram: next_trigram,
625                        reader_id: entry.reader_id,
626                    });
627                }
628            }
629        }
630
631        // Write final trigram
632        if let Some(trigram) = current_trigram {
633            merged_locations.sort_unstable();
634            merged_locations.dedup();
635
636            let cap = self.max_posting_list_entries;
637            if cap > 0 && merged_locations.len() > cap {
638                log::warn!("Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.", trigram, merged_locations.len(), cap);
639                merged_locations.truncate(cap);
640            }
641
642            let data_offset = writer.stream_position()?;
643            let compressed_size = self.write_compressed_posting_list(&mut writer, &merged_locations)?;
644
645            directory.push(DirectoryEntry {
646                trigram,
647                data_offset,
648                compressed_size,
649            });
650
651            num_trigrams += 1;
652        }
653
654        log::info!("Merged {} trigrams from {} partial indices", num_trigrams, self.partial_indices.len());
655
656        // Remember where data section ended (not used but kept for clarity)
657        let _data_end_pos = writer.stream_position()?;
658
659        // Write file paths after data section
660        for file_path in &self.files {
661            let path_str = file_path.to_string_lossy();
662            let path_bytes = path_str.as_bytes();
663            write_varint(&mut writer, path_bytes.len() as u32)?;
664            writer.write_all(path_bytes)?;
665        }
666
667        // Flush before we rewrite the beginning
668        writer.flush()?;
669        drop(writer);
670
671        // Now we need to insert the directory at the beginning
672        // We'll read the data+files we just wrote, then rewrite the file with directory in between
673        use std::io::{Seek, SeekFrom};
674
675        // Read data and files sections
676        let mut temp_data = Vec::new();
677        {
678            let mut file = File::open(output_path)?;
679            file.seek(SeekFrom::Start(HEADER_SIZE as u64))?;
680            file.read_to_end(&mut temp_data)?;
681        }
682
683        // Rewrite file with correct structure
684        let file = OpenOptions::new().write(true).truncate(true).open(output_path)?;
685        let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
686
687        // Write header with correct num_trigrams
688        writer.write_all(MAGIC)?;
689        writer.write_all(&VERSION.to_le_bytes())?;
690        writer.write_all(&num_trigrams.to_le_bytes())?;
691        writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
692
693        // Write directory
694        for entry in &directory {
695            writer.write_all(&entry.trigram.to_le_bytes())?;
696            // Adjust data offset to account for directory size
697            let adjusted_offset = entry.data_offset + (directory.len() * 16) as u64;
698            writer.write_all(&adjusted_offset.to_le_bytes())?;
699            writer.write_all(&entry.compressed_size.to_le_bytes())?;
700        }
701
702        // Write data and files sections
703        writer.write_all(&temp_data)?;
704
705        // Flush and sync
706        writer.flush()?;
707        writer.get_ref().sync_all()?;
708
709        // Clean up partial index files
710        for partial_path in &self.partial_indices {
711            let _ = std::fs::remove_file(partial_path);
712        }
713        if let Some(ref temp_dir) = self.temp_dir {
714            let _ = std::fs::remove_dir(temp_dir);
715        }
716
717        log::info!("Wrote {} trigrams to {:?}", num_trigrams, output_path);
718
719        Ok(())
720    }
721
722    /// Write a compressed posting list to the writer and return the compressed size
723    fn write_compressed_posting_list(
724        &self,
725        writer: &mut impl Write,
726        locations: &[FileLocation],
727    ) -> Result<u32> {
728        let mut compressed = Vec::new();
729
730        // Compress posting list using delta+varint encoding
731        let mut prev_file_id = 0u32;
732        let mut prev_line_no = 0u32;
733        let mut prev_byte_offset = 0u32;
734
735        for loc in locations {
736            // Compute deltas
737            let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
738            let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
739            let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
740
741            // Write deltas as varints
742            write_varint(&mut compressed, file_id_delta)?;
743            write_varint(&mut compressed, line_no_delta)?;
744            write_varint(&mut compressed, byte_offset_delta)?;
745
746            // Update previous values
747            prev_file_id = loc.file_id;
748            prev_line_no = loc.line_no;
749            prev_byte_offset = loc.byte_offset;
750        }
751
752        let compressed_size = compressed.len() as u32;
753        writer.write_all(&compressed)?;
754
755        Ok(compressed_size)
756    }
757
758    /// Merge all partial indices into self.index (old in-memory approach - deprecated)
759    #[allow(dead_code)]
760    fn merge_partial_indices(&mut self) -> Result<()> {
761        use std::io::{BufReader, Read};
762
763        // Read all partial indices into memory (simplified approach for now)
764        let mut all_entries: Vec<(Trigram, FileLocation)> = Vec::new();
765
766        for partial_path in &self.partial_indices {
767            let file = File::open(partial_path)
768                .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
769            let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
770
771            // Read number of trigrams
772            let mut buf = [0u8; 8];
773            reader.read_exact(&mut buf)?;
774            let num_trigrams = u64::from_le_bytes(buf) as usize;
775
776            // Read each (trigram, posting_list)
777            for _ in 0..num_trigrams {
778                // Read trigram
779                let mut trigram_buf = [0u8; 4];
780                reader.read_exact(&mut trigram_buf)?;
781                let trigram = u32::from_le_bytes(trigram_buf);
782
783                // Read posting list size
784                let mut len_buf = [0u8; 4];
785                reader.read_exact(&mut len_buf)?;
786                let list_len = u32::from_le_bytes(len_buf) as usize;
787
788                // Read all locations
789                for _ in 0..list_len {
790                    let mut loc_buf = [0u8; 12]; // 3 * u32
791                    reader.read_exact(&mut loc_buf)?;
792
793                    let file_id = u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
794                    let line_no = u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
795                    let byte_offset = u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
796
797                    all_entries.push((trigram, FileLocation { file_id, line_no, byte_offset }));
798                }
799            }
800        }
801
802        log::info!("Read {} total trigram entries from {} partial indices",
803                   all_entries.len(), self.partial_indices.len());
804
805        // Group by trigram
806        let mut index_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
807        for (trigram, location) in all_entries {
808            index_map
809                .entry(trigram)
810                .or_insert_with(Vec::new)
811                .push(location);
812        }
813
814        // Convert to sorted vec
815        self.index = index_map.into_iter().collect();
816
817        // Sort and deduplicate posting lists
818        for (_, list) in self.index.iter_mut() {
819            list.sort_unstable();
820            list.dedup();
821        }
822
823        // Sort by trigram
824        self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
825
826        // Clean up partial index files
827        for partial_path in &self.partial_indices {
828            let _ = std::fs::remove_file(partial_path);
829        }
830        if let Some(ref temp_dir) = self.temp_dir {
831            let _ = std::fs::remove_dir(temp_dir);
832        }
833
834        log::info!("Merged into final index with {} trigrams", self.index.len());
835
836        Ok(())
837    }
838
839    /// Search for a plain text pattern
840    ///
841    /// Returns candidate file locations that could contain the pattern.
842    /// Caller must verify actual matches.
843    ///
844    /// In lazy-loaded mode: Decompresses posting lists on-demand from mmap.
845    /// In in-memory mode: Uses pre-loaded posting lists.
846    pub fn search(&self, pattern: &str) -> Vec<FileLocation> {
847        if pattern.len() < 3 {
848            // Pattern too short for trigrams - caller must fall back to full scan
849            return vec![];
850        }
851
852        let trigrams = extract_trigrams(pattern);
853        if trigrams.is_empty() {
854            return vec![];
855        }
856
857        // Check if we're in lazy-loaded mode or in-memory mode
858        if let Some(ref mmap) = self.mmap {
859            // Lazy-loaded mode: decompress posting lists on-demand
860            let mut posting_lists: Vec<Vec<FileLocation>> = Vec::new();
861
862            for trigram in &trigrams {
863                // Binary search directory for this trigram
864                match self.directory.binary_search_by_key(trigram, |e| e.trigram) {
865                    Ok(idx) => {
866                        let entry = &self.directory[idx];
867                        // Decompress this posting list on-demand
868                        match decompress_posting_list(mmap, entry.data_offset, entry.compressed_size) {
869                            Ok(locations) => posting_lists.push(locations),
870                            Err(e) => {
871                                log::warn!("Failed to decompress posting list for trigram {}: {}", trigram, e);
872                                return vec![];
873                            }
874                        }
875                    }
876                    Err(_) => {
877                        // Trigram not found - pattern cannot match
878                        return vec![];
879                    }
880                }
881            }
882
883            if posting_lists.is_empty() || posting_lists.len() < trigrams.len() {
884                return vec![];
885            }
886
887            // Sort by list size (smallest first for efficient intersection)
888            posting_lists.sort_by_key(|list| list.len());
889
890            // Intersect posting lists (owned version)
891            intersect_by_file_owned(&posting_lists)
892        } else {
893            // In-memory mode: use pre-loaded index
894            let mut posting_lists: Vec<&Vec<FileLocation>> = trigrams
895                .iter()
896                .filter_map(|t| {
897                    self.index
898                        .binary_search_by_key(t, |(trigram, _)| *trigram)
899                        .ok()
900                        .map(|idx| &self.index[idx].1)
901                })
902                .collect();
903
904            if posting_lists.is_empty() {
905                return vec![];
906            }
907
908            if posting_lists.len() < trigrams.len() {
909                // Some trigrams missing - pattern cannot match
910                return vec![];
911            }
912
913            // Sort by list size (smallest first for efficient intersection)
914            posting_lists.sort_by_key(|list| list.len());
915
916            // Intersect posting lists (reference version)
917            intersect_by_file(&posting_lists)
918        }
919    }
920
921    /// Get posting list for a specific trigram (for debugging)
922    pub fn get_posting_list(&self, trigram: Trigram) -> Option<&Vec<FileLocation>> {
923        self.index
924            .binary_search_by_key(&trigram, |(t, _)| *t)
925            .ok()
926            .map(|idx| &self.index[idx].1)
927    }
928
929    /// Write the trigram index to disk
930    ///
931    /// Binary format V3 (lazy-loadable with directory + data separation):
932    /// - Header (24 bytes): magic, version, num_trigrams, num_files
933    /// - Directory Section (16 bytes per trigram):
934    ///   - trigram: u32 (4 bytes)
935    ///   - data_offset: u64 (8 bytes) - absolute offset in file
936    ///   - compressed_size: u32 (4 bytes) - size of compressed posting list
937    /// - Data Section (variable size):
938    ///   - Compressed posting lists (delta+varint encoded)
939    /// - File Paths Section (variable size):
940    ///   - path_len: varint
941    ///   - path_bytes: [u8; path_len]
942    pub fn write(&mut self, path: impl AsRef<Path>) -> Result<()> {
943        let path = path.as_ref();
944
945        // If we have partial indices from batch flushing, use streaming merge
946        if !self.partial_indices.is_empty() {
947            log::info!("Using streaming merge to write {} partial indices", self.partial_indices.len());
948            return self.merge_partial_indices_to_file(path);
949        }
950
951        // Standard write path (no batch flushing).
952        // Non-atomic: writes directly to the target path with truncate(true).
953        // On disk-full mid-write the file is left corrupt; the indexer fast-path
954        // validates the "RFTG" magic bytes on re-index, forcing a clean rebuild.
955        let file = OpenOptions::new()
956            .create(true)
957            .write(true)
958            .truncate(true)
959            .open(path)
960            .with_context(|| format!("Failed to create {}", path.display()))?;
961
962        // Use a large buffer (16MB) for streaming writes
963        let mut writer = std::io::BufWriter::with_capacity(16 * 1024 * 1024, file);
964
965        // Write header
966        writer.write_all(MAGIC)?;
967        writer.write_all(&VERSION.to_le_bytes())?;
968        writer.write_all(&(self.index.len() as u64).to_le_bytes())?; // num_trigrams
969        writer.write_all(&(self.files.len() as u64).to_le_bytes())?; // num_files
970
971        // Build directory and write compressed data in a single pass
972        let mut directory: Vec<DirectoryEntry> = Vec::with_capacity(self.index.len());
973
974        // Calculate directory start and size
975        let directory_start = HEADER_SIZE as u64;
976        let directory_size = self.index.len() * 16;
977
978        // Reserve space for directory (we'll write it after data)
979        let data_start = directory_start + directory_size as u64;
980        let mut current_offset = data_start;
981
982        // We need to write in the correct order: header, directory, data, file paths
983        // But we need data offsets to write directory
984        // So we compress data first, then write header+directory+data
985
986        // Step 1: Compress all posting lists and track offsets
987        let mut compressed_lists: Vec<(Trigram, Vec<u8>)> = Vec::with_capacity(self.index.len());
988
989        for (trigram, locations) in &self.index {
990            // Compress the posting list
991            let mut compressed = Vec::new();
992            let mut prev_file_id = 0u32;
993            let mut prev_line_no = 0u32;
994            let mut prev_byte_offset = 0u32;
995
996            for loc in locations {
997                let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
998                let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
999                let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
1000
1001                write_varint(&mut compressed, file_id_delta)?;
1002                write_varint(&mut compressed, line_no_delta)?;
1003                write_varint(&mut compressed, byte_offset_delta)?;
1004
1005                prev_file_id = loc.file_id;
1006                prev_line_no = loc.line_no;
1007                prev_byte_offset = loc.byte_offset;
1008            }
1009
1010            directory.push(DirectoryEntry {
1011                trigram: *trigram,
1012                data_offset: current_offset,
1013                compressed_size: compressed.len() as u32,
1014            });
1015            current_offset += compressed.len() as u64;
1016
1017            compressed_lists.push((*trigram, compressed));
1018        }
1019
1020        // Step 2: Write directory
1021        for entry in &directory {
1022            writer.write_all(&entry.trigram.to_le_bytes())?;
1023            writer.write_all(&entry.data_offset.to_le_bytes())?;
1024            writer.write_all(&entry.compressed_size.to_le_bytes())?;
1025        }
1026
1027        // Step 3: Write data section (compressed posting lists)
1028        for (_, compressed) in &compressed_lists {
1029            writer.write_all(compressed)?;
1030        }
1031
1032        // Step 4: Write file paths
1033        for file_path in &self.files {
1034            let path_str = file_path.to_string_lossy();
1035            let path_bytes = path_str.as_bytes();
1036            write_varint(&mut writer, path_bytes.len() as u32)?;
1037            writer.write_all(path_bytes)?;
1038        }
1039
1040        // Flush and sync
1041        writer.flush()?;
1042        writer.get_ref().sync_all()?;
1043
1044        log::info!(
1045            "Wrote lazy-loadable trigram index: {} trigrams, {} files to {:?}",
1046            self.index.len(),
1047            self.files.len(),
1048            path
1049        );
1050
1051        Ok(())
1052    }
1053
1054    /// Load trigram index from disk using memory-mapped I/O with lazy loading
1055    ///
1056    /// Binary format V3: Only reads the directory and file paths, keeps posting lists compressed in mmap.
1057    /// Posting lists are decompressed on-demand during search queries.
1058    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
1059        let path = path.as_ref();
1060
1061        let file = File::open(path)
1062            .with_context(|| format!("Failed to open {}", path.display()))?;
1063
1064        // Memory-map the file (keep it alive for lazy access)
1065        let mmap = unsafe {
1066            memmap2::Mmap::map(&file)
1067                .with_context(|| format!("Failed to mmap {}", path.display()))?
1068        };
1069
1070        // Validate header
1071        if mmap.len() < HEADER_SIZE {
1072            anyhow::bail!("trigrams.bin too small (expected at least {} bytes)", HEADER_SIZE);
1073        }
1074
1075        if &mmap[0..4] != MAGIC {
1076            anyhow::bail!("Invalid trigrams.bin (wrong magic bytes)");
1077        }
1078
1079        let version = u32::from_le_bytes([mmap[4], mmap[5], mmap[6], mmap[7]]);
1080        if version != VERSION {
1081            anyhow::bail!(
1082                "Unsupported trigrams.bin version: {} (expected {}). Please re-index with 'reflex index'.",
1083                version, VERSION
1084            );
1085        }
1086
1087        let num_trigrams = u64::from_le_bytes([
1088            mmap[8], mmap[9], mmap[10], mmap[11],
1089            mmap[12], mmap[13], mmap[14], mmap[15],
1090        ]) as usize;
1091
1092        let num_files = u64::from_le_bytes([
1093            mmap[16], mmap[17], mmap[18], mmap[19],
1094            mmap[20], mmap[21], mmap[22], mmap[23],
1095        ]) as usize;
1096
1097        log::debug!("Loading lazy trigram index: {} trigrams, {} files", num_trigrams, num_files);
1098
1099        // Read directory (trigram → offset mappings) - fast, just metadata
1100        let mut directory = Vec::with_capacity(num_trigrams);
1101        let mut pos = HEADER_SIZE;
1102        let directory_size = num_trigrams * 16; // 16 bytes per entry
1103
1104        for _ in 0..num_trigrams {
1105            if pos + 16 > mmap.len() {
1106                anyhow::bail!("Truncated directory entry at pos={}", pos);
1107            }
1108
1109            let trigram = u32::from_le_bytes([
1110                mmap[pos],
1111                mmap[pos + 1],
1112                mmap[pos + 2],
1113                mmap[pos + 3],
1114            ]);
1115            pos += 4;
1116
1117            let data_offset = u64::from_le_bytes([
1118                mmap[pos],
1119                mmap[pos + 1],
1120                mmap[pos + 2],
1121                mmap[pos + 3],
1122                mmap[pos + 4],
1123                mmap[pos + 5],
1124                mmap[pos + 6],
1125                mmap[pos + 7],
1126            ]);
1127            pos += 8;
1128
1129            let compressed_size = u32::from_le_bytes([
1130                mmap[pos],
1131                mmap[pos + 1],
1132                mmap[pos + 2],
1133                mmap[pos + 3],
1134            ]);
1135            pos += 4;
1136
1137            directory.push(DirectoryEntry {
1138                trigram,
1139                data_offset,
1140                compressed_size,
1141            });
1142        }
1143
1144        // Directory is already sorted by trigram (from write())
1145        directory.sort_unstable_by_key(|e| e.trigram);
1146
1147        // Calculate where file paths section starts (after header + directory + data)
1148        let data_section_size: u64 = directory.iter().map(|e| e.compressed_size as u64).sum();
1149        let files_section_offset = HEADER_SIZE + directory_size + data_section_size as usize;
1150        pos = files_section_offset;
1151
1152        // Read file paths (varint-encoded lengths)
1153        let mut files = Vec::with_capacity(num_files);
1154        for _ in 0..num_files {
1155            // Read path length (varint)
1156            let (path_len, consumed) = read_varint(&mmap[pos..])?;
1157            pos += consumed;
1158            let path_len = path_len as usize;
1159
1160            if pos + path_len > mmap.len() {
1161                anyhow::bail!("Truncated file path at pos={}", pos);
1162            }
1163
1164            let path_bytes = &mmap[pos..pos + path_len];
1165            let path_str = std::str::from_utf8(path_bytes)
1166                .context("Invalid UTF-8 in file path")?;
1167            files.push(PathBuf::from(path_str));
1168            pos += path_len;
1169        }
1170
1171        log::info!(
1172            "Loaded lazy trigram index: {} trigrams, {} files (directory: {} KB)",
1173            num_trigrams,
1174            num_files,
1175            directory_size / 1024
1176        );
1177
1178        Ok(Self {
1179            index: Vec::new(), // Empty in lazy mode
1180            files,
1181            temp_index: None,
1182            mmap: Some(mmap), // Keep mmap alive for lazy decompression!
1183            directory,
1184            partial_indices: Vec::new(),
1185            temp_dir: None,
1186            max_posting_list_entries: 0,
1187        })
1188    }
1189}
1190
1191impl Default for TrigramIndex {
1192    fn default() -> Self {
1193        Self::new()
1194    }
1195}
1196
1197/// Extract all trigrams from text
1198///
1199/// Returns a vector of trigrams (without location info).
1200pub fn extract_trigrams(text: &str) -> Vec<Trigram> {
1201    let bytes = text.as_bytes();
1202    let mut trigrams = Vec::new();
1203
1204    for i in 0..bytes.len().saturating_sub(2) {
1205        let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1206        trigrams.push(trigram);
1207    }
1208
1209    trigrams
1210}
1211
1212/// Extract trigrams with file location information
1213///
1214/// Returns a vector of (trigram, location) pairs for building the inverted index.
1215pub fn extract_trigrams_with_locations(text: &str, file_id: u32) -> Vec<(Trigram, FileLocation)> {
1216    let bytes = text.as_bytes();
1217    let mut result = Vec::new();
1218
1219    let mut line_no = 1;
1220
1221    for (i, &byte) in bytes.iter().enumerate() {
1222        // Track newlines
1223        if byte == b'\n' {
1224            line_no += 1;
1225        }
1226
1227        // Extract trigram
1228        if i + 2 < bytes.len() {
1229            let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1230            let location = FileLocation::new(file_id, line_no, i as u32);
1231            result.push((trigram, location));
1232        }
1233    }
1234
1235    result
1236}
1237
1238/// Convert 3 bytes to a trigram (packed u32)
1239#[inline]
1240fn bytes_to_trigram(bytes: &[u8]) -> Trigram {
1241    debug_assert_eq!(bytes.len(), 3);
1242    (bytes[0] as u32) << 16 | (bytes[1] as u32) << 8 | (bytes[2] as u32)
1243}
1244
1245/// Convert trigram back to bytes (for debugging)
1246#[allow(dead_code)]
1247fn trigram_to_bytes(trigram: Trigram) -> [u8; 3] {
1248    [
1249        ((trigram >> 16) & 0xFF) as u8,
1250        ((trigram >> 8) & 0xFF) as u8,
1251        (trigram & 0xFF) as u8,
1252    ]
1253}
1254
1255/// Intersect posting lists by (file_id, line_no) pairs
1256///
1257/// Returns locations where ALL trigrams appear on the SAME line (not just in the same file).
1258/// This ensures accurate full-text matching.
1259fn intersect_by_file(lists: &[&Vec<FileLocation>]) -> Vec<FileLocation> {
1260    if lists.is_empty() {
1261        return vec![];
1262    }
1263
1264    use std::collections::HashSet;
1265
1266    // Create a set of (file_id, line_no) pairs from the first list
1267    let mut candidates: HashSet<(u32, u32)> = lists[0]
1268        .iter()
1269        .map(|loc| (loc.file_id, loc.line_no))
1270        .collect();
1271
1272    // Intersect with (file_id, line_no) pairs from other lists
1273    for &list in &lists[1..] {
1274        let list_pairs: HashSet<(u32, u32)> = list
1275            .iter()
1276            .map(|loc| (loc.file_id, loc.line_no))
1277            .collect();
1278        candidates.retain(|pair| list_pairs.contains(pair));
1279    }
1280
1281    // Convert back to FileLocation results
1282    let mut result = Vec::new();
1283    for &(file_id, line_no) in &candidates {
1284        // Find a location matching this (file_id, line_no) from the first list
1285        if let Some(loc) = lists[0]
1286            .iter()
1287            .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1288        {
1289            result.push(*loc);
1290        }
1291    }
1292
1293    result.sort_unstable();
1294    result
1295}
1296
1297/// Intersect posting lists by (file_id, line_no) pairs (owned version for lazy-loading)
1298///
1299/// Similar to intersect_by_file() but works with owned Vec<Vec<FileLocation>>
1300/// instead of references. Used in lazy-loading mode where posting lists are decompressed on-demand.
1301///
1302/// Returns locations where ALL trigrams appear on the SAME line (not just in the same file).
1303fn intersect_by_file_owned(lists: &[Vec<FileLocation>]) -> Vec<FileLocation> {
1304    if lists.is_empty() {
1305        return vec![];
1306    }
1307
1308    use std::collections::HashSet;
1309
1310    // Create a set of (file_id, line_no) pairs from the first list
1311    let mut candidates: HashSet<(u32, u32)> = lists[0]
1312        .iter()
1313        .map(|loc| (loc.file_id, loc.line_no))
1314        .collect();
1315
1316    // Intersect with (file_id, line_no) pairs from other lists
1317    for list in &lists[1..] {
1318        let list_pairs: HashSet<(u32, u32)> = list
1319            .iter()
1320            .map(|loc| (loc.file_id, loc.line_no))
1321            .collect();
1322        candidates.retain(|pair| list_pairs.contains(pair));
1323    }
1324
1325    // Convert back to FileLocation results
1326    let mut result = Vec::new();
1327    for &(file_id, line_no) in &candidates {
1328        // Find a location matching this (file_id, line_no) from the first list
1329        if let Some(loc) = lists[0]
1330            .iter()
1331            .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1332        {
1333            result.push(*loc);
1334        }
1335    }
1336
1337    result.sort_unstable();
1338    result
1339}
1340
1341#[cfg(test)]
1342mod tests {
1343    use super::*;
1344
1345    #[test]
1346    fn test_extract_trigrams() {
1347        let text = "hello";
1348        let trigrams = extract_trigrams(text);
1349
1350        // "hello" → "hel", "ell", "llo"
1351        assert_eq!(trigrams.len(), 3);
1352
1353        // Verify trigrams are unique
1354        let expected = vec![
1355            bytes_to_trigram(b"hel"),
1356            bytes_to_trigram(b"ell"),
1357            bytes_to_trigram(b"llo"),
1358        ];
1359        assert_eq!(trigrams, expected);
1360    }
1361
1362    #[test]
1363    fn test_extract_trigrams_short() {
1364        assert_eq!(extract_trigrams("ab").len(), 0);
1365        assert_eq!(extract_trigrams("abc").len(), 1);
1366    }
1367
1368    #[test]
1369    fn test_bytes_to_trigram() {
1370        let trigram1 = bytes_to_trigram(b"abc");
1371        let trigram2 = bytes_to_trigram(b"abc");
1372        let trigram3 = bytes_to_trigram(b"xyz");
1373
1374        assert_eq!(trigram1, trigram2);
1375        assert_ne!(trigram1, trigram3);
1376    }
1377
1378    #[test]
1379    fn test_trigram_roundtrip() {
1380        let original = b"foo";
1381        let trigram = bytes_to_trigram(original);
1382        let recovered = trigram_to_bytes(trigram);
1383        assert_eq!(original, &recovered);
1384    }
1385
1386    #[test]
1387    fn test_extract_with_locations() {
1388        let text = "hello\nworld";
1389        let locs = extract_trigrams_with_locations(text, 0);
1390
1391        // "hello\nworld" has 9 trigrams:
1392        // "hel", "ell", "llo", "lo\n", "o\nw", "\nwo", "wor", "orl", "rld"
1393        assert_eq!(locs.len(), 9);
1394
1395        // First trigram should be on line 1
1396        assert_eq!(locs[0].1.line_no, 1);
1397
1398        // After newline, should be line 2
1399        let world_start = text.find("world").unwrap();
1400        let world_trigram_idx = locs
1401            .iter()
1402            .position(|(_, loc)| loc.byte_offset as usize == world_start)
1403            .unwrap();
1404        assert_eq!(locs[world_trigram_idx].1.line_no, 2);
1405    }
1406
1407    #[test]
1408    fn test_trigram_index_basic() {
1409        let mut index = TrigramIndex::new();
1410
1411        let file_id = index.add_file(PathBuf::from("test.txt"));
1412        index.index_file(file_id, "hello world");
1413        index.finalize();
1414
1415        // Search for "hello"
1416        let results = index.search("hello");
1417        assert!(!results.is_empty());
1418
1419        // Search for "world"
1420        let results = index.search("world");
1421        assert!(!results.is_empty());
1422
1423        // Search for "goodbye" (not in text)
1424        let results = index.search("goodbye");
1425        assert!(results.is_empty());
1426    }
1427
1428    #[test]
1429    fn test_search_multifile() {
1430        let mut index = TrigramIndex::new();
1431
1432        let file1 = index.add_file(PathBuf::from("file1.txt"));
1433        let file2 = index.add_file(PathBuf::from("file2.txt"));
1434
1435        index.index_file(file1, "extract_symbols is here");
1436        index.index_file(file2, "extract_symbols is also here");
1437        index.finalize();
1438
1439        let results = index.search("extract_symbols");
1440        assert_eq!(results.len(), 2); // One result per file
1441
1442        // Verify we got both files
1443        let file_ids: Vec<u32> = results.iter().map(|loc| loc.file_id).collect();
1444        assert!(file_ids.contains(&file1));
1445        assert!(file_ids.contains(&file2));
1446    }
1447
1448    #[test]
1449    fn test_persistence_write() {
1450        use tempfile::TempDir;
1451
1452        let temp = TempDir::new().unwrap();
1453        let trigrams_path = temp.path().join("trigrams.bin");
1454
1455        // Build and write index
1456        let mut index = TrigramIndex::new();
1457        let file1 = index.add_file(PathBuf::from("src/main.rs"));
1458        let file2 = index.add_file(PathBuf::from("src/lib.rs"));
1459
1460        index.index_file(file1, "fn main() { println!(\"hello\"); }");
1461        index.index_file(file2, "pub fn hello() -> String { String::from(\"hello\") }");
1462        index.finalize();
1463
1464        // Write to disk
1465        index.write(&trigrams_path).unwrap();
1466
1467        // Verify file was created
1468        assert!(trigrams_path.exists());
1469
1470        // Verify file has content (header + data)
1471        let metadata = std::fs::metadata(&trigrams_path).unwrap();
1472        assert!(metadata.len() > HEADER_SIZE as u64);
1473
1474        // Verify we can read the header back
1475        use std::io::Read;
1476        let mut file = File::open(&trigrams_path).unwrap();
1477        let mut magic = [0u8; 4];
1478        file.read_exact(&mut magic).unwrap();
1479        assert_eq!(&magic, MAGIC);
1480
1481        // Note: Full roundtrip test verifies write works correctly.
1482        // Load verification is tested in production via query performance tests.
1483    }
1484
1485    #[test]
1486    fn test_posting_list_cap_enforced() {
1487        let cap: usize = 10;
1488        let content = "aaa ".repeat(200);
1489        let mut index = TrigramIndex::new();
1490        index.set_max_posting_list_entries(cap);
1491        let file_id = index.add_file(PathBuf::from("dense.txt"));
1492        index.index_file(file_id, &content);
1493        index.finalize();
1494        let aaa = bytes_to_trigram(b"aaa");
1495        let list = index.get_posting_list(aaa).expect("aaa trigram should exist");
1496        assert!(list.len() <= cap, "cap exceeded: {} > {}", list.len(), cap);
1497    }
1498
1499    #[test]
1500    fn test_posting_list_cap_zero_means_unlimited() {
1501        let repetitions = 50;
1502        let content = "aaa ".repeat(repetitions);
1503        let mut index = TrigramIndex::new();
1504        index.set_max_posting_list_entries(0);
1505        let file_id = index.add_file(PathBuf::from("dense.txt"));
1506        index.index_file(file_id, &content);
1507        index.finalize();
1508        let aaa = bytes_to_trigram(b"aaa");
1509        let list = index.get_posting_list(aaa).expect("aaa trigram should exist");
1510        assert!(list.len() >= repetitions, "expected >= {} entries, got {}", repetitions, list.len());
1511    }
1512}