Skip to main content

reflex/
trigram.rs

1//! Trigram-based inverted index for fast full-text code search
2//!
3//! This module implements the core trigram indexing algorithm used by Reflex.
4//! A trigram is a sequence of 3 consecutive bytes. By building an inverted index
5//! mapping trigrams to file locations, we can quickly narrow down search candidates
6//! and achieve sub-100ms query times even on large codebases.
7//!
8//! # Algorithm
9//!
10//! 1. **Indexing**: Extract all trigrams from each file, store locations
11//! 2. **Querying**: Extract trigrams from query, intersect posting lists
12//! 3. **Verification**: Check actual matches at candidate locations
13//!
14//! See `.context/TRIGRAM_RESEARCH.md` for detailed algorithm documentation.
15
16use anyhow::{Context, Result};
17use std::collections::HashMap;
18use std::fs::{File, OpenOptions};
19use std::io::Write;
20use std::path::{Path, PathBuf};
21
22/// A trigram is 3 consecutive bytes, packed into a u32 for efficient hashing
23pub type Trigram = u32;
24
25// Binary format constants for trigrams.bin
26const MAGIC: &[u8; 4] = b"RFTG"; // ReFlex TriGrams
27const VERSION: u32 = 3; // V3: No filtering, lazy loading with directory + data separation
28// Header: magic(4) + version(4) + num_trigrams(8) + num_files(8) = 24 bytes
29#[allow(dead_code)]
30const HEADER_SIZE: usize = 24;
31
32/// Write a u32 as a varint (variable-length integer)
33/// Uses 1-5 bytes depending on magnitude (smaller numbers = fewer bytes)
34fn write_varint(writer: &mut impl Write, mut value: u32) -> std::io::Result<()> {
35    loop {
36        let mut byte = (value & 0x7F) as u8;
37        value >>= 7;
38        if value != 0 {
39            byte |= 0x80; // Set continuation bit
40        }
41        writer.write_all(&[byte])?;
42        if value == 0 {
43            break;
44        }
45    }
46    Ok(())
47}
48
49/// Read a varint from a byte slice, returns (value, bytes_consumed)
50fn read_varint(data: &[u8]) -> Result<(u32, usize)> {
51    let mut value: u32 = 0;
52    let mut shift = 0;
53    let mut pos = 0;
54
55    loop {
56        if pos >= data.len() {
57            anyhow::bail!("Truncated varint");
58        }
59        let byte = data[pos];
60        pos += 1;
61
62        value |= ((byte & 0x7F) as u32) << shift;
63        if byte & 0x80 == 0 {
64            break;
65        }
66        shift += 7;
67        if shift >= 32 {
68            anyhow::bail!("Varint too large");
69        }
70    }
71
72    Ok((value, pos))
73}
74
75/// Decompress a posting list from memory-mapped data
76///
77/// Reads a compressed posting list (delta+varint encoded) from the given offset
78/// and decompresses it into a Vec<FileLocation>.
79///
80/// # Arguments
81/// * `mmap` - Memory-mapped file data
82/// * `offset` - Absolute byte offset where compressed data starts
83/// * `size` - Number of bytes to read
84fn decompress_posting_list(mmap: &[u8], offset: u64, size: u32) -> Result<Vec<FileLocation>> {
85    let start = offset as usize;
86    let end = start + size as usize;
87
88    if end > mmap.len() {
89        anyhow::bail!(
90            "Posting list out of bounds: offset={}, size={}, mmap_len={}",
91            offset,
92            size,
93            mmap.len()
94        );
95    }
96
97    let compressed_data = &mmap[start..end];
98
99    // Decompress delta-encoded posting list
100    let mut locations = Vec::new();
101    let mut pos = 0;
102    let mut prev_file_id = 0u32;
103    let mut prev_line_no = 0u32;
104    let mut prev_byte_offset = 0u32;
105
106    while pos < compressed_data.len() {
107        // Read file_id delta
108        let (file_id_delta, consumed) = read_varint(&compressed_data[pos..])?;
109        pos += consumed;
110
111        // Read line_no delta
112        let (line_no_delta, consumed) = read_varint(&compressed_data[pos..])?;
113        pos += consumed;
114
115        // Read byte_offset delta
116        let (byte_offset_delta, consumed) = read_varint(&compressed_data[pos..])?;
117        pos += consumed;
118
119        // Reconstruct absolute values from deltas
120        let file_id = prev_file_id.wrapping_add(file_id_delta);
121        let line_no = prev_line_no.wrapping_add(line_no_delta);
122        let byte_offset = prev_byte_offset.wrapping_add(byte_offset_delta);
123
124        locations.push(FileLocation {
125            file_id,
126            line_no,
127            byte_offset,
128        });
129
130        // Update previous values for next delta
131        prev_file_id = file_id;
132        prev_line_no = line_no;
133        prev_byte_offset = byte_offset;
134    }
135
136    Ok(locations)
137}
138
139/// Location of a trigram occurrence in the codebase
140#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
141pub struct FileLocation {
142    /// File ID (index into file list)
143    pub file_id: u32,
144    /// Line number (1-indexed)
145    pub line_no: u32,
146    /// Byte offset in file (for context extraction)
147    pub byte_offset: u32,
148}
149
150impl FileLocation {
151    pub fn new(file_id: u32, line_no: u32, byte_offset: u32) -> Self {
152        Self {
153            file_id,
154            line_no,
155            byte_offset,
156        }
157    }
158}
159
160/// Directory entry for lazy-loaded trigram index
161///
162/// Maps each trigram to its compressed posting list location in the data section.
163/// Total size: 16 bytes per entry (4 + 8 + 4)
164#[derive(Debug, Clone)]
165struct DirectoryEntry {
166    /// The trigram value (for binary search)
167    trigram: Trigram,
168    /// Absolute byte offset in the file where compressed data starts
169    data_offset: u64,
170    /// Size of compressed posting list in bytes
171    compressed_size: u32,
172}
173
174/// Trigram-based inverted index
175///
176/// Maps each trigram to a sorted list of locations where it appears.
177/// Posting lists are kept sorted by (file_id, line_no) for efficient intersection.
178/// The index itself is kept sorted by trigram for O(log n) binary search.
179///
180/// Supports three modes:
181/// 1. **In-memory mode** (during indexing): All posting lists in RAM
182/// 2. **Batch-flush mode** (large codebases): Periodically flushes partial indices to disk to limit RAM
183/// 3. **Lazy-loaded mode** (after loading): Compressed posting lists in mmap, decompressed on-demand
184pub struct TrigramIndex {
185    /// Inverted index: sorted Vec of (trigram, locations) for binary search
186    /// Used in in-memory mode (during indexing)
187    index: Vec<(Trigram, Vec<FileLocation>)>,
188    /// File ID to file path mapping
189    files: Vec<PathBuf>,
190    /// Temporary HashMap used during batch indexing (None when finalized)
191    temp_index: Option<HashMap<Trigram, Vec<FileLocation>>>,
192    /// Memory-mapped index file (for lazy loading)
193    mmap: Option<memmap2::Mmap>,
194    /// Directory of (trigram, offset, size) for lazy loading
195    directory: Vec<DirectoryEntry>,
196    /// Partial index files created during batch flushing (for k-way merge at finalize)
197    partial_indices: Vec<PathBuf>,
198    /// Temporary directory for partial indices
199    temp_dir: Option<PathBuf>,
200    /// Cap on posting list size; 0 = unlimited. Enforced at finalize time.
201    /// Bounds query latency for high-frequency trigrams (trigram-density lens).
202    max_posting_list_entries: usize,
203}
204
205impl TrigramIndex {
206    /// Create a new empty trigram index
207    pub fn new() -> Self {
208        Self {
209            index: Vec::new(),
210            files: Vec::new(),
211            temp_index: Some(HashMap::new()),
212            mmap: None,
213            directory: Vec::new(),
214            partial_indices: Vec::new(),
215            temp_dir: None,
216            max_posting_list_entries: 0,
217        }
218    }
219
220    /// Set maximum posting list entries per trigram (0 = unlimited).
221    pub fn set_max_posting_list_entries(&mut self, cap: usize) {
222        self.max_posting_list_entries = cap;
223    }
224
225    /// Enable batch-flush mode for large codebases
226    ///
227    /// Creates a temporary directory for partial indices that will be merged at finalize().
228    /// Call this before indexing to enable memory-efficient indexing for huge codebases.
229    pub fn enable_batch_flush(&mut self, temp_dir: PathBuf) -> Result<()> {
230        std::fs::create_dir_all(&temp_dir)
231            .context("Failed to create temp directory for batch flushing")?;
232        self.temp_dir = Some(temp_dir);
233        log::info!("Enabled batch-flush mode for trigram index");
234        Ok(())
235    }
236
237    /// Flush current temp_index to a partial index file
238    ///
239    /// This clears the in-memory HashMap and writes a sorted partial index to disk.
240    /// Called periodically during indexing to limit memory usage.
241    pub fn flush_batch(&mut self) -> Result<()> {
242        let temp_dir = self.temp_dir.as_ref().ok_or_else(|| {
243            anyhow::anyhow!("Batch flush not enabled - call enable_batch_flush() first")
244        })?;
245
246        // Take ownership of temp_index to finalize it
247        let temp_map = self
248            .temp_index
249            .take()
250            .ok_or_else(|| anyhow::anyhow!("No temp index to flush"))?;
251
252        if temp_map.is_empty() {
253            // Nothing to flush, restore empty map
254            self.temp_index = Some(HashMap::new());
255            return Ok(());
256        }
257
258        // Convert HashMap to sorted Vec
259        let mut partial_index: Vec<(Trigram, Vec<FileLocation>)> = temp_map.into_iter().collect();
260
261        // Sort and deduplicate posting lists
262        for (_, list) in partial_index.iter_mut() {
263            list.sort_unstable();
264            list.dedup();
265        }
266
267        // Sort by trigram
268        partial_index.sort_unstable_by_key(|(trigram, _)| *trigram);
269
270        // Write to temp file
271        let partial_file = temp_dir.join(format!("partial_{}.bin", self.partial_indices.len()));
272        self.write_partial_index(&partial_file, &partial_index)?;
273
274        self.partial_indices.push(partial_file);
275
276        // Create new empty temp_index for next batch
277        self.temp_index = Some(HashMap::new());
278
279        log::debug!(
280            "Flushed batch {} with {} trigrams to disk",
281            self.partial_indices.len(),
282            partial_index.len()
283        );
284
285        Ok(())
286    }
287
288    /// Write a partial index to disk (simplified format for merging)
289    fn write_partial_index(
290        &self,
291        path: &Path,
292        index: &[(Trigram, Vec<FileLocation>)],
293    ) -> Result<()> {
294        use std::io::BufWriter;
295
296        let file = OpenOptions::new()
297            .create(true)
298            .write(true)
299            .truncate(true)
300            .open(path)?;
301
302        let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
303
304        // Write number of trigrams
305        writer.write_all(&(index.len() as u64).to_le_bytes())?;
306
307        // Write each (trigram, posting_list)
308        for (trigram, locations) in index {
309            writer.write_all(&trigram.to_le_bytes())?;
310            writer.write_all(&(locations.len() as u32).to_le_bytes())?;
311
312            for loc in locations {
313                writer.write_all(&loc.file_id.to_le_bytes())?;
314                writer.write_all(&loc.line_no.to_le_bytes())?;
315                writer.write_all(&loc.byte_offset.to_le_bytes())?;
316            }
317        }
318
319        writer.flush()?;
320        Ok(())
321    }
322
323    /// Add a file to the index and return its file_id
324    pub fn add_file(&mut self, path: PathBuf) -> u32 {
325        let file_id = self.files.len() as u32;
326        self.files.push(path);
327        file_id
328    }
329
330    /// Get file path for a file_id
331    pub fn get_file(&self, file_id: u32) -> Option<&PathBuf> {
332        self.files.get(file_id as usize)
333    }
334
335    /// Get total number of files
336    pub fn file_count(&self) -> usize {
337        self.files.len()
338    }
339
340    /// Get total number of unique trigrams
341    pub fn trigram_count(&self) -> usize {
342        if !self.directory.is_empty() {
343            // Lazy-loaded mode
344            self.directory.len()
345        } else {
346            // In-memory mode
347            self.index.len()
348        }
349    }
350
351    /// Index a file's content
352    ///
353    /// Extracts all trigrams from the content and adds them to the inverted index.
354    /// Must call finalize() after indexing all files to prepare for searching.
355    pub fn index_file(&mut self, file_id: u32, content: &str) {
356        let trigrams = extract_trigrams_with_locations(content, file_id);
357
358        // Use the persistent HashMap for O(1) updates during batch processing
359        if let Some(ref mut temp_map) = self.temp_index {
360            for (trigram, location) in trigrams {
361                temp_map
362                    .entry(trigram)
363                    .or_insert_with(Vec::new)
364                    .push(location);
365            }
366        } else {
367            panic!("Cannot call index_file() after finalize(). Index is read-only.");
368        }
369    }
370
371    /// Build index from a collection of pre-extracted trigrams (bulk operation)
372    ///
373    /// This is much more efficient than calling index_file() multiple times,
374    /// as it builds the HashMap once instead of rebuilding it for each file.
375    pub fn build_from_trigrams(&mut self, trigrams: Vec<(Trigram, FileLocation)>) {
376        let mut temp_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
377
378        // Group trigrams into posting lists
379        for (trigram, location) in trigrams {
380            temp_map
381                .entry(trigram)
382                .or_insert_with(Vec::new)
383                .push(location);
384        }
385
386        // Convert to sorted Vec for binary search
387        self.index = temp_map.into_iter().collect();
388
389        // Clear temp_index since we're using the Vec directly
390        self.temp_index = None;
391
392        // Finalize immediately (sort and deduplicate)
393        self.finalize();
394    }
395
396    /// Finalize the index by sorting all posting lists and the index itself
397    ///
398    /// Must be called after all files are indexed, before querying.
399    /// Converts the HashMap to a sorted Vec for fast binary search.
400    ///
401    /// If batch flushing was enabled, finalization will be deferred until write()
402    /// is called, which will perform streaming merge directly to disk.
403    pub fn finalize(&mut self) {
404        // If we have partial indices from batch flushing, DON'T merge yet
405        // We'll do streaming merge in write() or write_with_streaming_merge()
406        if !self.partial_indices.is_empty() {
407            log::info!(
408                "Deferring finalization - will stream merge {} partial indices during write()",
409                self.partial_indices.len()
410            );
411
412            // Flush final batch if temp_index is not empty
413            if let Some(ref temp_map) = self.temp_index {
414                if !temp_map.is_empty() {
415                    self.flush_batch().expect("Failed to flush final batch");
416                }
417            }
418
419            // Don't merge yet - write() will handle it
420            return;
421        }
422
423        // Standard finalization (no batch flushing)
424        // Convert HashMap to Vec if we have a temp index
425        if let Some(temp_map) = self.temp_index.take() {
426            self.index = temp_map.into_iter().collect();
427        }
428
429        // Sort, deduplicate, and cap posting lists
430        let cap = self.max_posting_list_entries;
431        for (trigram, list) in self.index.iter_mut() {
432            list.sort_unstable();
433            list.dedup(); // Remove duplicates (same trigram appearing multiple times on same line)
434            if cap > 0 && list.len() > cap {
435                log::warn!(
436                    "Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.",
437                    trigram,
438                    list.len(),
439                    cap
440                );
441                list.truncate(cap);
442            }
443        }
444
445        // Sort the index by trigram for binary search
446        self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
447    }
448
449    /// Merge all partial indices directly to trigrams.bin using streaming k-way merge
450    ///
451    /// This avoids loading the entire index into RAM by:
452    /// 1. Opening all partial index files as readers
453    /// 2. Performing k-way merge using a priority queue
454    /// 3. Writing compressed posting lists directly to disk
455    /// 4. Never accumulating more than K posting lists in memory at once
456    fn merge_partial_indices_to_file(&mut self, output_path: &Path) -> Result<()> {
457        use std::cmp::Ordering;
458        use std::collections::BinaryHeap;
459        use std::io::{BufReader, BufWriter, Read};
460
461        log::info!(
462            "Streaming merge of {} partial indices to {:?}",
463            self.partial_indices.len(),
464            output_path
465        );
466
467        // Open all partial indices as buffered readers
468        struct PartialIndexReader {
469            reader: BufReader<File>,
470            current_trigram: Option<Trigram>,
471            current_posting_list: Vec<FileLocation>,
472            reader_id: usize,
473        }
474
475        let mut readers: Vec<PartialIndexReader> = Vec::new();
476
477        for (idx, partial_path) in self.partial_indices.iter().enumerate() {
478            let file = File::open(partial_path)
479                .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
480            let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
481
482            // Read number of trigrams (we don't need it for streaming merge)
483            let mut buf = [0u8; 8];
484            reader.read_exact(&mut buf)?;
485
486            readers.push(PartialIndexReader {
487                reader,
488                current_trigram: None,
489                current_posting_list: Vec::new(),
490                reader_id: idx,
491            });
492        }
493
494        // Helper to read next trigram from a reader
495        fn read_next_trigram(reader: &mut PartialIndexReader) -> Result<bool> {
496            // Try to read trigram
497            let mut trigram_buf = [0u8; 4];
498            match reader.reader.read_exact(&mut trigram_buf) {
499                Ok(_) => {
500                    let trigram = u32::from_le_bytes(trigram_buf);
501
502                    // Read posting list size
503                    let mut len_buf = [0u8; 4];
504                    reader.reader.read_exact(&mut len_buf)?;
505                    let list_len = u32::from_le_bytes(len_buf) as usize;
506
507                    // Read all locations for this trigram
508                    let mut locations = Vec::with_capacity(list_len);
509                    for _ in 0..list_len {
510                        let mut loc_buf = [0u8; 12];
511                        reader.reader.read_exact(&mut loc_buf)?;
512
513                        let file_id =
514                            u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
515                        let line_no =
516                            u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
517                        let byte_offset =
518                            u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
519
520                        locations.push(FileLocation {
521                            file_id,
522                            line_no,
523                            byte_offset,
524                        });
525                    }
526
527                    reader.current_trigram = Some(trigram);
528                    reader.current_posting_list = locations;
529                    Ok(true)
530                }
531                Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
532                    reader.current_trigram = None;
533                    Ok(false)
534                }
535                Err(e) => Err(e.into()),
536            }
537        }
538
539        // Initialize: read first trigram from each reader
540        for reader in &mut readers {
541            read_next_trigram(reader)?;
542        }
543
544        // Priority queue entry for k-way merge
545        #[derive(Eq, PartialEq)]
546        struct HeapEntry {
547            trigram: Trigram,
548            reader_id: usize,
549        }
550
551        impl Ord for HeapEntry {
552            fn cmp(&self, other: &Self) -> Ordering {
553                // Reverse for min-heap
554                other
555                    .trigram
556                    .cmp(&self.trigram)
557                    .then_with(|| other.reader_id.cmp(&self.reader_id))
558            }
559        }
560
561        impl PartialOrd for HeapEntry {
562            fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
563                Some(self.cmp(other))
564            }
565        }
566
567        // Build initial heap
568        let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::new();
569        for reader in &readers {
570            if let Some(trigram) = reader.current_trigram {
571                heap.push(HeapEntry {
572                    trigram,
573                    reader_id: reader.reader_id,
574                });
575            }
576        }
577
578        // Open output file for writing
579        let file = OpenOptions::new()
580            .create(true)
581            .write(true)
582            .truncate(true)
583            .open(output_path)
584            .with_context(|| format!("Failed to create {}", output_path.display()))?;
585
586        let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
587
588        // Write placeholder header (we'll update it at the end)
589        writer.write_all(MAGIC)?;
590        writer.write_all(&VERSION.to_le_bytes())?;
591        writer.write_all(&0u64.to_le_bytes())?; // num_trigrams (placeholder)
592        writer.write_all(&(self.files.len() as u64).to_le_bytes())?; // num_files
593
594        // We'll build the directory as we go
595        let mut directory: Vec<DirectoryEntry> = Vec::new();
596        let mut num_trigrams = 0u64;
597
598        // K-way merge loop
599        let mut current_trigram: Option<Trigram> = None;
600        let mut merged_locations: Vec<FileLocation> = Vec::new();
601
602        while let Some(entry) = heap.pop() {
603            let reader = &mut readers[entry.reader_id];
604
605            // If this is a new trigram, write the previous one
606            if current_trigram.is_some() && current_trigram != Some(entry.trigram) {
607                // Write the accumulated posting list for current_trigram
608                let trigram = current_trigram.unwrap();
609                merged_locations.sort_unstable();
610                merged_locations.dedup();
611
612                let cap = self.max_posting_list_entries;
613                if cap > 0 && merged_locations.len() > cap {
614                    log::warn!(
615                        "Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.",
616                        trigram,
617                        merged_locations.len(),
618                        cap
619                    );
620                    merged_locations.truncate(cap);
621                }
622
623                // Compress and write this trigram's posting list
624                let data_offset = writer.stream_position()?;
625                let compressed_size =
626                    self.write_compressed_posting_list(&mut writer, &merged_locations)?;
627
628                directory.push(DirectoryEntry {
629                    trigram,
630                    data_offset,
631                    compressed_size,
632                });
633
634                num_trigrams += 1;
635                merged_locations.clear();
636            }
637
638            // Set current trigram
639            current_trigram = Some(entry.trigram);
640
641            // Merge this reader's posting list into accumulated list
642            merged_locations.extend_from_slice(&reader.current_posting_list);
643
644            // Advance this reader to next trigram
645            if read_next_trigram(reader)? {
646                if let Some(next_trigram) = reader.current_trigram {
647                    heap.push(HeapEntry {
648                        trigram: next_trigram,
649                        reader_id: entry.reader_id,
650                    });
651                }
652            }
653        }
654
655        // Write final trigram
656        if let Some(trigram) = current_trigram {
657            merged_locations.sort_unstable();
658            merged_locations.dedup();
659
660            let cap = self.max_posting_list_entries;
661            if cap > 0 && merged_locations.len() > cap {
662                log::warn!(
663                    "Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.",
664                    trigram,
665                    merged_locations.len(),
666                    cap
667                );
668                merged_locations.truncate(cap);
669            }
670
671            let data_offset = writer.stream_position()?;
672            let compressed_size =
673                self.write_compressed_posting_list(&mut writer, &merged_locations)?;
674
675            directory.push(DirectoryEntry {
676                trigram,
677                data_offset,
678                compressed_size,
679            });
680
681            num_trigrams += 1;
682        }
683
684        log::info!(
685            "Merged {} trigrams from {} partial indices",
686            num_trigrams,
687            self.partial_indices.len()
688        );
689
690        // Remember where data section ended (not used but kept for clarity)
691        let _data_end_pos = writer.stream_position()?;
692
693        // Write file paths after data section
694        for file_path in &self.files {
695            let path_str = file_path.to_string_lossy();
696            let path_bytes = path_str.as_bytes();
697            write_varint(&mut writer, path_bytes.len() as u32)?;
698            writer.write_all(path_bytes)?;
699        }
700
701        // Flush before we rewrite the beginning
702        writer.flush()?;
703        drop(writer);
704
705        // Now we need to insert the directory at the beginning
706        // We'll read the data+files we just wrote, then rewrite the file with directory in between
707        use std::io::{Seek, SeekFrom};
708
709        // Read data and files sections
710        let mut temp_data = Vec::new();
711        {
712            let mut file = File::open(output_path)?;
713            file.seek(SeekFrom::Start(HEADER_SIZE as u64))?;
714            file.read_to_end(&mut temp_data)?;
715        }
716
717        // Rewrite file with correct structure
718        let file = OpenOptions::new()
719            .write(true)
720            .truncate(true)
721            .open(output_path)?;
722        let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
723
724        // Write header with correct num_trigrams
725        writer.write_all(MAGIC)?;
726        writer.write_all(&VERSION.to_le_bytes())?;
727        writer.write_all(&num_trigrams.to_le_bytes())?;
728        writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
729
730        // Write directory
731        for entry in &directory {
732            writer.write_all(&entry.trigram.to_le_bytes())?;
733            // Adjust data offset to account for directory size
734            let adjusted_offset = entry.data_offset + (directory.len() * 16) as u64;
735            writer.write_all(&adjusted_offset.to_le_bytes())?;
736            writer.write_all(&entry.compressed_size.to_le_bytes())?;
737        }
738
739        // Write data and files sections
740        writer.write_all(&temp_data)?;
741
742        // Flush and sync
743        writer.flush()?;
744        writer.get_ref().sync_all()?;
745
746        // Clean up partial index files
747        for partial_path in &self.partial_indices {
748            let _ = std::fs::remove_file(partial_path);
749        }
750        if let Some(ref temp_dir) = self.temp_dir {
751            let _ = std::fs::remove_dir(temp_dir);
752        }
753
754        log::info!("Wrote {} trigrams to {:?}", num_trigrams, output_path);
755
756        Ok(())
757    }
758
759    /// Write a compressed posting list to the writer and return the compressed size
760    fn write_compressed_posting_list(
761        &self,
762        writer: &mut impl Write,
763        locations: &[FileLocation],
764    ) -> Result<u32> {
765        let mut compressed = Vec::new();
766
767        // Compress posting list using delta+varint encoding
768        let mut prev_file_id = 0u32;
769        let mut prev_line_no = 0u32;
770        let mut prev_byte_offset = 0u32;
771
772        for loc in locations {
773            // Compute deltas
774            let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
775            let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
776            let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
777
778            // Write deltas as varints
779            write_varint(&mut compressed, file_id_delta)?;
780            write_varint(&mut compressed, line_no_delta)?;
781            write_varint(&mut compressed, byte_offset_delta)?;
782
783            // Update previous values
784            prev_file_id = loc.file_id;
785            prev_line_no = loc.line_no;
786            prev_byte_offset = loc.byte_offset;
787        }
788
789        let compressed_size = compressed.len() as u32;
790        writer.write_all(&compressed)?;
791
792        Ok(compressed_size)
793    }
794
795    /// Merge all partial indices into self.index (old in-memory approach - deprecated)
796    #[allow(dead_code)]
797    fn merge_partial_indices(&mut self) -> Result<()> {
798        use std::io::{BufReader, Read};
799
800        // Read all partial indices into memory (simplified approach for now)
801        let mut all_entries: Vec<(Trigram, FileLocation)> = Vec::new();
802
803        for partial_path in &self.partial_indices {
804            let file = File::open(partial_path)
805                .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
806            let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
807
808            // Read number of trigrams
809            let mut buf = [0u8; 8];
810            reader.read_exact(&mut buf)?;
811            let num_trigrams = u64::from_le_bytes(buf) as usize;
812
813            // Read each (trigram, posting_list)
814            for _ in 0..num_trigrams {
815                // Read trigram
816                let mut trigram_buf = [0u8; 4];
817                reader.read_exact(&mut trigram_buf)?;
818                let trigram = u32::from_le_bytes(trigram_buf);
819
820                // Read posting list size
821                let mut len_buf = [0u8; 4];
822                reader.read_exact(&mut len_buf)?;
823                let list_len = u32::from_le_bytes(len_buf) as usize;
824
825                // Read all locations
826                for _ in 0..list_len {
827                    let mut loc_buf = [0u8; 12]; // 3 * u32
828                    reader.read_exact(&mut loc_buf)?;
829
830                    let file_id =
831                        u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
832                    let line_no =
833                        u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
834                    let byte_offset =
835                        u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
836
837                    all_entries.push((
838                        trigram,
839                        FileLocation {
840                            file_id,
841                            line_no,
842                            byte_offset,
843                        },
844                    ));
845                }
846            }
847        }
848
849        log::info!(
850            "Read {} total trigram entries from {} partial indices",
851            all_entries.len(),
852            self.partial_indices.len()
853        );
854
855        // Group by trigram
856        let mut index_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
857        for (trigram, location) in all_entries {
858            index_map
859                .entry(trigram)
860                .or_insert_with(Vec::new)
861                .push(location);
862        }
863
864        // Convert to sorted vec
865        self.index = index_map.into_iter().collect();
866
867        // Sort and deduplicate posting lists
868        for (_, list) in self.index.iter_mut() {
869            list.sort_unstable();
870            list.dedup();
871        }
872
873        // Sort by trigram
874        self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
875
876        // Clean up partial index files
877        for partial_path in &self.partial_indices {
878            let _ = std::fs::remove_file(partial_path);
879        }
880        if let Some(ref temp_dir) = self.temp_dir {
881            let _ = std::fs::remove_dir(temp_dir);
882        }
883
884        log::info!("Merged into final index with {} trigrams", self.index.len());
885
886        Ok(())
887    }
888
889    /// Search for a plain text pattern
890    ///
891    /// Returns candidate file locations that could contain the pattern.
892    /// Caller must verify actual matches.
893    ///
894    /// In lazy-loaded mode: Decompresses posting lists on-demand from mmap.
895    /// In in-memory mode: Uses pre-loaded posting lists.
896    pub fn search(&self, pattern: &str) -> Vec<FileLocation> {
897        if pattern.len() < 3 {
898            // Pattern too short for trigrams - caller must fall back to full scan
899            return vec![];
900        }
901
902        let trigrams = extract_trigrams(pattern);
903        if trigrams.is_empty() {
904            return vec![];
905        }
906
907        // Check if we're in lazy-loaded mode or in-memory mode
908        if let Some(ref mmap) = self.mmap {
909            // Lazy-loaded mode: decompress posting lists on-demand
910            let mut posting_lists: Vec<Vec<FileLocation>> = Vec::new();
911
912            for trigram in &trigrams {
913                // Binary search directory for this trigram
914                match self.directory.binary_search_by_key(trigram, |e| e.trigram) {
915                    Ok(idx) => {
916                        let entry = &self.directory[idx];
917                        // Decompress this posting list on-demand
918                        match decompress_posting_list(
919                            mmap,
920                            entry.data_offset,
921                            entry.compressed_size,
922                        ) {
923                            Ok(locations) => posting_lists.push(locations),
924                            Err(e) => {
925                                log::warn!(
926                                    "Failed to decompress posting list for trigram {}: {}",
927                                    trigram,
928                                    e
929                                );
930                                return vec![];
931                            }
932                        }
933                    }
934                    Err(_) => {
935                        // Trigram not found - pattern cannot match
936                        return vec![];
937                    }
938                }
939            }
940
941            if posting_lists.is_empty() || posting_lists.len() < trigrams.len() {
942                return vec![];
943            }
944
945            // Sort by list size (smallest first for efficient intersection)
946            posting_lists.sort_by_key(|list| list.len());
947
948            // Intersect posting lists (owned version)
949            intersect_by_file_owned(&posting_lists)
950        } else {
951            // In-memory mode: use pre-loaded index
952            let mut posting_lists: Vec<&Vec<FileLocation>> = trigrams
953                .iter()
954                .filter_map(|t| {
955                    self.index
956                        .binary_search_by_key(t, |(trigram, _)| *trigram)
957                        .ok()
958                        .map(|idx| &self.index[idx].1)
959                })
960                .collect();
961
962            if posting_lists.is_empty() {
963                return vec![];
964            }
965
966            if posting_lists.len() < trigrams.len() {
967                // Some trigrams missing - pattern cannot match
968                return vec![];
969            }
970
971            // Sort by list size (smallest first for efficient intersection)
972            posting_lists.sort_by_key(|list| list.len());
973
974            // Intersect posting lists (reference version)
975            intersect_by_file(&posting_lists)
976        }
977    }
978
979    /// Get posting list for a specific trigram (for debugging)
980    pub fn get_posting_list(&self, trigram: Trigram) -> Option<&Vec<FileLocation>> {
981        self.index
982            .binary_search_by_key(&trigram, |(t, _)| *t)
983            .ok()
984            .map(|idx| &self.index[idx].1)
985    }
986
987    /// Write the trigram index to disk
988    ///
989    /// Binary format V3 (lazy-loadable with directory + data separation):
990    /// - Header (24 bytes): magic, version, num_trigrams, num_files
991    /// - Directory Section (16 bytes per trigram):
992    ///   - trigram: u32 (4 bytes)
993    ///   - data_offset: u64 (8 bytes) - absolute offset in file
994    ///   - compressed_size: u32 (4 bytes) - size of compressed posting list
995    /// - Data Section (variable size):
996    ///   - Compressed posting lists (delta+varint encoded)
997    /// - File Paths Section (variable size):
998    ///   - path_len: varint
999    ///   - path_bytes: [u8; path_len]
1000    pub fn write(&mut self, path: impl AsRef<Path>) -> Result<()> {
1001        let path = path.as_ref();
1002
1003        // If we have partial indices from batch flushing, use streaming merge
1004        if !self.partial_indices.is_empty() {
1005            log::info!(
1006                "Using streaming merge to write {} partial indices",
1007                self.partial_indices.len()
1008            );
1009            return self.merge_partial_indices_to_file(path);
1010        }
1011
1012        // Standard write path (no batch flushing).
1013        // Non-atomic: writes directly to the target path with truncate(true).
1014        // On disk-full mid-write the file is left corrupt; the indexer fast-path
1015        // validates the "RFTG" magic bytes on re-index, forcing a clean rebuild.
1016        let file = OpenOptions::new()
1017            .create(true)
1018            .write(true)
1019            .truncate(true)
1020            .open(path)
1021            .with_context(|| format!("Failed to create {}", path.display()))?;
1022
1023        // Use a large buffer (16MB) for streaming writes
1024        let mut writer = std::io::BufWriter::with_capacity(16 * 1024 * 1024, file);
1025
1026        // Write header
1027        writer.write_all(MAGIC)?;
1028        writer.write_all(&VERSION.to_le_bytes())?;
1029        writer.write_all(&(self.index.len() as u64).to_le_bytes())?; // num_trigrams
1030        writer.write_all(&(self.files.len() as u64).to_le_bytes())?; // num_files
1031
1032        // Build directory and write compressed data in a single pass
1033        let mut directory: Vec<DirectoryEntry> = Vec::with_capacity(self.index.len());
1034
1035        // Calculate directory start and size
1036        let directory_start = HEADER_SIZE as u64;
1037        let directory_size = self.index.len() * 16;
1038
1039        // Reserve space for directory (we'll write it after data)
1040        let data_start = directory_start + directory_size as u64;
1041        let mut current_offset = data_start;
1042
1043        // We need to write in the correct order: header, directory, data, file paths
1044        // But we need data offsets to write directory
1045        // So we compress data first, then write header+directory+data
1046
1047        // Step 1: Compress all posting lists and track offsets
1048        let mut compressed_lists: Vec<(Trigram, Vec<u8>)> = Vec::with_capacity(self.index.len());
1049
1050        for (trigram, locations) in &self.index {
1051            // Compress the posting list
1052            let mut compressed = Vec::new();
1053            let mut prev_file_id = 0u32;
1054            let mut prev_line_no = 0u32;
1055            let mut prev_byte_offset = 0u32;
1056
1057            for loc in locations {
1058                let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
1059                let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
1060                let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
1061
1062                write_varint(&mut compressed, file_id_delta)?;
1063                write_varint(&mut compressed, line_no_delta)?;
1064                write_varint(&mut compressed, byte_offset_delta)?;
1065
1066                prev_file_id = loc.file_id;
1067                prev_line_no = loc.line_no;
1068                prev_byte_offset = loc.byte_offset;
1069            }
1070
1071            directory.push(DirectoryEntry {
1072                trigram: *trigram,
1073                data_offset: current_offset,
1074                compressed_size: compressed.len() as u32,
1075            });
1076            current_offset += compressed.len() as u64;
1077
1078            compressed_lists.push((*trigram, compressed));
1079        }
1080
1081        // Step 2: Write directory
1082        for entry in &directory {
1083            writer.write_all(&entry.trigram.to_le_bytes())?;
1084            writer.write_all(&entry.data_offset.to_le_bytes())?;
1085            writer.write_all(&entry.compressed_size.to_le_bytes())?;
1086        }
1087
1088        // Step 3: Write data section (compressed posting lists)
1089        for (_, compressed) in &compressed_lists {
1090            writer.write_all(compressed)?;
1091        }
1092
1093        // Step 4: Write file paths
1094        for file_path in &self.files {
1095            let path_str = file_path.to_string_lossy();
1096            let path_bytes = path_str.as_bytes();
1097            write_varint(&mut writer, path_bytes.len() as u32)?;
1098            writer.write_all(path_bytes)?;
1099        }
1100
1101        // Flush and sync
1102        writer.flush()?;
1103        writer.get_ref().sync_all()?;
1104
1105        log::info!(
1106            "Wrote lazy-loadable trigram index: {} trigrams, {} files to {:?}",
1107            self.index.len(),
1108            self.files.len(),
1109            path
1110        );
1111
1112        Ok(())
1113    }
1114
1115    /// Load trigram index from disk using memory-mapped I/O with lazy loading
1116    ///
1117    /// Binary format V3: Only reads the directory and file paths, keeps posting lists compressed in mmap.
1118    /// Posting lists are decompressed on-demand during search queries.
1119    pub fn load(path: impl AsRef<Path>) -> Result<Self> {
1120        let path = path.as_ref();
1121
1122        let file =
1123            File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
1124
1125        // Memory-map the file (keep it alive for lazy access)
1126        let mmap = unsafe {
1127            memmap2::Mmap::map(&file)
1128                .with_context(|| format!("Failed to mmap {}", path.display()))?
1129        };
1130
1131        // Validate header
1132        if mmap.len() < HEADER_SIZE {
1133            anyhow::bail!(
1134                "trigrams.bin too small (expected at least {} bytes)",
1135                HEADER_SIZE
1136            );
1137        }
1138
1139        if &mmap[0..4] != MAGIC {
1140            anyhow::bail!("Invalid trigrams.bin (wrong magic bytes)");
1141        }
1142
1143        let version = u32::from_le_bytes([mmap[4], mmap[5], mmap[6], mmap[7]]);
1144        if version != VERSION {
1145            anyhow::bail!(
1146                "Unsupported trigrams.bin version: {} (expected {}). Please re-index with 'reflex index'.",
1147                version,
1148                VERSION
1149            );
1150        }
1151
1152        let num_trigrams = u64::from_le_bytes([
1153            mmap[8], mmap[9], mmap[10], mmap[11], mmap[12], mmap[13], mmap[14], mmap[15],
1154        ]) as usize;
1155
1156        let num_files = u64::from_le_bytes([
1157            mmap[16], mmap[17], mmap[18], mmap[19], mmap[20], mmap[21], mmap[22], mmap[23],
1158        ]) as usize;
1159
1160        log::debug!(
1161            "Loading lazy trigram index: {} trigrams, {} files",
1162            num_trigrams,
1163            num_files
1164        );
1165
1166        // Read directory (trigram → offset mappings) - fast, just metadata
1167        let mut directory = Vec::with_capacity(num_trigrams);
1168        let mut pos = HEADER_SIZE;
1169        let directory_size = num_trigrams * 16; // 16 bytes per entry
1170
1171        for _ in 0..num_trigrams {
1172            if pos + 16 > mmap.len() {
1173                anyhow::bail!("Truncated directory entry at pos={}", pos);
1174            }
1175
1176            let trigram =
1177                u32::from_le_bytes([mmap[pos], mmap[pos + 1], mmap[pos + 2], mmap[pos + 3]]);
1178            pos += 4;
1179
1180            let data_offset = u64::from_le_bytes([
1181                mmap[pos],
1182                mmap[pos + 1],
1183                mmap[pos + 2],
1184                mmap[pos + 3],
1185                mmap[pos + 4],
1186                mmap[pos + 5],
1187                mmap[pos + 6],
1188                mmap[pos + 7],
1189            ]);
1190            pos += 8;
1191
1192            let compressed_size =
1193                u32::from_le_bytes([mmap[pos], mmap[pos + 1], mmap[pos + 2], mmap[pos + 3]]);
1194            pos += 4;
1195
1196            directory.push(DirectoryEntry {
1197                trigram,
1198                data_offset,
1199                compressed_size,
1200            });
1201        }
1202
1203        // Directory is already sorted by trigram (from write())
1204        directory.sort_unstable_by_key(|e| e.trigram);
1205
1206        // Calculate where file paths section starts (after header + directory + data)
1207        let data_section_size: u64 = directory.iter().map(|e| e.compressed_size as u64).sum();
1208        let files_section_offset = HEADER_SIZE + directory_size + data_section_size as usize;
1209        pos = files_section_offset;
1210
1211        // Read file paths (varint-encoded lengths)
1212        let mut files = Vec::with_capacity(num_files);
1213        for _ in 0..num_files {
1214            // Read path length (varint)
1215            let (path_len, consumed) = read_varint(&mmap[pos..])?;
1216            pos += consumed;
1217            let path_len = path_len as usize;
1218
1219            if pos + path_len > mmap.len() {
1220                anyhow::bail!("Truncated file path at pos={}", pos);
1221            }
1222
1223            let path_bytes = &mmap[pos..pos + path_len];
1224            let path_str = std::str::from_utf8(path_bytes).context("Invalid UTF-8 in file path")?;
1225            files.push(PathBuf::from(path_str));
1226            pos += path_len;
1227        }
1228
1229        log::info!(
1230            "Loaded lazy trigram index: {} trigrams, {} files (directory: {} KB)",
1231            num_trigrams,
1232            num_files,
1233            directory_size / 1024
1234        );
1235
1236        Ok(Self {
1237            index: Vec::new(), // Empty in lazy mode
1238            files,
1239            temp_index: None,
1240            mmap: Some(mmap), // Keep mmap alive for lazy decompression!
1241            directory,
1242            partial_indices: Vec::new(),
1243            temp_dir: None,
1244            max_posting_list_entries: 0,
1245        })
1246    }
1247}
1248
1249impl Default for TrigramIndex {
1250    fn default() -> Self {
1251        Self::new()
1252    }
1253}
1254
1255/// Extract all trigrams from text
1256///
1257/// Returns a vector of trigrams (without location info).
1258pub fn extract_trigrams(text: &str) -> Vec<Trigram> {
1259    let bytes = text.as_bytes();
1260    let mut trigrams = Vec::new();
1261
1262    for i in 0..bytes.len().saturating_sub(2) {
1263        let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1264        trigrams.push(trigram);
1265    }
1266
1267    trigrams
1268}
1269
1270/// Extract trigrams with file location information
1271///
1272/// Returns a vector of (trigram, location) pairs for building the inverted index.
1273pub fn extract_trigrams_with_locations(text: &str, file_id: u32) -> Vec<(Trigram, FileLocation)> {
1274    let bytes = text.as_bytes();
1275    let mut result = Vec::new();
1276
1277    let mut line_no = 1;
1278
1279    for (i, &byte) in bytes.iter().enumerate() {
1280        // Track newlines
1281        if byte == b'\n' {
1282            line_no += 1;
1283        }
1284
1285        // Extract trigram
1286        if i + 2 < bytes.len() {
1287            let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1288            let location = FileLocation::new(file_id, line_no, i as u32);
1289            result.push((trigram, location));
1290        }
1291    }
1292
1293    result
1294}
1295
1296/// Convert 3 bytes to a trigram (packed u32)
1297#[inline]
1298fn bytes_to_trigram(bytes: &[u8]) -> Trigram {
1299    debug_assert_eq!(bytes.len(), 3);
1300    (bytes[0] as u32) << 16 | (bytes[1] as u32) << 8 | (bytes[2] as u32)
1301}
1302
1303/// Convert trigram back to bytes (for debugging)
1304#[allow(dead_code)]
1305fn trigram_to_bytes(trigram: Trigram) -> [u8; 3] {
1306    [
1307        ((trigram >> 16) & 0xFF) as u8,
1308        ((trigram >> 8) & 0xFF) as u8,
1309        (trigram & 0xFF) as u8,
1310    ]
1311}
1312
1313/// Intersect posting lists by (file_id, line_no) pairs
1314///
1315/// Returns locations where ALL trigrams appear on the SAME line (not just in the same file).
1316/// This ensures accurate full-text matching.
1317fn intersect_by_file(lists: &[&Vec<FileLocation>]) -> Vec<FileLocation> {
1318    if lists.is_empty() {
1319        return vec![];
1320    }
1321
1322    use std::collections::HashSet;
1323
1324    // Create a set of (file_id, line_no) pairs from the first list
1325    let mut candidates: HashSet<(u32, u32)> = lists[0]
1326        .iter()
1327        .map(|loc| (loc.file_id, loc.line_no))
1328        .collect();
1329
1330    // Intersect with (file_id, line_no) pairs from other lists
1331    for &list in &lists[1..] {
1332        let list_pairs: HashSet<(u32, u32)> =
1333            list.iter().map(|loc| (loc.file_id, loc.line_no)).collect();
1334        candidates.retain(|pair| list_pairs.contains(pair));
1335    }
1336
1337    // Convert back to FileLocation results
1338    let mut result = Vec::new();
1339    for &(file_id, line_no) in &candidates {
1340        // Find a location matching this (file_id, line_no) from the first list
1341        if let Some(loc) = lists[0]
1342            .iter()
1343            .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1344        {
1345            result.push(*loc);
1346        }
1347    }
1348
1349    result.sort_unstable();
1350    result
1351}
1352
1353/// Intersect posting lists by (file_id, line_no) pairs (owned version for lazy-loading)
1354///
1355/// Similar to intersect_by_file() but works with owned Vec<Vec<FileLocation>>
1356/// instead of references. Used in lazy-loading mode where posting lists are decompressed on-demand.
1357///
1358/// Returns locations where ALL trigrams appear on the SAME line (not just in the same file).
1359fn intersect_by_file_owned(lists: &[Vec<FileLocation>]) -> Vec<FileLocation> {
1360    if lists.is_empty() {
1361        return vec![];
1362    }
1363
1364    use std::collections::HashSet;
1365
1366    // Create a set of (file_id, line_no) pairs from the first list
1367    let mut candidates: HashSet<(u32, u32)> = lists[0]
1368        .iter()
1369        .map(|loc| (loc.file_id, loc.line_no))
1370        .collect();
1371
1372    // Intersect with (file_id, line_no) pairs from other lists
1373    for list in &lists[1..] {
1374        let list_pairs: HashSet<(u32, u32)> =
1375            list.iter().map(|loc| (loc.file_id, loc.line_no)).collect();
1376        candidates.retain(|pair| list_pairs.contains(pair));
1377    }
1378
1379    // Convert back to FileLocation results
1380    let mut result = Vec::new();
1381    for &(file_id, line_no) in &candidates {
1382        // Find a location matching this (file_id, line_no) from the first list
1383        if let Some(loc) = lists[0]
1384            .iter()
1385            .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1386        {
1387            result.push(*loc);
1388        }
1389    }
1390
1391    result.sort_unstable();
1392    result
1393}
1394
1395#[cfg(test)]
1396mod tests {
1397    use super::*;
1398
1399    #[test]
1400    fn test_extract_trigrams() {
1401        let text = "hello";
1402        let trigrams = extract_trigrams(text);
1403
1404        // "hello" → "hel", "ell", "llo"
1405        assert_eq!(trigrams.len(), 3);
1406
1407        // Verify trigrams are unique
1408        let expected = vec![
1409            bytes_to_trigram(b"hel"),
1410            bytes_to_trigram(b"ell"),
1411            bytes_to_trigram(b"llo"),
1412        ];
1413        assert_eq!(trigrams, expected);
1414    }
1415
1416    #[test]
1417    fn test_extract_trigrams_short() {
1418        assert_eq!(extract_trigrams("ab").len(), 0);
1419        assert_eq!(extract_trigrams("abc").len(), 1);
1420    }
1421
1422    #[test]
1423    fn test_bytes_to_trigram() {
1424        let trigram1 = bytes_to_trigram(b"abc");
1425        let trigram2 = bytes_to_trigram(b"abc");
1426        let trigram3 = bytes_to_trigram(b"xyz");
1427
1428        assert_eq!(trigram1, trigram2);
1429        assert_ne!(trigram1, trigram3);
1430    }
1431
1432    #[test]
1433    fn test_trigram_roundtrip() {
1434        let original = b"foo";
1435        let trigram = bytes_to_trigram(original);
1436        let recovered = trigram_to_bytes(trigram);
1437        assert_eq!(original, &recovered);
1438    }
1439
1440    #[test]
1441    fn test_extract_with_locations() {
1442        let text = "hello\nworld";
1443        let locs = extract_trigrams_with_locations(text, 0);
1444
1445        // "hello\nworld" has 9 trigrams:
1446        // "hel", "ell", "llo", "lo\n", "o\nw", "\nwo", "wor", "orl", "rld"
1447        assert_eq!(locs.len(), 9);
1448
1449        // First trigram should be on line 1
1450        assert_eq!(locs[0].1.line_no, 1);
1451
1452        // After newline, should be line 2
1453        let world_start = text.find("world").unwrap();
1454        let world_trigram_idx = locs
1455            .iter()
1456            .position(|(_, loc)| loc.byte_offset as usize == world_start)
1457            .unwrap();
1458        assert_eq!(locs[world_trigram_idx].1.line_no, 2);
1459    }
1460
1461    #[test]
1462    fn test_trigram_index_basic() {
1463        let mut index = TrigramIndex::new();
1464
1465        let file_id = index.add_file(PathBuf::from("test.txt"));
1466        index.index_file(file_id, "hello world");
1467        index.finalize();
1468
1469        // Search for "hello"
1470        let results = index.search("hello");
1471        assert!(!results.is_empty());
1472
1473        // Search for "world"
1474        let results = index.search("world");
1475        assert!(!results.is_empty());
1476
1477        // Search for "goodbye" (not in text)
1478        let results = index.search("goodbye");
1479        assert!(results.is_empty());
1480    }
1481
1482    #[test]
1483    fn test_search_multifile() {
1484        let mut index = TrigramIndex::new();
1485
1486        let file1 = index.add_file(PathBuf::from("file1.txt"));
1487        let file2 = index.add_file(PathBuf::from("file2.txt"));
1488
1489        index.index_file(file1, "extract_symbols is here");
1490        index.index_file(file2, "extract_symbols is also here");
1491        index.finalize();
1492
1493        let results = index.search("extract_symbols");
1494        assert_eq!(results.len(), 2); // One result per file
1495
1496        // Verify we got both files
1497        let file_ids: Vec<u32> = results.iter().map(|loc| loc.file_id).collect();
1498        assert!(file_ids.contains(&file1));
1499        assert!(file_ids.contains(&file2));
1500    }
1501
1502    #[test]
1503    fn test_persistence_write() {
1504        use tempfile::TempDir;
1505
1506        let temp = TempDir::new().unwrap();
1507        let trigrams_path = temp.path().join("trigrams.bin");
1508
1509        // Build and write index
1510        let mut index = TrigramIndex::new();
1511        let file1 = index.add_file(PathBuf::from("src/main.rs"));
1512        let file2 = index.add_file(PathBuf::from("src/lib.rs"));
1513
1514        index.index_file(file1, "fn main() { println!(\"hello\"); }");
1515        index.index_file(
1516            file2,
1517            "pub fn hello() -> String { String::from(\"hello\") }",
1518        );
1519        index.finalize();
1520
1521        // Write to disk
1522        index.write(&trigrams_path).unwrap();
1523
1524        // Verify file was created
1525        assert!(trigrams_path.exists());
1526
1527        // Verify file has content (header + data)
1528        let metadata = std::fs::metadata(&trigrams_path).unwrap();
1529        assert!(metadata.len() > HEADER_SIZE as u64);
1530
1531        // Verify we can read the header back
1532        use std::io::Read;
1533        let mut file = File::open(&trigrams_path).unwrap();
1534        let mut magic = [0u8; 4];
1535        file.read_exact(&mut magic).unwrap();
1536        assert_eq!(&magic, MAGIC);
1537
1538        // Note: Full roundtrip test verifies write works correctly.
1539        // Load verification is tested in production via query performance tests.
1540    }
1541
1542    #[test]
1543    fn test_posting_list_cap_enforced() {
1544        let cap: usize = 10;
1545        let content = "aaa ".repeat(200);
1546        let mut index = TrigramIndex::new();
1547        index.set_max_posting_list_entries(cap);
1548        let file_id = index.add_file(PathBuf::from("dense.txt"));
1549        index.index_file(file_id, &content);
1550        index.finalize();
1551        let aaa = bytes_to_trigram(b"aaa");
1552        let list = index
1553            .get_posting_list(aaa)
1554            .expect("aaa trigram should exist");
1555        assert!(list.len() <= cap, "cap exceeded: {} > {}", list.len(), cap);
1556    }
1557
1558    #[test]
1559    fn test_posting_list_cap_zero_means_unlimited() {
1560        let repetitions = 50;
1561        let content = "aaa ".repeat(repetitions);
1562        let mut index = TrigramIndex::new();
1563        index.set_max_posting_list_entries(0);
1564        let file_id = index.add_file(PathBuf::from("dense.txt"));
1565        index.index_file(file_id, &content);
1566        index.finalize();
1567        let aaa = bytes_to_trigram(b"aaa");
1568        let list = index
1569            .get_posting_list(aaa)
1570            .expect("aaa trigram should exist");
1571        assert!(
1572            list.len() >= repetitions,
1573            "expected >= {} entries, got {}",
1574            repetitions,
1575            list.len()
1576        );
1577    }
1578}