1use anyhow::{Context, Result};
17use std::collections::HashMap;
18use std::fs::{File, OpenOptions};
19use std::io::Write;
20use std::path::{Path, PathBuf};
21
22pub type Trigram = u32;
24
25const MAGIC: &[u8; 4] = b"RFTG"; const VERSION: u32 = 3; #[allow(dead_code)]
30const HEADER_SIZE: usize = 24;
31
32fn write_varint(writer: &mut impl Write, mut value: u32) -> std::io::Result<()> {
35 loop {
36 let mut byte = (value & 0x7F) as u8;
37 value >>= 7;
38 if value != 0 {
39 byte |= 0x80; }
41 writer.write_all(&[byte])?;
42 if value == 0 {
43 break;
44 }
45 }
46 Ok(())
47}
48
49fn read_varint(data: &[u8]) -> Result<(u32, usize)> {
51 let mut value: u32 = 0;
52 let mut shift = 0;
53 let mut pos = 0;
54
55 loop {
56 if pos >= data.len() {
57 anyhow::bail!("Truncated varint");
58 }
59 let byte = data[pos];
60 pos += 1;
61
62 value |= ((byte & 0x7F) as u32) << shift;
63 if byte & 0x80 == 0 {
64 break;
65 }
66 shift += 7;
67 if shift >= 32 {
68 anyhow::bail!("Varint too large");
69 }
70 }
71
72 Ok((value, pos))
73}
74
75fn decompress_posting_list(mmap: &[u8], offset: u64, size: u32) -> Result<Vec<FileLocation>> {
85 let start = offset as usize;
86 let end = start + size as usize;
87
88 if end > mmap.len() {
89 anyhow::bail!(
90 "Posting list out of bounds: offset={}, size={}, mmap_len={}",
91 offset,
92 size,
93 mmap.len()
94 );
95 }
96
97 let compressed_data = &mmap[start..end];
98
99 let mut locations = Vec::new();
101 let mut pos = 0;
102 let mut prev_file_id = 0u32;
103 let mut prev_line_no = 0u32;
104 let mut prev_byte_offset = 0u32;
105
106 while pos < compressed_data.len() {
107 let (file_id_delta, consumed) = read_varint(&compressed_data[pos..])?;
109 pos += consumed;
110
111 let (line_no_delta, consumed) = read_varint(&compressed_data[pos..])?;
113 pos += consumed;
114
115 let (byte_offset_delta, consumed) = read_varint(&compressed_data[pos..])?;
117 pos += consumed;
118
119 let file_id = prev_file_id.wrapping_add(file_id_delta);
121 let line_no = prev_line_no.wrapping_add(line_no_delta);
122 let byte_offset = prev_byte_offset.wrapping_add(byte_offset_delta);
123
124 locations.push(FileLocation {
125 file_id,
126 line_no,
127 byte_offset,
128 });
129
130 prev_file_id = file_id;
132 prev_line_no = line_no;
133 prev_byte_offset = byte_offset;
134 }
135
136 Ok(locations)
137}
138
139#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
141pub struct FileLocation {
142 pub file_id: u32,
144 pub line_no: u32,
146 pub byte_offset: u32,
148}
149
150impl FileLocation {
151 pub fn new(file_id: u32, line_no: u32, byte_offset: u32) -> Self {
152 Self {
153 file_id,
154 line_no,
155 byte_offset,
156 }
157 }
158}
159
160#[derive(Debug, Clone)]
165struct DirectoryEntry {
166 trigram: Trigram,
168 data_offset: u64,
170 compressed_size: u32,
172}
173
174pub struct TrigramIndex {
185 index: Vec<(Trigram, Vec<FileLocation>)>,
188 files: Vec<PathBuf>,
190 temp_index: Option<HashMap<Trigram, Vec<FileLocation>>>,
192 mmap: Option<memmap2::Mmap>,
194 directory: Vec<DirectoryEntry>,
196 partial_indices: Vec<PathBuf>,
198 temp_dir: Option<PathBuf>,
200 max_posting_list_entries: usize,
203}
204
205impl TrigramIndex {
206 pub fn new() -> Self {
208 Self {
209 index: Vec::new(),
210 files: Vec::new(),
211 temp_index: Some(HashMap::new()),
212 mmap: None,
213 directory: Vec::new(),
214 partial_indices: Vec::new(),
215 temp_dir: None,
216 max_posting_list_entries: 0,
217 }
218 }
219
220 pub fn set_max_posting_list_entries(&mut self, cap: usize) {
222 self.max_posting_list_entries = cap;
223 }
224
225 pub fn enable_batch_flush(&mut self, temp_dir: PathBuf) -> Result<()> {
230 std::fs::create_dir_all(&temp_dir)
231 .context("Failed to create temp directory for batch flushing")?;
232 self.temp_dir = Some(temp_dir);
233 log::info!("Enabled batch-flush mode for trigram index");
234 Ok(())
235 }
236
237 pub fn flush_batch(&mut self) -> Result<()> {
242 let temp_dir = self.temp_dir.as_ref().ok_or_else(|| {
243 anyhow::anyhow!("Batch flush not enabled - call enable_batch_flush() first")
244 })?;
245
246 let temp_map = self
248 .temp_index
249 .take()
250 .ok_or_else(|| anyhow::anyhow!("No temp index to flush"))?;
251
252 if temp_map.is_empty() {
253 self.temp_index = Some(HashMap::new());
255 return Ok(());
256 }
257
258 let mut partial_index: Vec<(Trigram, Vec<FileLocation>)> = temp_map.into_iter().collect();
260
261 for (_, list) in partial_index.iter_mut() {
263 list.sort_unstable();
264 list.dedup();
265 }
266
267 partial_index.sort_unstable_by_key(|(trigram, _)| *trigram);
269
270 let partial_file = temp_dir.join(format!("partial_{}.bin", self.partial_indices.len()));
272 self.write_partial_index(&partial_file, &partial_index)?;
273
274 self.partial_indices.push(partial_file);
275
276 self.temp_index = Some(HashMap::new());
278
279 log::debug!(
280 "Flushed batch {} with {} trigrams to disk",
281 self.partial_indices.len(),
282 partial_index.len()
283 );
284
285 Ok(())
286 }
287
288 fn write_partial_index(
290 &self,
291 path: &Path,
292 index: &[(Trigram, Vec<FileLocation>)],
293 ) -> Result<()> {
294 use std::io::BufWriter;
295
296 let file = OpenOptions::new()
297 .create(true)
298 .write(true)
299 .truncate(true)
300 .open(path)?;
301
302 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
303
304 writer.write_all(&(index.len() as u64).to_le_bytes())?;
306
307 for (trigram, locations) in index {
309 writer.write_all(&trigram.to_le_bytes())?;
310 writer.write_all(&(locations.len() as u32).to_le_bytes())?;
311
312 for loc in locations {
313 writer.write_all(&loc.file_id.to_le_bytes())?;
314 writer.write_all(&loc.line_no.to_le_bytes())?;
315 writer.write_all(&loc.byte_offset.to_le_bytes())?;
316 }
317 }
318
319 writer.flush()?;
320 Ok(())
321 }
322
323 pub fn add_file(&mut self, path: PathBuf) -> u32 {
325 let file_id = self.files.len() as u32;
326 self.files.push(path);
327 file_id
328 }
329
330 pub fn get_file(&self, file_id: u32) -> Option<&PathBuf> {
332 self.files.get(file_id as usize)
333 }
334
335 pub fn file_count(&self) -> usize {
337 self.files.len()
338 }
339
340 pub fn trigram_count(&self) -> usize {
342 if !self.directory.is_empty() {
343 self.directory.len()
345 } else {
346 self.index.len()
348 }
349 }
350
351 pub fn index_file(&mut self, file_id: u32, content: &str) {
356 let trigrams = extract_trigrams_with_locations(content, file_id);
357
358 if let Some(ref mut temp_map) = self.temp_index {
360 for (trigram, location) in trigrams {
361 temp_map
362 .entry(trigram)
363 .or_insert_with(Vec::new)
364 .push(location);
365 }
366 } else {
367 panic!("Cannot call index_file() after finalize(). Index is read-only.");
368 }
369 }
370
371 pub fn build_from_trigrams(&mut self, trigrams: Vec<(Trigram, FileLocation)>) {
376 let mut temp_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
377
378 for (trigram, location) in trigrams {
380 temp_map
381 .entry(trigram)
382 .or_insert_with(Vec::new)
383 .push(location);
384 }
385
386 self.index = temp_map.into_iter().collect();
388
389 self.temp_index = None;
391
392 self.finalize();
394 }
395
396 pub fn finalize(&mut self) {
404 if !self.partial_indices.is_empty() {
407 log::info!(
408 "Deferring finalization - will stream merge {} partial indices during write()",
409 self.partial_indices.len()
410 );
411
412 if let Some(ref temp_map) = self.temp_index {
414 if !temp_map.is_empty() {
415 self.flush_batch().expect("Failed to flush final batch");
416 }
417 }
418
419 return;
421 }
422
423 if let Some(temp_map) = self.temp_index.take() {
426 self.index = temp_map.into_iter().collect();
427 }
428
429 let cap = self.max_posting_list_entries;
431 for (trigram, list) in self.index.iter_mut() {
432 list.sort_unstable();
433 list.dedup(); if cap > 0 && list.len() > cap {
435 log::warn!(
436 "Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.",
437 trigram,
438 list.len(),
439 cap
440 );
441 list.truncate(cap);
442 }
443 }
444
445 self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
447 }
448
449 fn merge_partial_indices_to_file(&mut self, output_path: &Path) -> Result<()> {
457 use std::cmp::Ordering;
458 use std::collections::BinaryHeap;
459 use std::io::{BufReader, BufWriter, Read};
460
461 log::info!(
462 "Streaming merge of {} partial indices to {:?}",
463 self.partial_indices.len(),
464 output_path
465 );
466
467 struct PartialIndexReader {
469 reader: BufReader<File>,
470 current_trigram: Option<Trigram>,
471 current_posting_list: Vec<FileLocation>,
472 reader_id: usize,
473 }
474
475 let mut readers: Vec<PartialIndexReader> = Vec::new();
476
477 for (idx, partial_path) in self.partial_indices.iter().enumerate() {
478 let file = File::open(partial_path)
479 .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
480 let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
481
482 let mut buf = [0u8; 8];
484 reader.read_exact(&mut buf)?;
485
486 readers.push(PartialIndexReader {
487 reader,
488 current_trigram: None,
489 current_posting_list: Vec::new(),
490 reader_id: idx,
491 });
492 }
493
494 fn read_next_trigram(reader: &mut PartialIndexReader) -> Result<bool> {
496 let mut trigram_buf = [0u8; 4];
498 match reader.reader.read_exact(&mut trigram_buf) {
499 Ok(_) => {
500 let trigram = u32::from_le_bytes(trigram_buf);
501
502 let mut len_buf = [0u8; 4];
504 reader.reader.read_exact(&mut len_buf)?;
505 let list_len = u32::from_le_bytes(len_buf) as usize;
506
507 let mut locations = Vec::with_capacity(list_len);
509 for _ in 0..list_len {
510 let mut loc_buf = [0u8; 12];
511 reader.reader.read_exact(&mut loc_buf)?;
512
513 let file_id =
514 u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
515 let line_no =
516 u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
517 let byte_offset =
518 u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
519
520 locations.push(FileLocation {
521 file_id,
522 line_no,
523 byte_offset,
524 });
525 }
526
527 reader.current_trigram = Some(trigram);
528 reader.current_posting_list = locations;
529 Ok(true)
530 }
531 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
532 reader.current_trigram = None;
533 Ok(false)
534 }
535 Err(e) => Err(e.into()),
536 }
537 }
538
539 for reader in &mut readers {
541 read_next_trigram(reader)?;
542 }
543
544 #[derive(Eq, PartialEq)]
546 struct HeapEntry {
547 trigram: Trigram,
548 reader_id: usize,
549 }
550
551 impl Ord for HeapEntry {
552 fn cmp(&self, other: &Self) -> Ordering {
553 other
555 .trigram
556 .cmp(&self.trigram)
557 .then_with(|| other.reader_id.cmp(&self.reader_id))
558 }
559 }
560
561 impl PartialOrd for HeapEntry {
562 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
563 Some(self.cmp(other))
564 }
565 }
566
567 let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::new();
569 for reader in &readers {
570 if let Some(trigram) = reader.current_trigram {
571 heap.push(HeapEntry {
572 trigram,
573 reader_id: reader.reader_id,
574 });
575 }
576 }
577
578 let file = OpenOptions::new()
580 .create(true)
581 .write(true)
582 .truncate(true)
583 .open(output_path)
584 .with_context(|| format!("Failed to create {}", output_path.display()))?;
585
586 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
587
588 writer.write_all(MAGIC)?;
590 writer.write_all(&VERSION.to_le_bytes())?;
591 writer.write_all(&0u64.to_le_bytes())?; writer.write_all(&(self.files.len() as u64).to_le_bytes())?; let mut directory: Vec<DirectoryEntry> = Vec::new();
596 let mut num_trigrams = 0u64;
597
598 let mut current_trigram: Option<Trigram> = None;
600 let mut merged_locations: Vec<FileLocation> = Vec::new();
601
602 while let Some(entry) = heap.pop() {
603 let reader = &mut readers[entry.reader_id];
604
605 if current_trigram.is_some() && current_trigram != Some(entry.trigram) {
607 let trigram = current_trigram.unwrap();
609 merged_locations.sort_unstable();
610 merged_locations.dedup();
611
612 let cap = self.max_posting_list_entries;
613 if cap > 0 && merged_locations.len() > cap {
614 log::warn!(
615 "Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.",
616 trigram,
617 merged_locations.len(),
618 cap
619 );
620 merged_locations.truncate(cap);
621 }
622
623 let data_offset = writer.stream_position()?;
625 let compressed_size =
626 self.write_compressed_posting_list(&mut writer, &merged_locations)?;
627
628 directory.push(DirectoryEntry {
629 trigram,
630 data_offset,
631 compressed_size,
632 });
633
634 num_trigrams += 1;
635 merged_locations.clear();
636 }
637
638 current_trigram = Some(entry.trigram);
640
641 merged_locations.extend_from_slice(&reader.current_posting_list);
643
644 if read_next_trigram(reader)? {
646 if let Some(next_trigram) = reader.current_trigram {
647 heap.push(HeapEntry {
648 trigram: next_trigram,
649 reader_id: entry.reader_id,
650 });
651 }
652 }
653 }
654
655 if let Some(trigram) = current_trigram {
657 merged_locations.sort_unstable();
658 merged_locations.dedup();
659
660 let cap = self.max_posting_list_entries;
661 if cap > 0 && merged_locations.len() > cap {
662 log::warn!(
663 "Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.",
664 trigram,
665 merged_locations.len(),
666 cap
667 );
668 merged_locations.truncate(cap);
669 }
670
671 let data_offset = writer.stream_position()?;
672 let compressed_size =
673 self.write_compressed_posting_list(&mut writer, &merged_locations)?;
674
675 directory.push(DirectoryEntry {
676 trigram,
677 data_offset,
678 compressed_size,
679 });
680
681 num_trigrams += 1;
682 }
683
684 log::info!(
685 "Merged {} trigrams from {} partial indices",
686 num_trigrams,
687 self.partial_indices.len()
688 );
689
690 let _data_end_pos = writer.stream_position()?;
692
693 for file_path in &self.files {
695 let path_str = file_path.to_string_lossy();
696 let path_bytes = path_str.as_bytes();
697 write_varint(&mut writer, path_bytes.len() as u32)?;
698 writer.write_all(path_bytes)?;
699 }
700
701 writer.flush()?;
703 drop(writer);
704
705 use std::io::{Seek, SeekFrom};
708
709 let mut temp_data = Vec::new();
711 {
712 let mut file = File::open(output_path)?;
713 file.seek(SeekFrom::Start(HEADER_SIZE as u64))?;
714 file.read_to_end(&mut temp_data)?;
715 }
716
717 let file = OpenOptions::new()
719 .write(true)
720 .truncate(true)
721 .open(output_path)?;
722 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
723
724 writer.write_all(MAGIC)?;
726 writer.write_all(&VERSION.to_le_bytes())?;
727 writer.write_all(&num_trigrams.to_le_bytes())?;
728 writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
729
730 for entry in &directory {
732 writer.write_all(&entry.trigram.to_le_bytes())?;
733 let adjusted_offset = entry.data_offset + (directory.len() * 16) as u64;
735 writer.write_all(&adjusted_offset.to_le_bytes())?;
736 writer.write_all(&entry.compressed_size.to_le_bytes())?;
737 }
738
739 writer.write_all(&temp_data)?;
741
742 writer.flush()?;
744 writer.get_ref().sync_all()?;
745
746 for partial_path in &self.partial_indices {
748 let _ = std::fs::remove_file(partial_path);
749 }
750 if let Some(ref temp_dir) = self.temp_dir {
751 let _ = std::fs::remove_dir(temp_dir);
752 }
753
754 log::info!("Wrote {} trigrams to {:?}", num_trigrams, output_path);
755
756 Ok(())
757 }
758
759 fn write_compressed_posting_list(
761 &self,
762 writer: &mut impl Write,
763 locations: &[FileLocation],
764 ) -> Result<u32> {
765 let mut compressed = Vec::new();
766
767 let mut prev_file_id = 0u32;
769 let mut prev_line_no = 0u32;
770 let mut prev_byte_offset = 0u32;
771
772 for loc in locations {
773 let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
775 let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
776 let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
777
778 write_varint(&mut compressed, file_id_delta)?;
780 write_varint(&mut compressed, line_no_delta)?;
781 write_varint(&mut compressed, byte_offset_delta)?;
782
783 prev_file_id = loc.file_id;
785 prev_line_no = loc.line_no;
786 prev_byte_offset = loc.byte_offset;
787 }
788
789 let compressed_size = compressed.len() as u32;
790 writer.write_all(&compressed)?;
791
792 Ok(compressed_size)
793 }
794
795 #[allow(dead_code)]
797 fn merge_partial_indices(&mut self) -> Result<()> {
798 use std::io::{BufReader, Read};
799
800 let mut all_entries: Vec<(Trigram, FileLocation)> = Vec::new();
802
803 for partial_path in &self.partial_indices {
804 let file = File::open(partial_path)
805 .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
806 let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
807
808 let mut buf = [0u8; 8];
810 reader.read_exact(&mut buf)?;
811 let num_trigrams = u64::from_le_bytes(buf) as usize;
812
813 for _ in 0..num_trigrams {
815 let mut trigram_buf = [0u8; 4];
817 reader.read_exact(&mut trigram_buf)?;
818 let trigram = u32::from_le_bytes(trigram_buf);
819
820 let mut len_buf = [0u8; 4];
822 reader.read_exact(&mut len_buf)?;
823 let list_len = u32::from_le_bytes(len_buf) as usize;
824
825 for _ in 0..list_len {
827 let mut loc_buf = [0u8; 12]; reader.read_exact(&mut loc_buf)?;
829
830 let file_id =
831 u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
832 let line_no =
833 u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
834 let byte_offset =
835 u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
836
837 all_entries.push((
838 trigram,
839 FileLocation {
840 file_id,
841 line_no,
842 byte_offset,
843 },
844 ));
845 }
846 }
847 }
848
849 log::info!(
850 "Read {} total trigram entries from {} partial indices",
851 all_entries.len(),
852 self.partial_indices.len()
853 );
854
855 let mut index_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
857 for (trigram, location) in all_entries {
858 index_map
859 .entry(trigram)
860 .or_insert_with(Vec::new)
861 .push(location);
862 }
863
864 self.index = index_map.into_iter().collect();
866
867 for (_, list) in self.index.iter_mut() {
869 list.sort_unstable();
870 list.dedup();
871 }
872
873 self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
875
876 for partial_path in &self.partial_indices {
878 let _ = std::fs::remove_file(partial_path);
879 }
880 if let Some(ref temp_dir) = self.temp_dir {
881 let _ = std::fs::remove_dir(temp_dir);
882 }
883
884 log::info!("Merged into final index with {} trigrams", self.index.len());
885
886 Ok(())
887 }
888
889 pub fn search(&self, pattern: &str) -> Vec<FileLocation> {
897 if pattern.len() < 3 {
898 return vec![];
900 }
901
902 let trigrams = extract_trigrams(pattern);
903 if trigrams.is_empty() {
904 return vec![];
905 }
906
907 if let Some(ref mmap) = self.mmap {
909 let mut posting_lists: Vec<Vec<FileLocation>> = Vec::new();
911
912 for trigram in &trigrams {
913 match self.directory.binary_search_by_key(trigram, |e| e.trigram) {
915 Ok(idx) => {
916 let entry = &self.directory[idx];
917 match decompress_posting_list(
919 mmap,
920 entry.data_offset,
921 entry.compressed_size,
922 ) {
923 Ok(locations) => posting_lists.push(locations),
924 Err(e) => {
925 log::warn!(
926 "Failed to decompress posting list for trigram {}: {}",
927 trigram,
928 e
929 );
930 return vec![];
931 }
932 }
933 }
934 Err(_) => {
935 return vec![];
937 }
938 }
939 }
940
941 if posting_lists.is_empty() || posting_lists.len() < trigrams.len() {
942 return vec![];
943 }
944
945 posting_lists.sort_by_key(|list| list.len());
947
948 intersect_by_file_owned(&posting_lists)
950 } else {
951 let mut posting_lists: Vec<&Vec<FileLocation>> = trigrams
953 .iter()
954 .filter_map(|t| {
955 self.index
956 .binary_search_by_key(t, |(trigram, _)| *trigram)
957 .ok()
958 .map(|idx| &self.index[idx].1)
959 })
960 .collect();
961
962 if posting_lists.is_empty() {
963 return vec![];
964 }
965
966 if posting_lists.len() < trigrams.len() {
967 return vec![];
969 }
970
971 posting_lists.sort_by_key(|list| list.len());
973
974 intersect_by_file(&posting_lists)
976 }
977 }
978
979 pub fn get_posting_list(&self, trigram: Trigram) -> Option<&Vec<FileLocation>> {
981 self.index
982 .binary_search_by_key(&trigram, |(t, _)| *t)
983 .ok()
984 .map(|idx| &self.index[idx].1)
985 }
986
987 pub fn write(&mut self, path: impl AsRef<Path>) -> Result<()> {
1001 let path = path.as_ref();
1002
1003 if !self.partial_indices.is_empty() {
1005 log::info!(
1006 "Using streaming merge to write {} partial indices",
1007 self.partial_indices.len()
1008 );
1009 return self.merge_partial_indices_to_file(path);
1010 }
1011
1012 let file = OpenOptions::new()
1017 .create(true)
1018 .write(true)
1019 .truncate(true)
1020 .open(path)
1021 .with_context(|| format!("Failed to create {}", path.display()))?;
1022
1023 let mut writer = std::io::BufWriter::with_capacity(16 * 1024 * 1024, file);
1025
1026 writer.write_all(MAGIC)?;
1028 writer.write_all(&VERSION.to_le_bytes())?;
1029 writer.write_all(&(self.index.len() as u64).to_le_bytes())?; writer.write_all(&(self.files.len() as u64).to_le_bytes())?; let mut directory: Vec<DirectoryEntry> = Vec::with_capacity(self.index.len());
1034
1035 let directory_start = HEADER_SIZE as u64;
1037 let directory_size = self.index.len() * 16;
1038
1039 let data_start = directory_start + directory_size as u64;
1041 let mut current_offset = data_start;
1042
1043 let mut compressed_lists: Vec<(Trigram, Vec<u8>)> = Vec::with_capacity(self.index.len());
1049
1050 for (trigram, locations) in &self.index {
1051 let mut compressed = Vec::new();
1053 let mut prev_file_id = 0u32;
1054 let mut prev_line_no = 0u32;
1055 let mut prev_byte_offset = 0u32;
1056
1057 for loc in locations {
1058 let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
1059 let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
1060 let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
1061
1062 write_varint(&mut compressed, file_id_delta)?;
1063 write_varint(&mut compressed, line_no_delta)?;
1064 write_varint(&mut compressed, byte_offset_delta)?;
1065
1066 prev_file_id = loc.file_id;
1067 prev_line_no = loc.line_no;
1068 prev_byte_offset = loc.byte_offset;
1069 }
1070
1071 directory.push(DirectoryEntry {
1072 trigram: *trigram,
1073 data_offset: current_offset,
1074 compressed_size: compressed.len() as u32,
1075 });
1076 current_offset += compressed.len() as u64;
1077
1078 compressed_lists.push((*trigram, compressed));
1079 }
1080
1081 for entry in &directory {
1083 writer.write_all(&entry.trigram.to_le_bytes())?;
1084 writer.write_all(&entry.data_offset.to_le_bytes())?;
1085 writer.write_all(&entry.compressed_size.to_le_bytes())?;
1086 }
1087
1088 for (_, compressed) in &compressed_lists {
1090 writer.write_all(compressed)?;
1091 }
1092
1093 for file_path in &self.files {
1095 let path_str = file_path.to_string_lossy();
1096 let path_bytes = path_str.as_bytes();
1097 write_varint(&mut writer, path_bytes.len() as u32)?;
1098 writer.write_all(path_bytes)?;
1099 }
1100
1101 writer.flush()?;
1103 writer.get_ref().sync_all()?;
1104
1105 log::info!(
1106 "Wrote lazy-loadable trigram index: {} trigrams, {} files to {:?}",
1107 self.index.len(),
1108 self.files.len(),
1109 path
1110 );
1111
1112 Ok(())
1113 }
1114
1115 pub fn load(path: impl AsRef<Path>) -> Result<Self> {
1120 let path = path.as_ref();
1121
1122 let file =
1123 File::open(path).with_context(|| format!("Failed to open {}", path.display()))?;
1124
1125 let mmap = unsafe {
1127 memmap2::Mmap::map(&file)
1128 .with_context(|| format!("Failed to mmap {}", path.display()))?
1129 };
1130
1131 if mmap.len() < HEADER_SIZE {
1133 anyhow::bail!(
1134 "trigrams.bin too small (expected at least {} bytes)",
1135 HEADER_SIZE
1136 );
1137 }
1138
1139 if &mmap[0..4] != MAGIC {
1140 anyhow::bail!("Invalid trigrams.bin (wrong magic bytes)");
1141 }
1142
1143 let version = u32::from_le_bytes([mmap[4], mmap[5], mmap[6], mmap[7]]);
1144 if version != VERSION {
1145 anyhow::bail!(
1146 "Unsupported trigrams.bin version: {} (expected {}). Please re-index with 'reflex index'.",
1147 version,
1148 VERSION
1149 );
1150 }
1151
1152 let num_trigrams = u64::from_le_bytes([
1153 mmap[8], mmap[9], mmap[10], mmap[11], mmap[12], mmap[13], mmap[14], mmap[15],
1154 ]) as usize;
1155
1156 let num_files = u64::from_le_bytes([
1157 mmap[16], mmap[17], mmap[18], mmap[19], mmap[20], mmap[21], mmap[22], mmap[23],
1158 ]) as usize;
1159
1160 log::debug!(
1161 "Loading lazy trigram index: {} trigrams, {} files",
1162 num_trigrams,
1163 num_files
1164 );
1165
1166 let mut directory = Vec::with_capacity(num_trigrams);
1168 let mut pos = HEADER_SIZE;
1169 let directory_size = num_trigrams * 16; for _ in 0..num_trigrams {
1172 if pos + 16 > mmap.len() {
1173 anyhow::bail!("Truncated directory entry at pos={}", pos);
1174 }
1175
1176 let trigram =
1177 u32::from_le_bytes([mmap[pos], mmap[pos + 1], mmap[pos + 2], mmap[pos + 3]]);
1178 pos += 4;
1179
1180 let data_offset = u64::from_le_bytes([
1181 mmap[pos],
1182 mmap[pos + 1],
1183 mmap[pos + 2],
1184 mmap[pos + 3],
1185 mmap[pos + 4],
1186 mmap[pos + 5],
1187 mmap[pos + 6],
1188 mmap[pos + 7],
1189 ]);
1190 pos += 8;
1191
1192 let compressed_size =
1193 u32::from_le_bytes([mmap[pos], mmap[pos + 1], mmap[pos + 2], mmap[pos + 3]]);
1194 pos += 4;
1195
1196 directory.push(DirectoryEntry {
1197 trigram,
1198 data_offset,
1199 compressed_size,
1200 });
1201 }
1202
1203 directory.sort_unstable_by_key(|e| e.trigram);
1205
1206 let data_section_size: u64 = directory.iter().map(|e| e.compressed_size as u64).sum();
1208 let files_section_offset = HEADER_SIZE + directory_size + data_section_size as usize;
1209 pos = files_section_offset;
1210
1211 let mut files = Vec::with_capacity(num_files);
1213 for _ in 0..num_files {
1214 let (path_len, consumed) = read_varint(&mmap[pos..])?;
1216 pos += consumed;
1217 let path_len = path_len as usize;
1218
1219 if pos + path_len > mmap.len() {
1220 anyhow::bail!("Truncated file path at pos={}", pos);
1221 }
1222
1223 let path_bytes = &mmap[pos..pos + path_len];
1224 let path_str = std::str::from_utf8(path_bytes).context("Invalid UTF-8 in file path")?;
1225 files.push(PathBuf::from(path_str));
1226 pos += path_len;
1227 }
1228
1229 log::info!(
1230 "Loaded lazy trigram index: {} trigrams, {} files (directory: {} KB)",
1231 num_trigrams,
1232 num_files,
1233 directory_size / 1024
1234 );
1235
1236 Ok(Self {
1237 index: Vec::new(), files,
1239 temp_index: None,
1240 mmap: Some(mmap), directory,
1242 partial_indices: Vec::new(),
1243 temp_dir: None,
1244 max_posting_list_entries: 0,
1245 })
1246 }
1247}
1248
1249impl Default for TrigramIndex {
1250 fn default() -> Self {
1251 Self::new()
1252 }
1253}
1254
1255pub fn extract_trigrams(text: &str) -> Vec<Trigram> {
1259 let bytes = text.as_bytes();
1260 let mut trigrams = Vec::new();
1261
1262 for i in 0..bytes.len().saturating_sub(2) {
1263 let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1264 trigrams.push(trigram);
1265 }
1266
1267 trigrams
1268}
1269
1270pub fn extract_trigrams_with_locations(text: &str, file_id: u32) -> Vec<(Trigram, FileLocation)> {
1274 let bytes = text.as_bytes();
1275 let mut result = Vec::new();
1276
1277 let mut line_no = 1;
1278
1279 for (i, &byte) in bytes.iter().enumerate() {
1280 if byte == b'\n' {
1282 line_no += 1;
1283 }
1284
1285 if i + 2 < bytes.len() {
1287 let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1288 let location = FileLocation::new(file_id, line_no, i as u32);
1289 result.push((trigram, location));
1290 }
1291 }
1292
1293 result
1294}
1295
1296#[inline]
1298fn bytes_to_trigram(bytes: &[u8]) -> Trigram {
1299 debug_assert_eq!(bytes.len(), 3);
1300 (bytes[0] as u32) << 16 | (bytes[1] as u32) << 8 | (bytes[2] as u32)
1301}
1302
1303#[allow(dead_code)]
1305fn trigram_to_bytes(trigram: Trigram) -> [u8; 3] {
1306 [
1307 ((trigram >> 16) & 0xFF) as u8,
1308 ((trigram >> 8) & 0xFF) as u8,
1309 (trigram & 0xFF) as u8,
1310 ]
1311}
1312
1313fn intersect_by_file(lists: &[&Vec<FileLocation>]) -> Vec<FileLocation> {
1318 if lists.is_empty() {
1319 return vec![];
1320 }
1321
1322 use std::collections::HashSet;
1323
1324 let mut candidates: HashSet<(u32, u32)> = lists[0]
1326 .iter()
1327 .map(|loc| (loc.file_id, loc.line_no))
1328 .collect();
1329
1330 for &list in &lists[1..] {
1332 let list_pairs: HashSet<(u32, u32)> =
1333 list.iter().map(|loc| (loc.file_id, loc.line_no)).collect();
1334 candidates.retain(|pair| list_pairs.contains(pair));
1335 }
1336
1337 let mut result = Vec::new();
1339 for &(file_id, line_no) in &candidates {
1340 if let Some(loc) = lists[0]
1342 .iter()
1343 .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1344 {
1345 result.push(*loc);
1346 }
1347 }
1348
1349 result.sort_unstable();
1350 result
1351}
1352
1353fn intersect_by_file_owned(lists: &[Vec<FileLocation>]) -> Vec<FileLocation> {
1360 if lists.is_empty() {
1361 return vec![];
1362 }
1363
1364 use std::collections::HashSet;
1365
1366 let mut candidates: HashSet<(u32, u32)> = lists[0]
1368 .iter()
1369 .map(|loc| (loc.file_id, loc.line_no))
1370 .collect();
1371
1372 for list in &lists[1..] {
1374 let list_pairs: HashSet<(u32, u32)> =
1375 list.iter().map(|loc| (loc.file_id, loc.line_no)).collect();
1376 candidates.retain(|pair| list_pairs.contains(pair));
1377 }
1378
1379 let mut result = Vec::new();
1381 for &(file_id, line_no) in &candidates {
1382 if let Some(loc) = lists[0]
1384 .iter()
1385 .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1386 {
1387 result.push(*loc);
1388 }
1389 }
1390
1391 result.sort_unstable();
1392 result
1393}
1394
1395#[cfg(test)]
1396mod tests {
1397 use super::*;
1398
1399 #[test]
1400 fn test_extract_trigrams() {
1401 let text = "hello";
1402 let trigrams = extract_trigrams(text);
1403
1404 assert_eq!(trigrams.len(), 3);
1406
1407 let expected = vec![
1409 bytes_to_trigram(b"hel"),
1410 bytes_to_trigram(b"ell"),
1411 bytes_to_trigram(b"llo"),
1412 ];
1413 assert_eq!(trigrams, expected);
1414 }
1415
1416 #[test]
1417 fn test_extract_trigrams_short() {
1418 assert_eq!(extract_trigrams("ab").len(), 0);
1419 assert_eq!(extract_trigrams("abc").len(), 1);
1420 }
1421
1422 #[test]
1423 fn test_bytes_to_trigram() {
1424 let trigram1 = bytes_to_trigram(b"abc");
1425 let trigram2 = bytes_to_trigram(b"abc");
1426 let trigram3 = bytes_to_trigram(b"xyz");
1427
1428 assert_eq!(trigram1, trigram2);
1429 assert_ne!(trigram1, trigram3);
1430 }
1431
1432 #[test]
1433 fn test_trigram_roundtrip() {
1434 let original = b"foo";
1435 let trigram = bytes_to_trigram(original);
1436 let recovered = trigram_to_bytes(trigram);
1437 assert_eq!(original, &recovered);
1438 }
1439
1440 #[test]
1441 fn test_extract_with_locations() {
1442 let text = "hello\nworld";
1443 let locs = extract_trigrams_with_locations(text, 0);
1444
1445 assert_eq!(locs.len(), 9);
1448
1449 assert_eq!(locs[0].1.line_no, 1);
1451
1452 let world_start = text.find("world").unwrap();
1454 let world_trigram_idx = locs
1455 .iter()
1456 .position(|(_, loc)| loc.byte_offset as usize == world_start)
1457 .unwrap();
1458 assert_eq!(locs[world_trigram_idx].1.line_no, 2);
1459 }
1460
1461 #[test]
1462 fn test_trigram_index_basic() {
1463 let mut index = TrigramIndex::new();
1464
1465 let file_id = index.add_file(PathBuf::from("test.txt"));
1466 index.index_file(file_id, "hello world");
1467 index.finalize();
1468
1469 let results = index.search("hello");
1471 assert!(!results.is_empty());
1472
1473 let results = index.search("world");
1475 assert!(!results.is_empty());
1476
1477 let results = index.search("goodbye");
1479 assert!(results.is_empty());
1480 }
1481
1482 #[test]
1483 fn test_search_multifile() {
1484 let mut index = TrigramIndex::new();
1485
1486 let file1 = index.add_file(PathBuf::from("file1.txt"));
1487 let file2 = index.add_file(PathBuf::from("file2.txt"));
1488
1489 index.index_file(file1, "extract_symbols is here");
1490 index.index_file(file2, "extract_symbols is also here");
1491 index.finalize();
1492
1493 let results = index.search("extract_symbols");
1494 assert_eq!(results.len(), 2); let file_ids: Vec<u32> = results.iter().map(|loc| loc.file_id).collect();
1498 assert!(file_ids.contains(&file1));
1499 assert!(file_ids.contains(&file2));
1500 }
1501
1502 #[test]
1503 fn test_persistence_write() {
1504 use tempfile::TempDir;
1505
1506 let temp = TempDir::new().unwrap();
1507 let trigrams_path = temp.path().join("trigrams.bin");
1508
1509 let mut index = TrigramIndex::new();
1511 let file1 = index.add_file(PathBuf::from("src/main.rs"));
1512 let file2 = index.add_file(PathBuf::from("src/lib.rs"));
1513
1514 index.index_file(file1, "fn main() { println!(\"hello\"); }");
1515 index.index_file(
1516 file2,
1517 "pub fn hello() -> String { String::from(\"hello\") }",
1518 );
1519 index.finalize();
1520
1521 index.write(&trigrams_path).unwrap();
1523
1524 assert!(trigrams_path.exists());
1526
1527 let metadata = std::fs::metadata(&trigrams_path).unwrap();
1529 assert!(metadata.len() > HEADER_SIZE as u64);
1530
1531 use std::io::Read;
1533 let mut file = File::open(&trigrams_path).unwrap();
1534 let mut magic = [0u8; 4];
1535 file.read_exact(&mut magic).unwrap();
1536 assert_eq!(&magic, MAGIC);
1537
1538 }
1541
1542 #[test]
1543 fn test_posting_list_cap_enforced() {
1544 let cap: usize = 10;
1545 let content = "aaa ".repeat(200);
1546 let mut index = TrigramIndex::new();
1547 index.set_max_posting_list_entries(cap);
1548 let file_id = index.add_file(PathBuf::from("dense.txt"));
1549 index.index_file(file_id, &content);
1550 index.finalize();
1551 let aaa = bytes_to_trigram(b"aaa");
1552 let list = index
1553 .get_posting_list(aaa)
1554 .expect("aaa trigram should exist");
1555 assert!(list.len() <= cap, "cap exceeded: {} > {}", list.len(), cap);
1556 }
1557
1558 #[test]
1559 fn test_posting_list_cap_zero_means_unlimited() {
1560 let repetitions = 50;
1561 let content = "aaa ".repeat(repetitions);
1562 let mut index = TrigramIndex::new();
1563 index.set_max_posting_list_entries(0);
1564 let file_id = index.add_file(PathBuf::from("dense.txt"));
1565 index.index_file(file_id, &content);
1566 index.finalize();
1567 let aaa = bytes_to_trigram(b"aaa");
1568 let list = index
1569 .get_posting_list(aaa)
1570 .expect("aaa trigram should exist");
1571 assert!(
1572 list.len() >= repetitions,
1573 "expected >= {} entries, got {}",
1574 repetitions,
1575 list.len()
1576 );
1577 }
1578}