1use anyhow::{Context, Result};
17use std::collections::HashMap;
18use std::fs::{File, OpenOptions};
19use std::io::Write;
20use std::path::{Path, PathBuf};
21
22pub type Trigram = u32;
24
25const MAGIC: &[u8; 4] = b"RFTG"; const VERSION: u32 = 3; #[allow(dead_code)]
30const HEADER_SIZE: usize = 24;
31
32fn write_varint(writer: &mut impl Write, mut value: u32) -> std::io::Result<()> {
35 loop {
36 let mut byte = (value & 0x7F) as u8;
37 value >>= 7;
38 if value != 0 {
39 byte |= 0x80; }
41 writer.write_all(&[byte])?;
42 if value == 0 {
43 break;
44 }
45 }
46 Ok(())
47}
48
49fn read_varint(data: &[u8]) -> Result<(u32, usize)> {
51 let mut value: u32 = 0;
52 let mut shift = 0;
53 let mut pos = 0;
54
55 loop {
56 if pos >= data.len() {
57 anyhow::bail!("Truncated varint");
58 }
59 let byte = data[pos];
60 pos += 1;
61
62 value |= ((byte & 0x7F) as u32) << shift;
63 if byte & 0x80 == 0 {
64 break;
65 }
66 shift += 7;
67 if shift >= 32 {
68 anyhow::bail!("Varint too large");
69 }
70 }
71
72 Ok((value, pos))
73}
74
75fn decompress_posting_list(
85 mmap: &[u8],
86 offset: u64,
87 size: u32,
88) -> Result<Vec<FileLocation>> {
89 let start = offset as usize;
90 let end = start + size as usize;
91
92 if end > mmap.len() {
93 anyhow::bail!(
94 "Posting list out of bounds: offset={}, size={}, mmap_len={}",
95 offset,
96 size,
97 mmap.len()
98 );
99 }
100
101 let compressed_data = &mmap[start..end];
102
103 let mut locations = Vec::new();
105 let mut pos = 0;
106 let mut prev_file_id = 0u32;
107 let mut prev_line_no = 0u32;
108 let mut prev_byte_offset = 0u32;
109
110 while pos < compressed_data.len() {
111 let (file_id_delta, consumed) = read_varint(&compressed_data[pos..])?;
113 pos += consumed;
114
115 let (line_no_delta, consumed) = read_varint(&compressed_data[pos..])?;
117 pos += consumed;
118
119 let (byte_offset_delta, consumed) = read_varint(&compressed_data[pos..])?;
121 pos += consumed;
122
123 let file_id = prev_file_id.wrapping_add(file_id_delta);
125 let line_no = prev_line_no.wrapping_add(line_no_delta);
126 let byte_offset = prev_byte_offset.wrapping_add(byte_offset_delta);
127
128 locations.push(FileLocation {
129 file_id,
130 line_no,
131 byte_offset,
132 });
133
134 prev_file_id = file_id;
136 prev_line_no = line_no;
137 prev_byte_offset = byte_offset;
138 }
139
140 Ok(locations)
141}
142
143#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
145pub struct FileLocation {
146 pub file_id: u32,
148 pub line_no: u32,
150 pub byte_offset: u32,
152}
153
154impl FileLocation {
155 pub fn new(file_id: u32, line_no: u32, byte_offset: u32) -> Self {
156 Self {
157 file_id,
158 line_no,
159 byte_offset,
160 }
161 }
162}
163
164#[derive(Debug, Clone)]
169struct DirectoryEntry {
170 trigram: Trigram,
172 data_offset: u64,
174 compressed_size: u32,
176}
177
178pub struct TrigramIndex {
189 index: Vec<(Trigram, Vec<FileLocation>)>,
192 files: Vec<PathBuf>,
194 temp_index: Option<HashMap<Trigram, Vec<FileLocation>>>,
196 mmap: Option<memmap2::Mmap>,
198 directory: Vec<DirectoryEntry>,
200 partial_indices: Vec<PathBuf>,
202 temp_dir: Option<PathBuf>,
204 max_posting_list_entries: usize,
207}
208
209impl TrigramIndex {
210 pub fn new() -> Self {
212 Self {
213 index: Vec::new(),
214 files: Vec::new(),
215 temp_index: Some(HashMap::new()),
216 mmap: None,
217 directory: Vec::new(),
218 partial_indices: Vec::new(),
219 temp_dir: None,
220 max_posting_list_entries: 0,
221 }
222 }
223
224 pub fn set_max_posting_list_entries(&mut self, cap: usize) {
226 self.max_posting_list_entries = cap;
227 }
228
229 pub fn enable_batch_flush(&mut self, temp_dir: PathBuf) -> Result<()> {
234 std::fs::create_dir_all(&temp_dir)
235 .context("Failed to create temp directory for batch flushing")?;
236 self.temp_dir = Some(temp_dir);
237 log::info!("Enabled batch-flush mode for trigram index");
238 Ok(())
239 }
240
241 pub fn flush_batch(&mut self) -> Result<()> {
246 let temp_dir = self.temp_dir.as_ref()
247 .ok_or_else(|| anyhow::anyhow!("Batch flush not enabled - call enable_batch_flush() first"))?;
248
249 let temp_map = self.temp_index.take()
251 .ok_or_else(|| anyhow::anyhow!("No temp index to flush"))?;
252
253 if temp_map.is_empty() {
254 self.temp_index = Some(HashMap::new());
256 return Ok(());
257 }
258
259 let mut partial_index: Vec<(Trigram, Vec<FileLocation>)> = temp_map.into_iter().collect();
261
262 for (_, list) in partial_index.iter_mut() {
264 list.sort_unstable();
265 list.dedup();
266 }
267
268 partial_index.sort_unstable_by_key(|(trigram, _)| *trigram);
270
271 let partial_file = temp_dir.join(format!("partial_{}.bin", self.partial_indices.len()));
273 self.write_partial_index(&partial_file, &partial_index)?;
274
275 self.partial_indices.push(partial_file);
276
277 self.temp_index = Some(HashMap::new());
279
280 log::debug!(
281 "Flushed batch {} with {} trigrams to disk",
282 self.partial_indices.len(),
283 partial_index.len()
284 );
285
286 Ok(())
287 }
288
289 fn write_partial_index(
291 &self,
292 path: &Path,
293 index: &[(Trigram, Vec<FileLocation>)],
294 ) -> Result<()> {
295 use std::io::BufWriter;
296
297 let file = OpenOptions::new()
298 .create(true)
299 .write(true)
300 .truncate(true)
301 .open(path)?;
302
303 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
304
305 writer.write_all(&(index.len() as u64).to_le_bytes())?;
307
308 for (trigram, locations) in index {
310 writer.write_all(&trigram.to_le_bytes())?;
311 writer.write_all(&(locations.len() as u32).to_le_bytes())?;
312
313 for loc in locations {
314 writer.write_all(&loc.file_id.to_le_bytes())?;
315 writer.write_all(&loc.line_no.to_le_bytes())?;
316 writer.write_all(&loc.byte_offset.to_le_bytes())?;
317 }
318 }
319
320 writer.flush()?;
321 Ok(())
322 }
323
324 pub fn add_file(&mut self, path: PathBuf) -> u32 {
326 let file_id = self.files.len() as u32;
327 self.files.push(path);
328 file_id
329 }
330
331 pub fn get_file(&self, file_id: u32) -> Option<&PathBuf> {
333 self.files.get(file_id as usize)
334 }
335
336 pub fn file_count(&self) -> usize {
338 self.files.len()
339 }
340
341 pub fn trigram_count(&self) -> usize {
343 if !self.directory.is_empty() {
344 self.directory.len()
346 } else {
347 self.index.len()
349 }
350 }
351
352 pub fn index_file(&mut self, file_id: u32, content: &str) {
357 let trigrams = extract_trigrams_with_locations(content, file_id);
358
359 if let Some(ref mut temp_map) = self.temp_index {
361 for (trigram, location) in trigrams {
362 temp_map
363 .entry(trigram)
364 .or_insert_with(Vec::new)
365 .push(location);
366 }
367 } else {
368 panic!("Cannot call index_file() after finalize(). Index is read-only.");
369 }
370 }
371
372 pub fn build_from_trigrams(&mut self, trigrams: Vec<(Trigram, FileLocation)>) {
377 let mut temp_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
378
379 for (trigram, location) in trigrams {
381 temp_map
382 .entry(trigram)
383 .or_insert_with(Vec::new)
384 .push(location);
385 }
386
387 self.index = temp_map.into_iter().collect();
389
390 self.temp_index = None;
392
393 self.finalize();
395 }
396
397 pub fn finalize(&mut self) {
405 if !self.partial_indices.is_empty() {
408 log::info!("Deferring finalization - will stream merge {} partial indices during write()",
409 self.partial_indices.len());
410
411 if let Some(ref temp_map) = self.temp_index {
413 if !temp_map.is_empty() {
414 self.flush_batch().expect("Failed to flush final batch");
415 }
416 }
417
418 return;
420 }
421
422 if let Some(temp_map) = self.temp_index.take() {
425 self.index = temp_map.into_iter().collect();
426 }
427
428 let cap = self.max_posting_list_entries;
430 for (trigram, list) in self.index.iter_mut() {
431 list.sort_unstable();
432 list.dedup(); if cap > 0 && list.len() > cap {
434 log::warn!("Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.", trigram, list.len(), cap);
435 list.truncate(cap);
436 }
437 }
438
439 self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
441 }
442
443 fn merge_partial_indices_to_file(&mut self, output_path: &Path) -> Result<()> {
451 use std::io::{BufReader, BufWriter, Read};
452 use std::cmp::Ordering;
453 use std::collections::BinaryHeap;
454
455 log::info!("Streaming merge of {} partial indices to {:?}",
456 self.partial_indices.len(), output_path);
457
458 struct PartialIndexReader {
460 reader: BufReader<File>,
461 current_trigram: Option<Trigram>,
462 current_posting_list: Vec<FileLocation>,
463 reader_id: usize,
464 }
465
466 let mut readers: Vec<PartialIndexReader> = Vec::new();
467
468 for (idx, partial_path) in self.partial_indices.iter().enumerate() {
469 let file = File::open(partial_path)
470 .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
471 let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
472
473 let mut buf = [0u8; 8];
475 reader.read_exact(&mut buf)?;
476
477 readers.push(PartialIndexReader {
478 reader,
479 current_trigram: None,
480 current_posting_list: Vec::new(),
481 reader_id: idx,
482 });
483 }
484
485 fn read_next_trigram(reader: &mut PartialIndexReader) -> Result<bool> {
487 let mut trigram_buf = [0u8; 4];
489 match reader.reader.read_exact(&mut trigram_buf) {
490 Ok(_) => {
491 let trigram = u32::from_le_bytes(trigram_buf);
492
493 let mut len_buf = [0u8; 4];
495 reader.reader.read_exact(&mut len_buf)?;
496 let list_len = u32::from_le_bytes(len_buf) as usize;
497
498 let mut locations = Vec::with_capacity(list_len);
500 for _ in 0..list_len {
501 let mut loc_buf = [0u8; 12];
502 reader.reader.read_exact(&mut loc_buf)?;
503
504 let file_id = u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
505 let line_no = u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
506 let byte_offset = u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
507
508 locations.push(FileLocation { file_id, line_no, byte_offset });
509 }
510
511 reader.current_trigram = Some(trigram);
512 reader.current_posting_list = locations;
513 Ok(true)
514 }
515 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
516 reader.current_trigram = None;
517 Ok(false)
518 }
519 Err(e) => Err(e.into()),
520 }
521 }
522
523 for reader in &mut readers {
525 read_next_trigram(reader)?;
526 }
527
528 #[derive(Eq, PartialEq)]
530 struct HeapEntry {
531 trigram: Trigram,
532 reader_id: usize,
533 }
534
535 impl Ord for HeapEntry {
536 fn cmp(&self, other: &Self) -> Ordering {
537 other.trigram.cmp(&self.trigram)
539 .then_with(|| other.reader_id.cmp(&self.reader_id))
540 }
541 }
542
543 impl PartialOrd for HeapEntry {
544 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
545 Some(self.cmp(other))
546 }
547 }
548
549 let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::new();
551 for reader in &readers {
552 if let Some(trigram) = reader.current_trigram {
553 heap.push(HeapEntry {
554 trigram,
555 reader_id: reader.reader_id,
556 });
557 }
558 }
559
560 let file = OpenOptions::new()
562 .create(true)
563 .write(true)
564 .truncate(true)
565 .open(output_path)
566 .with_context(|| format!("Failed to create {}", output_path.display()))?;
567
568 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
569
570 writer.write_all(MAGIC)?;
572 writer.write_all(&VERSION.to_le_bytes())?;
573 writer.write_all(&0u64.to_le_bytes())?; writer.write_all(&(self.files.len() as u64).to_le_bytes())?; let mut directory: Vec<DirectoryEntry> = Vec::new();
578 let mut num_trigrams = 0u64;
579
580 let mut current_trigram: Option<Trigram> = None;
582 let mut merged_locations: Vec<FileLocation> = Vec::new();
583
584 while let Some(entry) = heap.pop() {
585 let reader = &mut readers[entry.reader_id];
586
587 if current_trigram.is_some() && current_trigram != Some(entry.trigram) {
589 let trigram = current_trigram.unwrap();
591 merged_locations.sort_unstable();
592 merged_locations.dedup();
593
594 let cap = self.max_posting_list_entries;
595 if cap > 0 && merged_locations.len() > cap {
596 log::warn!("Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.", trigram, merged_locations.len(), cap);
597 merged_locations.truncate(cap);
598 }
599
600 let data_offset = writer.stream_position()?;
602 let compressed_size = self.write_compressed_posting_list(&mut writer, &merged_locations)?;
603
604 directory.push(DirectoryEntry {
605 trigram,
606 data_offset,
607 compressed_size,
608 });
609
610 num_trigrams += 1;
611 merged_locations.clear();
612 }
613
614 current_trigram = Some(entry.trigram);
616
617 merged_locations.extend_from_slice(&reader.current_posting_list);
619
620 if read_next_trigram(reader)? {
622 if let Some(next_trigram) = reader.current_trigram {
623 heap.push(HeapEntry {
624 trigram: next_trigram,
625 reader_id: entry.reader_id,
626 });
627 }
628 }
629 }
630
631 if let Some(trigram) = current_trigram {
633 merged_locations.sort_unstable();
634 merged_locations.dedup();
635
636 let cap = self.max_posting_list_entries;
637 if cap > 0 && merged_locations.len() > cap {
638 log::warn!("Trigram 0x{:06X} posting list has {} entries (cap {}); truncating.", trigram, merged_locations.len(), cap);
639 merged_locations.truncate(cap);
640 }
641
642 let data_offset = writer.stream_position()?;
643 let compressed_size = self.write_compressed_posting_list(&mut writer, &merged_locations)?;
644
645 directory.push(DirectoryEntry {
646 trigram,
647 data_offset,
648 compressed_size,
649 });
650
651 num_trigrams += 1;
652 }
653
654 log::info!("Merged {} trigrams from {} partial indices", num_trigrams, self.partial_indices.len());
655
656 let _data_end_pos = writer.stream_position()?;
658
659 for file_path in &self.files {
661 let path_str = file_path.to_string_lossy();
662 let path_bytes = path_str.as_bytes();
663 write_varint(&mut writer, path_bytes.len() as u32)?;
664 writer.write_all(path_bytes)?;
665 }
666
667 writer.flush()?;
669 drop(writer);
670
671 use std::io::{Seek, SeekFrom};
674
675 let mut temp_data = Vec::new();
677 {
678 let mut file = File::open(output_path)?;
679 file.seek(SeekFrom::Start(HEADER_SIZE as u64))?;
680 file.read_to_end(&mut temp_data)?;
681 }
682
683 let file = OpenOptions::new().write(true).truncate(true).open(output_path)?;
685 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
686
687 writer.write_all(MAGIC)?;
689 writer.write_all(&VERSION.to_le_bytes())?;
690 writer.write_all(&num_trigrams.to_le_bytes())?;
691 writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
692
693 for entry in &directory {
695 writer.write_all(&entry.trigram.to_le_bytes())?;
696 let adjusted_offset = entry.data_offset + (directory.len() * 16) as u64;
698 writer.write_all(&adjusted_offset.to_le_bytes())?;
699 writer.write_all(&entry.compressed_size.to_le_bytes())?;
700 }
701
702 writer.write_all(&temp_data)?;
704
705 writer.flush()?;
707 writer.get_ref().sync_all()?;
708
709 for partial_path in &self.partial_indices {
711 let _ = std::fs::remove_file(partial_path);
712 }
713 if let Some(ref temp_dir) = self.temp_dir {
714 let _ = std::fs::remove_dir(temp_dir);
715 }
716
717 log::info!("Wrote {} trigrams to {:?}", num_trigrams, output_path);
718
719 Ok(())
720 }
721
722 fn write_compressed_posting_list(
724 &self,
725 writer: &mut impl Write,
726 locations: &[FileLocation],
727 ) -> Result<u32> {
728 let mut compressed = Vec::new();
729
730 let mut prev_file_id = 0u32;
732 let mut prev_line_no = 0u32;
733 let mut prev_byte_offset = 0u32;
734
735 for loc in locations {
736 let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
738 let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
739 let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
740
741 write_varint(&mut compressed, file_id_delta)?;
743 write_varint(&mut compressed, line_no_delta)?;
744 write_varint(&mut compressed, byte_offset_delta)?;
745
746 prev_file_id = loc.file_id;
748 prev_line_no = loc.line_no;
749 prev_byte_offset = loc.byte_offset;
750 }
751
752 let compressed_size = compressed.len() as u32;
753 writer.write_all(&compressed)?;
754
755 Ok(compressed_size)
756 }
757
758 #[allow(dead_code)]
760 fn merge_partial_indices(&mut self) -> Result<()> {
761 use std::io::{BufReader, Read};
762
763 let mut all_entries: Vec<(Trigram, FileLocation)> = Vec::new();
765
766 for partial_path in &self.partial_indices {
767 let file = File::open(partial_path)
768 .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
769 let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
770
771 let mut buf = [0u8; 8];
773 reader.read_exact(&mut buf)?;
774 let num_trigrams = u64::from_le_bytes(buf) as usize;
775
776 for _ in 0..num_trigrams {
778 let mut trigram_buf = [0u8; 4];
780 reader.read_exact(&mut trigram_buf)?;
781 let trigram = u32::from_le_bytes(trigram_buf);
782
783 let mut len_buf = [0u8; 4];
785 reader.read_exact(&mut len_buf)?;
786 let list_len = u32::from_le_bytes(len_buf) as usize;
787
788 for _ in 0..list_len {
790 let mut loc_buf = [0u8; 12]; reader.read_exact(&mut loc_buf)?;
792
793 let file_id = u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
794 let line_no = u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
795 let byte_offset = u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
796
797 all_entries.push((trigram, FileLocation { file_id, line_no, byte_offset }));
798 }
799 }
800 }
801
802 log::info!("Read {} total trigram entries from {} partial indices",
803 all_entries.len(), self.partial_indices.len());
804
805 let mut index_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
807 for (trigram, location) in all_entries {
808 index_map
809 .entry(trigram)
810 .or_insert_with(Vec::new)
811 .push(location);
812 }
813
814 self.index = index_map.into_iter().collect();
816
817 for (_, list) in self.index.iter_mut() {
819 list.sort_unstable();
820 list.dedup();
821 }
822
823 self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
825
826 for partial_path in &self.partial_indices {
828 let _ = std::fs::remove_file(partial_path);
829 }
830 if let Some(ref temp_dir) = self.temp_dir {
831 let _ = std::fs::remove_dir(temp_dir);
832 }
833
834 log::info!("Merged into final index with {} trigrams", self.index.len());
835
836 Ok(())
837 }
838
839 pub fn search(&self, pattern: &str) -> Vec<FileLocation> {
847 if pattern.len() < 3 {
848 return vec![];
850 }
851
852 let trigrams = extract_trigrams(pattern);
853 if trigrams.is_empty() {
854 return vec![];
855 }
856
857 if let Some(ref mmap) = self.mmap {
859 let mut posting_lists: Vec<Vec<FileLocation>> = Vec::new();
861
862 for trigram in &trigrams {
863 match self.directory.binary_search_by_key(trigram, |e| e.trigram) {
865 Ok(idx) => {
866 let entry = &self.directory[idx];
867 match decompress_posting_list(mmap, entry.data_offset, entry.compressed_size) {
869 Ok(locations) => posting_lists.push(locations),
870 Err(e) => {
871 log::warn!("Failed to decompress posting list for trigram {}: {}", trigram, e);
872 return vec![];
873 }
874 }
875 }
876 Err(_) => {
877 return vec![];
879 }
880 }
881 }
882
883 if posting_lists.is_empty() || posting_lists.len() < trigrams.len() {
884 return vec![];
885 }
886
887 posting_lists.sort_by_key(|list| list.len());
889
890 intersect_by_file_owned(&posting_lists)
892 } else {
893 let mut posting_lists: Vec<&Vec<FileLocation>> = trigrams
895 .iter()
896 .filter_map(|t| {
897 self.index
898 .binary_search_by_key(t, |(trigram, _)| *trigram)
899 .ok()
900 .map(|idx| &self.index[idx].1)
901 })
902 .collect();
903
904 if posting_lists.is_empty() {
905 return vec![];
906 }
907
908 if posting_lists.len() < trigrams.len() {
909 return vec![];
911 }
912
913 posting_lists.sort_by_key(|list| list.len());
915
916 intersect_by_file(&posting_lists)
918 }
919 }
920
921 pub fn get_posting_list(&self, trigram: Trigram) -> Option<&Vec<FileLocation>> {
923 self.index
924 .binary_search_by_key(&trigram, |(t, _)| *t)
925 .ok()
926 .map(|idx| &self.index[idx].1)
927 }
928
929 pub fn write(&mut self, path: impl AsRef<Path>) -> Result<()> {
943 let path = path.as_ref();
944
945 if !self.partial_indices.is_empty() {
947 log::info!("Using streaming merge to write {} partial indices", self.partial_indices.len());
948 return self.merge_partial_indices_to_file(path);
949 }
950
951 let file = OpenOptions::new()
956 .create(true)
957 .write(true)
958 .truncate(true)
959 .open(path)
960 .with_context(|| format!("Failed to create {}", path.display()))?;
961
962 let mut writer = std::io::BufWriter::with_capacity(16 * 1024 * 1024, file);
964
965 writer.write_all(MAGIC)?;
967 writer.write_all(&VERSION.to_le_bytes())?;
968 writer.write_all(&(self.index.len() as u64).to_le_bytes())?; writer.write_all(&(self.files.len() as u64).to_le_bytes())?; let mut directory: Vec<DirectoryEntry> = Vec::with_capacity(self.index.len());
973
974 let directory_start = HEADER_SIZE as u64;
976 let directory_size = self.index.len() * 16;
977
978 let data_start = directory_start + directory_size as u64;
980 let mut current_offset = data_start;
981
982 let mut compressed_lists: Vec<(Trigram, Vec<u8>)> = Vec::with_capacity(self.index.len());
988
989 for (trigram, locations) in &self.index {
990 let mut compressed = Vec::new();
992 let mut prev_file_id = 0u32;
993 let mut prev_line_no = 0u32;
994 let mut prev_byte_offset = 0u32;
995
996 for loc in locations {
997 let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
998 let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
999 let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
1000
1001 write_varint(&mut compressed, file_id_delta)?;
1002 write_varint(&mut compressed, line_no_delta)?;
1003 write_varint(&mut compressed, byte_offset_delta)?;
1004
1005 prev_file_id = loc.file_id;
1006 prev_line_no = loc.line_no;
1007 prev_byte_offset = loc.byte_offset;
1008 }
1009
1010 directory.push(DirectoryEntry {
1011 trigram: *trigram,
1012 data_offset: current_offset,
1013 compressed_size: compressed.len() as u32,
1014 });
1015 current_offset += compressed.len() as u64;
1016
1017 compressed_lists.push((*trigram, compressed));
1018 }
1019
1020 for entry in &directory {
1022 writer.write_all(&entry.trigram.to_le_bytes())?;
1023 writer.write_all(&entry.data_offset.to_le_bytes())?;
1024 writer.write_all(&entry.compressed_size.to_le_bytes())?;
1025 }
1026
1027 for (_, compressed) in &compressed_lists {
1029 writer.write_all(compressed)?;
1030 }
1031
1032 for file_path in &self.files {
1034 let path_str = file_path.to_string_lossy();
1035 let path_bytes = path_str.as_bytes();
1036 write_varint(&mut writer, path_bytes.len() as u32)?;
1037 writer.write_all(path_bytes)?;
1038 }
1039
1040 writer.flush()?;
1042 writer.get_ref().sync_all()?;
1043
1044 log::info!(
1045 "Wrote lazy-loadable trigram index: {} trigrams, {} files to {:?}",
1046 self.index.len(),
1047 self.files.len(),
1048 path
1049 );
1050
1051 Ok(())
1052 }
1053
1054 pub fn load(path: impl AsRef<Path>) -> Result<Self> {
1059 let path = path.as_ref();
1060
1061 let file = File::open(path)
1062 .with_context(|| format!("Failed to open {}", path.display()))?;
1063
1064 let mmap = unsafe {
1066 memmap2::Mmap::map(&file)
1067 .with_context(|| format!("Failed to mmap {}", path.display()))?
1068 };
1069
1070 if mmap.len() < HEADER_SIZE {
1072 anyhow::bail!("trigrams.bin too small (expected at least {} bytes)", HEADER_SIZE);
1073 }
1074
1075 if &mmap[0..4] != MAGIC {
1076 anyhow::bail!("Invalid trigrams.bin (wrong magic bytes)");
1077 }
1078
1079 let version = u32::from_le_bytes([mmap[4], mmap[5], mmap[6], mmap[7]]);
1080 if version != VERSION {
1081 anyhow::bail!(
1082 "Unsupported trigrams.bin version: {} (expected {}). Please re-index with 'reflex index'.",
1083 version, VERSION
1084 );
1085 }
1086
1087 let num_trigrams = u64::from_le_bytes([
1088 mmap[8], mmap[9], mmap[10], mmap[11],
1089 mmap[12], mmap[13], mmap[14], mmap[15],
1090 ]) as usize;
1091
1092 let num_files = u64::from_le_bytes([
1093 mmap[16], mmap[17], mmap[18], mmap[19],
1094 mmap[20], mmap[21], mmap[22], mmap[23],
1095 ]) as usize;
1096
1097 log::debug!("Loading lazy trigram index: {} trigrams, {} files", num_trigrams, num_files);
1098
1099 let mut directory = Vec::with_capacity(num_trigrams);
1101 let mut pos = HEADER_SIZE;
1102 let directory_size = num_trigrams * 16; for _ in 0..num_trigrams {
1105 if pos + 16 > mmap.len() {
1106 anyhow::bail!("Truncated directory entry at pos={}", pos);
1107 }
1108
1109 let trigram = u32::from_le_bytes([
1110 mmap[pos],
1111 mmap[pos + 1],
1112 mmap[pos + 2],
1113 mmap[pos + 3],
1114 ]);
1115 pos += 4;
1116
1117 let data_offset = u64::from_le_bytes([
1118 mmap[pos],
1119 mmap[pos + 1],
1120 mmap[pos + 2],
1121 mmap[pos + 3],
1122 mmap[pos + 4],
1123 mmap[pos + 5],
1124 mmap[pos + 6],
1125 mmap[pos + 7],
1126 ]);
1127 pos += 8;
1128
1129 let compressed_size = u32::from_le_bytes([
1130 mmap[pos],
1131 mmap[pos + 1],
1132 mmap[pos + 2],
1133 mmap[pos + 3],
1134 ]);
1135 pos += 4;
1136
1137 directory.push(DirectoryEntry {
1138 trigram,
1139 data_offset,
1140 compressed_size,
1141 });
1142 }
1143
1144 directory.sort_unstable_by_key(|e| e.trigram);
1146
1147 let data_section_size: u64 = directory.iter().map(|e| e.compressed_size as u64).sum();
1149 let files_section_offset = HEADER_SIZE + directory_size + data_section_size as usize;
1150 pos = files_section_offset;
1151
1152 let mut files = Vec::with_capacity(num_files);
1154 for _ in 0..num_files {
1155 let (path_len, consumed) = read_varint(&mmap[pos..])?;
1157 pos += consumed;
1158 let path_len = path_len as usize;
1159
1160 if pos + path_len > mmap.len() {
1161 anyhow::bail!("Truncated file path at pos={}", pos);
1162 }
1163
1164 let path_bytes = &mmap[pos..pos + path_len];
1165 let path_str = std::str::from_utf8(path_bytes)
1166 .context("Invalid UTF-8 in file path")?;
1167 files.push(PathBuf::from(path_str));
1168 pos += path_len;
1169 }
1170
1171 log::info!(
1172 "Loaded lazy trigram index: {} trigrams, {} files (directory: {} KB)",
1173 num_trigrams,
1174 num_files,
1175 directory_size / 1024
1176 );
1177
1178 Ok(Self {
1179 index: Vec::new(), files,
1181 temp_index: None,
1182 mmap: Some(mmap), directory,
1184 partial_indices: Vec::new(),
1185 temp_dir: None,
1186 max_posting_list_entries: 0,
1187 })
1188 }
1189}
1190
1191impl Default for TrigramIndex {
1192 fn default() -> Self {
1193 Self::new()
1194 }
1195}
1196
1197pub fn extract_trigrams(text: &str) -> Vec<Trigram> {
1201 let bytes = text.as_bytes();
1202 let mut trigrams = Vec::new();
1203
1204 for i in 0..bytes.len().saturating_sub(2) {
1205 let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1206 trigrams.push(trigram);
1207 }
1208
1209 trigrams
1210}
1211
1212pub fn extract_trigrams_with_locations(text: &str, file_id: u32) -> Vec<(Trigram, FileLocation)> {
1216 let bytes = text.as_bytes();
1217 let mut result = Vec::new();
1218
1219 let mut line_no = 1;
1220
1221 for (i, &byte) in bytes.iter().enumerate() {
1222 if byte == b'\n' {
1224 line_no += 1;
1225 }
1226
1227 if i + 2 < bytes.len() {
1229 let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1230 let location = FileLocation::new(file_id, line_no, i as u32);
1231 result.push((trigram, location));
1232 }
1233 }
1234
1235 result
1236}
1237
1238#[inline]
1240fn bytes_to_trigram(bytes: &[u8]) -> Trigram {
1241 debug_assert_eq!(bytes.len(), 3);
1242 (bytes[0] as u32) << 16 | (bytes[1] as u32) << 8 | (bytes[2] as u32)
1243}
1244
1245#[allow(dead_code)]
1247fn trigram_to_bytes(trigram: Trigram) -> [u8; 3] {
1248 [
1249 ((trigram >> 16) & 0xFF) as u8,
1250 ((trigram >> 8) & 0xFF) as u8,
1251 (trigram & 0xFF) as u8,
1252 ]
1253}
1254
1255fn intersect_by_file(lists: &[&Vec<FileLocation>]) -> Vec<FileLocation> {
1260 if lists.is_empty() {
1261 return vec![];
1262 }
1263
1264 use std::collections::HashSet;
1265
1266 let mut candidates: HashSet<(u32, u32)> = lists[0]
1268 .iter()
1269 .map(|loc| (loc.file_id, loc.line_no))
1270 .collect();
1271
1272 for &list in &lists[1..] {
1274 let list_pairs: HashSet<(u32, u32)> = list
1275 .iter()
1276 .map(|loc| (loc.file_id, loc.line_no))
1277 .collect();
1278 candidates.retain(|pair| list_pairs.contains(pair));
1279 }
1280
1281 let mut result = Vec::new();
1283 for &(file_id, line_no) in &candidates {
1284 if let Some(loc) = lists[0]
1286 .iter()
1287 .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1288 {
1289 result.push(*loc);
1290 }
1291 }
1292
1293 result.sort_unstable();
1294 result
1295}
1296
1297fn intersect_by_file_owned(lists: &[Vec<FileLocation>]) -> Vec<FileLocation> {
1304 if lists.is_empty() {
1305 return vec![];
1306 }
1307
1308 use std::collections::HashSet;
1309
1310 let mut candidates: HashSet<(u32, u32)> = lists[0]
1312 .iter()
1313 .map(|loc| (loc.file_id, loc.line_no))
1314 .collect();
1315
1316 for list in &lists[1..] {
1318 let list_pairs: HashSet<(u32, u32)> = list
1319 .iter()
1320 .map(|loc| (loc.file_id, loc.line_no))
1321 .collect();
1322 candidates.retain(|pair| list_pairs.contains(pair));
1323 }
1324
1325 let mut result = Vec::new();
1327 for &(file_id, line_no) in &candidates {
1328 if let Some(loc) = lists[0]
1330 .iter()
1331 .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1332 {
1333 result.push(*loc);
1334 }
1335 }
1336
1337 result.sort_unstable();
1338 result
1339}
1340
1341#[cfg(test)]
1342mod tests {
1343 use super::*;
1344
1345 #[test]
1346 fn test_extract_trigrams() {
1347 let text = "hello";
1348 let trigrams = extract_trigrams(text);
1349
1350 assert_eq!(trigrams.len(), 3);
1352
1353 let expected = vec![
1355 bytes_to_trigram(b"hel"),
1356 bytes_to_trigram(b"ell"),
1357 bytes_to_trigram(b"llo"),
1358 ];
1359 assert_eq!(trigrams, expected);
1360 }
1361
1362 #[test]
1363 fn test_extract_trigrams_short() {
1364 assert_eq!(extract_trigrams("ab").len(), 0);
1365 assert_eq!(extract_trigrams("abc").len(), 1);
1366 }
1367
1368 #[test]
1369 fn test_bytes_to_trigram() {
1370 let trigram1 = bytes_to_trigram(b"abc");
1371 let trigram2 = bytes_to_trigram(b"abc");
1372 let trigram3 = bytes_to_trigram(b"xyz");
1373
1374 assert_eq!(trigram1, trigram2);
1375 assert_ne!(trigram1, trigram3);
1376 }
1377
1378 #[test]
1379 fn test_trigram_roundtrip() {
1380 let original = b"foo";
1381 let trigram = bytes_to_trigram(original);
1382 let recovered = trigram_to_bytes(trigram);
1383 assert_eq!(original, &recovered);
1384 }
1385
1386 #[test]
1387 fn test_extract_with_locations() {
1388 let text = "hello\nworld";
1389 let locs = extract_trigrams_with_locations(text, 0);
1390
1391 assert_eq!(locs.len(), 9);
1394
1395 assert_eq!(locs[0].1.line_no, 1);
1397
1398 let world_start = text.find("world").unwrap();
1400 let world_trigram_idx = locs
1401 .iter()
1402 .position(|(_, loc)| loc.byte_offset as usize == world_start)
1403 .unwrap();
1404 assert_eq!(locs[world_trigram_idx].1.line_no, 2);
1405 }
1406
1407 #[test]
1408 fn test_trigram_index_basic() {
1409 let mut index = TrigramIndex::new();
1410
1411 let file_id = index.add_file(PathBuf::from("test.txt"));
1412 index.index_file(file_id, "hello world");
1413 index.finalize();
1414
1415 let results = index.search("hello");
1417 assert!(!results.is_empty());
1418
1419 let results = index.search("world");
1421 assert!(!results.is_empty());
1422
1423 let results = index.search("goodbye");
1425 assert!(results.is_empty());
1426 }
1427
1428 #[test]
1429 fn test_search_multifile() {
1430 let mut index = TrigramIndex::new();
1431
1432 let file1 = index.add_file(PathBuf::from("file1.txt"));
1433 let file2 = index.add_file(PathBuf::from("file2.txt"));
1434
1435 index.index_file(file1, "extract_symbols is here");
1436 index.index_file(file2, "extract_symbols is also here");
1437 index.finalize();
1438
1439 let results = index.search("extract_symbols");
1440 assert_eq!(results.len(), 2); let file_ids: Vec<u32> = results.iter().map(|loc| loc.file_id).collect();
1444 assert!(file_ids.contains(&file1));
1445 assert!(file_ids.contains(&file2));
1446 }
1447
1448 #[test]
1449 fn test_persistence_write() {
1450 use tempfile::TempDir;
1451
1452 let temp = TempDir::new().unwrap();
1453 let trigrams_path = temp.path().join("trigrams.bin");
1454
1455 let mut index = TrigramIndex::new();
1457 let file1 = index.add_file(PathBuf::from("src/main.rs"));
1458 let file2 = index.add_file(PathBuf::from("src/lib.rs"));
1459
1460 index.index_file(file1, "fn main() { println!(\"hello\"); }");
1461 index.index_file(file2, "pub fn hello() -> String { String::from(\"hello\") }");
1462 index.finalize();
1463
1464 index.write(&trigrams_path).unwrap();
1466
1467 assert!(trigrams_path.exists());
1469
1470 let metadata = std::fs::metadata(&trigrams_path).unwrap();
1472 assert!(metadata.len() > HEADER_SIZE as u64);
1473
1474 use std::io::Read;
1476 let mut file = File::open(&trigrams_path).unwrap();
1477 let mut magic = [0u8; 4];
1478 file.read_exact(&mut magic).unwrap();
1479 assert_eq!(&magic, MAGIC);
1480
1481 }
1484
1485 #[test]
1486 fn test_posting_list_cap_enforced() {
1487 let cap: usize = 10;
1488 let content = "aaa ".repeat(200);
1489 let mut index = TrigramIndex::new();
1490 index.set_max_posting_list_entries(cap);
1491 let file_id = index.add_file(PathBuf::from("dense.txt"));
1492 index.index_file(file_id, &content);
1493 index.finalize();
1494 let aaa = bytes_to_trigram(b"aaa");
1495 let list = index.get_posting_list(aaa).expect("aaa trigram should exist");
1496 assert!(list.len() <= cap, "cap exceeded: {} > {}", list.len(), cap);
1497 }
1498
1499 #[test]
1500 fn test_posting_list_cap_zero_means_unlimited() {
1501 let repetitions = 50;
1502 let content = "aaa ".repeat(repetitions);
1503 let mut index = TrigramIndex::new();
1504 index.set_max_posting_list_entries(0);
1505 let file_id = index.add_file(PathBuf::from("dense.txt"));
1506 index.index_file(file_id, &content);
1507 index.finalize();
1508 let aaa = bytes_to_trigram(b"aaa");
1509 let list = index.get_posting_list(aaa).expect("aaa trigram should exist");
1510 assert!(list.len() >= repetitions, "expected >= {} entries, got {}", repetitions, list.len());
1511 }
1512}