1use anyhow::{Context, Result};
17use rkyv::{Archive, Deserialize, Serialize};
18use std::collections::HashMap;
19use std::fs::{File, OpenOptions};
20use std::io::Write;
21use std::path::{Path, PathBuf};
22
23pub type Trigram = u32;
25
26const MAGIC: &[u8; 4] = b"RFTG"; const VERSION: u32 = 3; #[allow(dead_code)]
31const HEADER_SIZE: usize = 24;
32
33fn write_varint(writer: &mut impl Write, mut value: u32) -> std::io::Result<()> {
36 loop {
37 let mut byte = (value & 0x7F) as u8;
38 value >>= 7;
39 if value != 0 {
40 byte |= 0x80; }
42 writer.write_all(&[byte])?;
43 if value == 0 {
44 break;
45 }
46 }
47 Ok(())
48}
49
50fn read_varint(data: &[u8]) -> Result<(u32, usize)> {
52 let mut value: u32 = 0;
53 let mut shift = 0;
54 let mut pos = 0;
55
56 loop {
57 if pos >= data.len() {
58 anyhow::bail!("Truncated varint");
59 }
60 let byte = data[pos];
61 pos += 1;
62
63 value |= ((byte & 0x7F) as u32) << shift;
64 if byte & 0x80 == 0 {
65 break;
66 }
67 shift += 7;
68 if shift >= 32 {
69 anyhow::bail!("Varint too large");
70 }
71 }
72
73 Ok((value, pos))
74}
75
76fn decompress_posting_list(
86 mmap: &[u8],
87 offset: u64,
88 size: u32,
89) -> Result<Vec<FileLocation>> {
90 let start = offset as usize;
91 let end = start + size as usize;
92
93 if end > mmap.len() {
94 anyhow::bail!(
95 "Posting list out of bounds: offset={}, size={}, mmap_len={}",
96 offset,
97 size,
98 mmap.len()
99 );
100 }
101
102 let compressed_data = &mmap[start..end];
103
104 let mut locations = Vec::new();
106 let mut pos = 0;
107 let mut prev_file_id = 0u32;
108 let mut prev_line_no = 0u32;
109 let mut prev_byte_offset = 0u32;
110
111 while pos < compressed_data.len() {
112 let (file_id_delta, consumed) = read_varint(&compressed_data[pos..])?;
114 pos += consumed;
115
116 let (line_no_delta, consumed) = read_varint(&compressed_data[pos..])?;
118 pos += consumed;
119
120 let (byte_offset_delta, consumed) = read_varint(&compressed_data[pos..])?;
122 pos += consumed;
123
124 let file_id = prev_file_id.wrapping_add(file_id_delta);
126 let line_no = prev_line_no.wrapping_add(line_no_delta);
127 let byte_offset = prev_byte_offset.wrapping_add(byte_offset_delta);
128
129 locations.push(FileLocation {
130 file_id,
131 line_no,
132 byte_offset,
133 });
134
135 prev_file_id = file_id;
137 prev_line_no = line_no;
138 prev_byte_offset = byte_offset;
139 }
140
141 Ok(locations)
142}
143
144#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Archive, Serialize, Deserialize)]
146pub struct FileLocation {
147 pub file_id: u32,
149 pub line_no: u32,
151 pub byte_offset: u32,
153}
154
155impl FileLocation {
156 pub fn new(file_id: u32, line_no: u32, byte_offset: u32) -> Self {
157 Self {
158 file_id,
159 line_no,
160 byte_offset,
161 }
162 }
163}
164
165#[derive(Archive, Serialize, Deserialize)]
167struct TrigramData {
168 index: Vec<(Trigram, Vec<FileLocation>)>,
170 files: Vec<String>,
172}
173
174#[derive(Debug, Clone)]
179struct DirectoryEntry {
180 trigram: Trigram,
182 data_offset: u64,
184 compressed_size: u32,
186}
187
188pub struct TrigramIndex {
199 index: Vec<(Trigram, Vec<FileLocation>)>,
202 files: Vec<PathBuf>,
204 temp_index: Option<HashMap<Trigram, Vec<FileLocation>>>,
206 mmap: Option<memmap2::Mmap>,
208 directory: Vec<DirectoryEntry>,
210 partial_indices: Vec<PathBuf>,
212 temp_dir: Option<PathBuf>,
214}
215
216impl TrigramIndex {
217 pub fn new() -> Self {
219 Self {
220 index: Vec::new(),
221 files: Vec::new(),
222 temp_index: Some(HashMap::new()),
223 mmap: None,
224 directory: Vec::new(),
225 partial_indices: Vec::new(),
226 temp_dir: None,
227 }
228 }
229
230 pub fn enable_batch_flush(&mut self, temp_dir: PathBuf) -> Result<()> {
235 std::fs::create_dir_all(&temp_dir)
236 .context("Failed to create temp directory for batch flushing")?;
237 self.temp_dir = Some(temp_dir);
238 log::info!("Enabled batch-flush mode for trigram index");
239 Ok(())
240 }
241
242 pub fn flush_batch(&mut self) -> Result<()> {
247 let temp_dir = self.temp_dir.as_ref()
248 .ok_or_else(|| anyhow::anyhow!("Batch flush not enabled - call enable_batch_flush() first"))?;
249
250 let temp_map = self.temp_index.take()
252 .ok_or_else(|| anyhow::anyhow!("No temp index to flush"))?;
253
254 if temp_map.is_empty() {
255 self.temp_index = Some(HashMap::new());
257 return Ok(());
258 }
259
260 let mut partial_index: Vec<(Trigram, Vec<FileLocation>)> = temp_map.into_iter().collect();
262
263 for (_, list) in partial_index.iter_mut() {
265 list.sort_unstable();
266 list.dedup();
267 }
268
269 partial_index.sort_unstable_by_key(|(trigram, _)| *trigram);
271
272 let partial_file = temp_dir.join(format!("partial_{}.bin", self.partial_indices.len()));
274 self.write_partial_index(&partial_file, &partial_index)?;
275
276 self.partial_indices.push(partial_file);
277
278 self.temp_index = Some(HashMap::new());
280
281 log::debug!(
282 "Flushed batch {} with {} trigrams to disk",
283 self.partial_indices.len(),
284 partial_index.len()
285 );
286
287 Ok(())
288 }
289
290 fn write_partial_index(
292 &self,
293 path: &Path,
294 index: &[(Trigram, Vec<FileLocation>)],
295 ) -> Result<()> {
296 use std::io::BufWriter;
297
298 let file = OpenOptions::new()
299 .create(true)
300 .write(true)
301 .truncate(true)
302 .open(path)?;
303
304 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
305
306 writer.write_all(&(index.len() as u64).to_le_bytes())?;
308
309 for (trigram, locations) in index {
311 writer.write_all(&trigram.to_le_bytes())?;
312 writer.write_all(&(locations.len() as u32).to_le_bytes())?;
313
314 for loc in locations {
315 writer.write_all(&loc.file_id.to_le_bytes())?;
316 writer.write_all(&loc.line_no.to_le_bytes())?;
317 writer.write_all(&loc.byte_offset.to_le_bytes())?;
318 }
319 }
320
321 writer.flush()?;
322 Ok(())
323 }
324
325 pub fn add_file(&mut self, path: PathBuf) -> u32 {
327 let file_id = self.files.len() as u32;
328 self.files.push(path);
329 file_id
330 }
331
332 pub fn get_file(&self, file_id: u32) -> Option<&PathBuf> {
334 self.files.get(file_id as usize)
335 }
336
337 pub fn file_count(&self) -> usize {
339 self.files.len()
340 }
341
342 pub fn trigram_count(&self) -> usize {
344 if !self.directory.is_empty() {
345 self.directory.len()
347 } else {
348 self.index.len()
350 }
351 }
352
353 pub fn index_file(&mut self, file_id: u32, content: &str) {
358 let trigrams = extract_trigrams_with_locations(content, file_id);
359
360 if let Some(ref mut temp_map) = self.temp_index {
362 for (trigram, location) in trigrams {
363 temp_map
364 .entry(trigram)
365 .or_insert_with(Vec::new)
366 .push(location);
367 }
368 } else {
369 panic!("Cannot call index_file() after finalize(). Index is read-only.");
370 }
371 }
372
373 pub fn build_from_trigrams(&mut self, trigrams: Vec<(Trigram, FileLocation)>) {
378 let mut temp_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
379
380 for (trigram, location) in trigrams {
382 temp_map
383 .entry(trigram)
384 .or_insert_with(Vec::new)
385 .push(location);
386 }
387
388 self.index = temp_map.into_iter().collect();
390
391 self.temp_index = None;
393
394 self.finalize();
396 }
397
398 pub fn finalize(&mut self) {
406 if !self.partial_indices.is_empty() {
409 log::info!("Deferring finalization - will stream merge {} partial indices during write()",
410 self.partial_indices.len());
411
412 if let Some(ref temp_map) = self.temp_index {
414 if !temp_map.is_empty() {
415 self.flush_batch().expect("Failed to flush final batch");
416 }
417 }
418
419 return;
421 }
422
423 if let Some(temp_map) = self.temp_index.take() {
426 self.index = temp_map.into_iter().collect();
427 }
428
429 for (_, list) in self.index.iter_mut() {
431 list.sort_unstable();
432 list.dedup(); }
434
435 self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
437 }
438
439 fn merge_partial_indices_to_file(&mut self, output_path: &Path) -> Result<()> {
447 use std::io::{BufReader, BufWriter, Read};
448 use std::cmp::Ordering;
449 use std::collections::BinaryHeap;
450
451 log::info!("Streaming merge of {} partial indices to {:?}",
452 self.partial_indices.len(), output_path);
453
454 struct PartialIndexReader {
456 reader: BufReader<File>,
457 current_trigram: Option<Trigram>,
458 current_posting_list: Vec<FileLocation>,
459 reader_id: usize,
460 }
461
462 let mut readers: Vec<PartialIndexReader> = Vec::new();
463
464 for (idx, partial_path) in self.partial_indices.iter().enumerate() {
465 let file = File::open(partial_path)
466 .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
467 let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
468
469 let mut buf = [0u8; 8];
471 reader.read_exact(&mut buf)?;
472
473 readers.push(PartialIndexReader {
474 reader,
475 current_trigram: None,
476 current_posting_list: Vec::new(),
477 reader_id: idx,
478 });
479 }
480
481 fn read_next_trigram(reader: &mut PartialIndexReader) -> Result<bool> {
483 let mut trigram_buf = [0u8; 4];
485 match reader.reader.read_exact(&mut trigram_buf) {
486 Ok(_) => {
487 let trigram = u32::from_le_bytes(trigram_buf);
488
489 let mut len_buf = [0u8; 4];
491 reader.reader.read_exact(&mut len_buf)?;
492 let list_len = u32::from_le_bytes(len_buf) as usize;
493
494 let mut locations = Vec::with_capacity(list_len);
496 for _ in 0..list_len {
497 let mut loc_buf = [0u8; 12];
498 reader.reader.read_exact(&mut loc_buf)?;
499
500 let file_id = u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
501 let line_no = u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
502 let byte_offset = u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
503
504 locations.push(FileLocation { file_id, line_no, byte_offset });
505 }
506
507 reader.current_trigram = Some(trigram);
508 reader.current_posting_list = locations;
509 Ok(true)
510 }
511 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
512 reader.current_trigram = None;
513 Ok(false)
514 }
515 Err(e) => Err(e.into()),
516 }
517 }
518
519 for reader in &mut readers {
521 read_next_trigram(reader)?;
522 }
523
524 #[derive(Eq, PartialEq)]
526 struct HeapEntry {
527 trigram: Trigram,
528 reader_id: usize,
529 }
530
531 impl Ord for HeapEntry {
532 fn cmp(&self, other: &Self) -> Ordering {
533 other.trigram.cmp(&self.trigram)
535 .then_with(|| other.reader_id.cmp(&self.reader_id))
536 }
537 }
538
539 impl PartialOrd for HeapEntry {
540 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
541 Some(self.cmp(other))
542 }
543 }
544
545 let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::new();
547 for reader in &readers {
548 if let Some(trigram) = reader.current_trigram {
549 heap.push(HeapEntry {
550 trigram,
551 reader_id: reader.reader_id,
552 });
553 }
554 }
555
556 let file = OpenOptions::new()
558 .create(true)
559 .write(true)
560 .truncate(true)
561 .open(output_path)
562 .with_context(|| format!("Failed to create {}", output_path.display()))?;
563
564 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
565
566 writer.write_all(MAGIC)?;
568 writer.write_all(&VERSION.to_le_bytes())?;
569 writer.write_all(&0u64.to_le_bytes())?; writer.write_all(&(self.files.len() as u64).to_le_bytes())?; let mut directory: Vec<DirectoryEntry> = Vec::new();
574 let mut num_trigrams = 0u64;
575
576 let mut current_trigram: Option<Trigram> = None;
578 let mut merged_locations: Vec<FileLocation> = Vec::new();
579
580 while let Some(entry) = heap.pop() {
581 let reader = &mut readers[entry.reader_id];
582
583 if current_trigram.is_some() && current_trigram != Some(entry.trigram) {
585 let trigram = current_trigram.unwrap();
587 merged_locations.sort_unstable();
588 merged_locations.dedup();
589
590 let data_offset = writer.stream_position()?;
592 let compressed_size = self.write_compressed_posting_list(&mut writer, &merged_locations)?;
593
594 directory.push(DirectoryEntry {
595 trigram,
596 data_offset,
597 compressed_size,
598 });
599
600 num_trigrams += 1;
601 merged_locations.clear();
602 }
603
604 current_trigram = Some(entry.trigram);
606
607 merged_locations.extend_from_slice(&reader.current_posting_list);
609
610 if read_next_trigram(reader)? {
612 if let Some(next_trigram) = reader.current_trigram {
613 heap.push(HeapEntry {
614 trigram: next_trigram,
615 reader_id: entry.reader_id,
616 });
617 }
618 }
619 }
620
621 if let Some(trigram) = current_trigram {
623 merged_locations.sort_unstable();
624 merged_locations.dedup();
625
626 let data_offset = writer.stream_position()?;
627 let compressed_size = self.write_compressed_posting_list(&mut writer, &merged_locations)?;
628
629 directory.push(DirectoryEntry {
630 trigram,
631 data_offset,
632 compressed_size,
633 });
634
635 num_trigrams += 1;
636 }
637
638 log::info!("Merged {} trigrams from {} partial indices", num_trigrams, self.partial_indices.len());
639
640 let _data_end_pos = writer.stream_position()?;
642
643 for file_path in &self.files {
645 let path_str = file_path.to_string_lossy();
646 let path_bytes = path_str.as_bytes();
647 write_varint(&mut writer, path_bytes.len() as u32)?;
648 writer.write_all(path_bytes)?;
649 }
650
651 writer.flush()?;
653 drop(writer);
654
655 use std::io::{Seek, SeekFrom};
658
659 let mut temp_data = Vec::new();
661 {
662 let mut file = File::open(output_path)?;
663 file.seek(SeekFrom::Start(HEADER_SIZE as u64))?;
664 file.read_to_end(&mut temp_data)?;
665 }
666
667 let file = OpenOptions::new().write(true).truncate(true).open(output_path)?;
669 let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
670
671 writer.write_all(MAGIC)?;
673 writer.write_all(&VERSION.to_le_bytes())?;
674 writer.write_all(&num_trigrams.to_le_bytes())?;
675 writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
676
677 for entry in &directory {
679 writer.write_all(&entry.trigram.to_le_bytes())?;
680 let adjusted_offset = entry.data_offset + (directory.len() * 16) as u64;
682 writer.write_all(&adjusted_offset.to_le_bytes())?;
683 writer.write_all(&entry.compressed_size.to_le_bytes())?;
684 }
685
686 writer.write_all(&temp_data)?;
688
689 writer.flush()?;
691 writer.get_ref().sync_all()?;
692
693 for partial_path in &self.partial_indices {
695 let _ = std::fs::remove_file(partial_path);
696 }
697 if let Some(ref temp_dir) = self.temp_dir {
698 let _ = std::fs::remove_dir(temp_dir);
699 }
700
701 log::info!("Wrote {} trigrams to {:?}", num_trigrams, output_path);
702
703 Ok(())
704 }
705
706 fn write_compressed_posting_list(
708 &self,
709 writer: &mut impl Write,
710 locations: &[FileLocation],
711 ) -> Result<u32> {
712 let mut compressed = Vec::new();
713
714 let mut prev_file_id = 0u32;
716 let mut prev_line_no = 0u32;
717 let mut prev_byte_offset = 0u32;
718
719 for loc in locations {
720 let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
722 let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
723 let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
724
725 write_varint(&mut compressed, file_id_delta)?;
727 write_varint(&mut compressed, line_no_delta)?;
728 write_varint(&mut compressed, byte_offset_delta)?;
729
730 prev_file_id = loc.file_id;
732 prev_line_no = loc.line_no;
733 prev_byte_offset = loc.byte_offset;
734 }
735
736 let compressed_size = compressed.len() as u32;
737 writer.write_all(&compressed)?;
738
739 Ok(compressed_size)
740 }
741
742 #[allow(dead_code)]
744 fn merge_partial_indices(&mut self) -> Result<()> {
745 use std::io::{BufReader, Read};
746
747 let mut all_entries: Vec<(Trigram, FileLocation)> = Vec::new();
749
750 for partial_path in &self.partial_indices {
751 let file = File::open(partial_path)
752 .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
753 let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
754
755 let mut buf = [0u8; 8];
757 reader.read_exact(&mut buf)?;
758 let num_trigrams = u64::from_le_bytes(buf) as usize;
759
760 for _ in 0..num_trigrams {
762 let mut trigram_buf = [0u8; 4];
764 reader.read_exact(&mut trigram_buf)?;
765 let trigram = u32::from_le_bytes(trigram_buf);
766
767 let mut len_buf = [0u8; 4];
769 reader.read_exact(&mut len_buf)?;
770 let list_len = u32::from_le_bytes(len_buf) as usize;
771
772 for _ in 0..list_len {
774 let mut loc_buf = [0u8; 12]; reader.read_exact(&mut loc_buf)?;
776
777 let file_id = u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
778 let line_no = u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
779 let byte_offset = u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
780
781 all_entries.push((trigram, FileLocation { file_id, line_no, byte_offset }));
782 }
783 }
784 }
785
786 log::info!("Read {} total trigram entries from {} partial indices",
787 all_entries.len(), self.partial_indices.len());
788
789 let mut index_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
791 for (trigram, location) in all_entries {
792 index_map
793 .entry(trigram)
794 .or_insert_with(Vec::new)
795 .push(location);
796 }
797
798 self.index = index_map.into_iter().collect();
800
801 for (_, list) in self.index.iter_mut() {
803 list.sort_unstable();
804 list.dedup();
805 }
806
807 self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
809
810 for partial_path in &self.partial_indices {
812 let _ = std::fs::remove_file(partial_path);
813 }
814 if let Some(ref temp_dir) = self.temp_dir {
815 let _ = std::fs::remove_dir(temp_dir);
816 }
817
818 log::info!("Merged into final index with {} trigrams", self.index.len());
819
820 Ok(())
821 }
822
823 pub fn search(&self, pattern: &str) -> Vec<FileLocation> {
831 if pattern.len() < 3 {
832 return vec![];
834 }
835
836 let trigrams = extract_trigrams(pattern);
837 if trigrams.is_empty() {
838 return vec![];
839 }
840
841 if let Some(ref mmap) = self.mmap {
843 let mut posting_lists: Vec<Vec<FileLocation>> = Vec::new();
845
846 for trigram in &trigrams {
847 match self.directory.binary_search_by_key(trigram, |e| e.trigram) {
849 Ok(idx) => {
850 let entry = &self.directory[idx];
851 match decompress_posting_list(mmap, entry.data_offset, entry.compressed_size) {
853 Ok(locations) => posting_lists.push(locations),
854 Err(e) => {
855 log::warn!("Failed to decompress posting list for trigram {}: {}", trigram, e);
856 return vec![];
857 }
858 }
859 }
860 Err(_) => {
861 return vec![];
863 }
864 }
865 }
866
867 if posting_lists.is_empty() || posting_lists.len() < trigrams.len() {
868 return vec![];
869 }
870
871 posting_lists.sort_by_key(|list| list.len());
873
874 intersect_by_file_owned(&posting_lists)
876 } else {
877 let mut posting_lists: Vec<&Vec<FileLocation>> = trigrams
879 .iter()
880 .filter_map(|t| {
881 self.index
882 .binary_search_by_key(t, |(trigram, _)| *trigram)
883 .ok()
884 .map(|idx| &self.index[idx].1)
885 })
886 .collect();
887
888 if posting_lists.is_empty() {
889 return vec![];
890 }
891
892 if posting_lists.len() < trigrams.len() {
893 return vec![];
895 }
896
897 posting_lists.sort_by_key(|list| list.len());
899
900 intersect_by_file(&posting_lists)
902 }
903 }
904
905 pub fn get_posting_list(&self, trigram: Trigram) -> Option<&Vec<FileLocation>> {
907 self.index
908 .binary_search_by_key(&trigram, |(t, _)| *t)
909 .ok()
910 .map(|idx| &self.index[idx].1)
911 }
912
913 pub fn write(&mut self, path: impl AsRef<Path>) -> Result<()> {
927 let path = path.as_ref();
928
929 if !self.partial_indices.is_empty() {
931 log::info!("Using streaming merge to write {} partial indices", self.partial_indices.len());
932 return self.merge_partial_indices_to_file(path);
933 }
934
935 let file = OpenOptions::new()
937 .create(true)
938 .write(true)
939 .truncate(true)
940 .open(path)
941 .with_context(|| format!("Failed to create {}", path.display()))?;
942
943 let mut writer = std::io::BufWriter::with_capacity(16 * 1024 * 1024, file);
945
946 writer.write_all(MAGIC)?;
948 writer.write_all(&VERSION.to_le_bytes())?;
949 writer.write_all(&(self.index.len() as u64).to_le_bytes())?; writer.write_all(&(self.files.len() as u64).to_le_bytes())?; let mut directory: Vec<DirectoryEntry> = Vec::with_capacity(self.index.len());
954
955 let directory_start = HEADER_SIZE as u64;
957 let directory_size = self.index.len() * 16;
958
959 let data_start = directory_start + directory_size as u64;
961 let mut current_offset = data_start;
962
963 let mut compressed_lists: Vec<(Trigram, Vec<u8>)> = Vec::with_capacity(self.index.len());
969
970 for (trigram, locations) in &self.index {
971 let mut compressed = Vec::new();
973 let mut prev_file_id = 0u32;
974 let mut prev_line_no = 0u32;
975 let mut prev_byte_offset = 0u32;
976
977 for loc in locations {
978 let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
979 let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
980 let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
981
982 write_varint(&mut compressed, file_id_delta)?;
983 write_varint(&mut compressed, line_no_delta)?;
984 write_varint(&mut compressed, byte_offset_delta)?;
985
986 prev_file_id = loc.file_id;
987 prev_line_no = loc.line_no;
988 prev_byte_offset = loc.byte_offset;
989 }
990
991 directory.push(DirectoryEntry {
992 trigram: *trigram,
993 data_offset: current_offset,
994 compressed_size: compressed.len() as u32,
995 });
996 current_offset += compressed.len() as u64;
997
998 compressed_lists.push((*trigram, compressed));
999 }
1000
1001 for entry in &directory {
1003 writer.write_all(&entry.trigram.to_le_bytes())?;
1004 writer.write_all(&entry.data_offset.to_le_bytes())?;
1005 writer.write_all(&entry.compressed_size.to_le_bytes())?;
1006 }
1007
1008 for (_, compressed) in &compressed_lists {
1010 writer.write_all(compressed)?;
1011 }
1012
1013 for file_path in &self.files {
1015 let path_str = file_path.to_string_lossy();
1016 let path_bytes = path_str.as_bytes();
1017 write_varint(&mut writer, path_bytes.len() as u32)?;
1018 writer.write_all(path_bytes)?;
1019 }
1020
1021 writer.flush()?;
1023 writer.get_ref().sync_all()?;
1024
1025 log::info!(
1026 "Wrote lazy-loadable trigram index: {} trigrams, {} files to {:?}",
1027 self.index.len(),
1028 self.files.len(),
1029 path
1030 );
1031
1032 Ok(())
1033 }
1034
1035 pub fn load(path: impl AsRef<Path>) -> Result<Self> {
1040 let path = path.as_ref();
1041
1042 let file = File::open(path)
1043 .with_context(|| format!("Failed to open {}", path.display()))?;
1044
1045 let mmap = unsafe {
1047 memmap2::Mmap::map(&file)
1048 .with_context(|| format!("Failed to mmap {}", path.display()))?
1049 };
1050
1051 if mmap.len() < HEADER_SIZE {
1053 anyhow::bail!("trigrams.bin too small (expected at least {} bytes)", HEADER_SIZE);
1054 }
1055
1056 if &mmap[0..4] != MAGIC {
1057 anyhow::bail!("Invalid trigrams.bin (wrong magic bytes)");
1058 }
1059
1060 let version = u32::from_le_bytes([mmap[4], mmap[5], mmap[6], mmap[7]]);
1061 if version != VERSION {
1062 anyhow::bail!(
1063 "Unsupported trigrams.bin version: {} (expected {}). Please re-index with 'reflex index'.",
1064 version, VERSION
1065 );
1066 }
1067
1068 let num_trigrams = u64::from_le_bytes([
1069 mmap[8], mmap[9], mmap[10], mmap[11],
1070 mmap[12], mmap[13], mmap[14], mmap[15],
1071 ]) as usize;
1072
1073 let num_files = u64::from_le_bytes([
1074 mmap[16], mmap[17], mmap[18], mmap[19],
1075 mmap[20], mmap[21], mmap[22], mmap[23],
1076 ]) as usize;
1077
1078 log::debug!("Loading lazy trigram index: {} trigrams, {} files", num_trigrams, num_files);
1079
1080 let mut directory = Vec::with_capacity(num_trigrams);
1082 let mut pos = HEADER_SIZE;
1083 let directory_size = num_trigrams * 16; for _ in 0..num_trigrams {
1086 if pos + 16 > mmap.len() {
1087 anyhow::bail!("Truncated directory entry at pos={}", pos);
1088 }
1089
1090 let trigram = u32::from_le_bytes([
1091 mmap[pos],
1092 mmap[pos + 1],
1093 mmap[pos + 2],
1094 mmap[pos + 3],
1095 ]);
1096 pos += 4;
1097
1098 let data_offset = u64::from_le_bytes([
1099 mmap[pos],
1100 mmap[pos + 1],
1101 mmap[pos + 2],
1102 mmap[pos + 3],
1103 mmap[pos + 4],
1104 mmap[pos + 5],
1105 mmap[pos + 6],
1106 mmap[pos + 7],
1107 ]);
1108 pos += 8;
1109
1110 let compressed_size = u32::from_le_bytes([
1111 mmap[pos],
1112 mmap[pos + 1],
1113 mmap[pos + 2],
1114 mmap[pos + 3],
1115 ]);
1116 pos += 4;
1117
1118 directory.push(DirectoryEntry {
1119 trigram,
1120 data_offset,
1121 compressed_size,
1122 });
1123 }
1124
1125 directory.sort_unstable_by_key(|e| e.trigram);
1127
1128 let data_section_size: u64 = directory.iter().map(|e| e.compressed_size as u64).sum();
1130 let files_section_offset = HEADER_SIZE + directory_size + data_section_size as usize;
1131 pos = files_section_offset;
1132
1133 let mut files = Vec::with_capacity(num_files);
1135 for _ in 0..num_files {
1136 let (path_len, consumed) = read_varint(&mmap[pos..])?;
1138 pos += consumed;
1139 let path_len = path_len as usize;
1140
1141 if pos + path_len > mmap.len() {
1142 anyhow::bail!("Truncated file path at pos={}", pos);
1143 }
1144
1145 let path_bytes = &mmap[pos..pos + path_len];
1146 let path_str = std::str::from_utf8(path_bytes)
1147 .context("Invalid UTF-8 in file path")?;
1148 files.push(PathBuf::from(path_str));
1149 pos += path_len;
1150 }
1151
1152 log::info!(
1153 "Loaded lazy trigram index: {} trigrams, {} files (directory: {} KB)",
1154 num_trigrams,
1155 num_files,
1156 directory_size / 1024
1157 );
1158
1159 Ok(Self {
1160 index: Vec::new(), files,
1162 temp_index: None,
1163 mmap: Some(mmap), directory,
1165 partial_indices: Vec::new(),
1166 temp_dir: None,
1167 })
1168 }
1169}
1170
1171impl Default for TrigramIndex {
1172 fn default() -> Self {
1173 Self::new()
1174 }
1175}
1176
1177pub fn extract_trigrams(text: &str) -> Vec<Trigram> {
1181 let bytes = text.as_bytes();
1182 let mut trigrams = Vec::new();
1183
1184 for i in 0..bytes.len().saturating_sub(2) {
1185 let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1186 trigrams.push(trigram);
1187 }
1188
1189 trigrams
1190}
1191
1192pub fn extract_trigrams_with_locations(text: &str, file_id: u32) -> Vec<(Trigram, FileLocation)> {
1196 let bytes = text.as_bytes();
1197 let mut result = Vec::new();
1198
1199 let mut line_no = 1;
1200
1201 for (i, &byte) in bytes.iter().enumerate() {
1202 if byte == b'\n' {
1204 line_no += 1;
1205 }
1206
1207 if i + 2 < bytes.len() {
1209 let trigram = bytes_to_trigram(&bytes[i..i + 3]);
1210 let location = FileLocation::new(file_id, line_no, i as u32);
1211 result.push((trigram, location));
1212 }
1213 }
1214
1215 result
1216}
1217
1218#[inline]
1220fn bytes_to_trigram(bytes: &[u8]) -> Trigram {
1221 debug_assert_eq!(bytes.len(), 3);
1222 (bytes[0] as u32) << 16 | (bytes[1] as u32) << 8 | (bytes[2] as u32)
1223}
1224
1225#[allow(dead_code)]
1227fn trigram_to_bytes(trigram: Trigram) -> [u8; 3] {
1228 [
1229 ((trigram >> 16) & 0xFF) as u8,
1230 ((trigram >> 8) & 0xFF) as u8,
1231 (trigram & 0xFF) as u8,
1232 ]
1233}
1234
1235fn intersect_by_file(lists: &[&Vec<FileLocation>]) -> Vec<FileLocation> {
1240 if lists.is_empty() {
1241 return vec![];
1242 }
1243
1244 use std::collections::HashSet;
1245
1246 let mut candidates: HashSet<(u32, u32)> = lists[0]
1248 .iter()
1249 .map(|loc| (loc.file_id, loc.line_no))
1250 .collect();
1251
1252 for &list in &lists[1..] {
1254 let list_pairs: HashSet<(u32, u32)> = list
1255 .iter()
1256 .map(|loc| (loc.file_id, loc.line_no))
1257 .collect();
1258 candidates.retain(|pair| list_pairs.contains(pair));
1259 }
1260
1261 let mut result = Vec::new();
1263 for &(file_id, line_no) in &candidates {
1264 if let Some(loc) = lists[0]
1266 .iter()
1267 .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1268 {
1269 result.push(*loc);
1270 }
1271 }
1272
1273 result.sort_unstable();
1274 result
1275}
1276
1277fn intersect_by_file_owned(lists: &[Vec<FileLocation>]) -> Vec<FileLocation> {
1284 if lists.is_empty() {
1285 return vec![];
1286 }
1287
1288 use std::collections::HashSet;
1289
1290 let mut candidates: HashSet<(u32, u32)> = lists[0]
1292 .iter()
1293 .map(|loc| (loc.file_id, loc.line_no))
1294 .collect();
1295
1296 for list in &lists[1..] {
1298 let list_pairs: HashSet<(u32, u32)> = list
1299 .iter()
1300 .map(|loc| (loc.file_id, loc.line_no))
1301 .collect();
1302 candidates.retain(|pair| list_pairs.contains(pair));
1303 }
1304
1305 let mut result = Vec::new();
1307 for &(file_id, line_no) in &candidates {
1308 if let Some(loc) = lists[0]
1310 .iter()
1311 .find(|loc| loc.file_id == file_id && loc.line_no == line_no)
1312 {
1313 result.push(*loc);
1314 }
1315 }
1316
1317 result.sort_unstable();
1318 result
1319}
1320
1321#[cfg(test)]
1322mod tests {
1323 use super::*;
1324
1325 #[test]
1326 fn test_extract_trigrams() {
1327 let text = "hello";
1328 let trigrams = extract_trigrams(text);
1329
1330 assert_eq!(trigrams.len(), 3);
1332
1333 let expected = vec![
1335 bytes_to_trigram(b"hel"),
1336 bytes_to_trigram(b"ell"),
1337 bytes_to_trigram(b"llo"),
1338 ];
1339 assert_eq!(trigrams, expected);
1340 }
1341
1342 #[test]
1343 fn test_extract_trigrams_short() {
1344 assert_eq!(extract_trigrams("ab").len(), 0);
1345 assert_eq!(extract_trigrams("abc").len(), 1);
1346 }
1347
1348 #[test]
1349 fn test_bytes_to_trigram() {
1350 let trigram1 = bytes_to_trigram(b"abc");
1351 let trigram2 = bytes_to_trigram(b"abc");
1352 let trigram3 = bytes_to_trigram(b"xyz");
1353
1354 assert_eq!(trigram1, trigram2);
1355 assert_ne!(trigram1, trigram3);
1356 }
1357
1358 #[test]
1359 fn test_trigram_roundtrip() {
1360 let original = b"foo";
1361 let trigram = bytes_to_trigram(original);
1362 let recovered = trigram_to_bytes(trigram);
1363 assert_eq!(original, &recovered);
1364 }
1365
1366 #[test]
1367 fn test_extract_with_locations() {
1368 let text = "hello\nworld";
1369 let locs = extract_trigrams_with_locations(text, 0);
1370
1371 assert_eq!(locs.len(), 9);
1374
1375 assert_eq!(locs[0].1.line_no, 1);
1377
1378 let world_start = text.find("world").unwrap();
1380 let world_trigram_idx = locs
1381 .iter()
1382 .position(|(_, loc)| loc.byte_offset as usize == world_start)
1383 .unwrap();
1384 assert_eq!(locs[world_trigram_idx].1.line_no, 2);
1385 }
1386
1387 #[test]
1388 fn test_trigram_index_basic() {
1389 let mut index = TrigramIndex::new();
1390
1391 let file_id = index.add_file(PathBuf::from("test.txt"));
1392 index.index_file(file_id, "hello world");
1393 index.finalize();
1394
1395 let results = index.search("hello");
1397 assert!(!results.is_empty());
1398
1399 let results = index.search("world");
1401 assert!(!results.is_empty());
1402
1403 let results = index.search("goodbye");
1405 assert!(results.is_empty());
1406 }
1407
1408 #[test]
1409 fn test_search_multifile() {
1410 let mut index = TrigramIndex::new();
1411
1412 let file1 = index.add_file(PathBuf::from("file1.txt"));
1413 let file2 = index.add_file(PathBuf::from("file2.txt"));
1414
1415 index.index_file(file1, "extract_symbols is here");
1416 index.index_file(file2, "extract_symbols is also here");
1417 index.finalize();
1418
1419 let results = index.search("extract_symbols");
1420 assert_eq!(results.len(), 2); let file_ids: Vec<u32> = results.iter().map(|loc| loc.file_id).collect();
1424 assert!(file_ids.contains(&file1));
1425 assert!(file_ids.contains(&file2));
1426 }
1427
1428 #[test]
1429 fn test_persistence_write() {
1430 use tempfile::TempDir;
1431
1432 let temp = TempDir::new().unwrap();
1433 let trigrams_path = temp.path().join("trigrams.bin");
1434
1435 let mut index = TrigramIndex::new();
1437 let file1 = index.add_file(PathBuf::from("src/main.rs"));
1438 let file2 = index.add_file(PathBuf::from("src/lib.rs"));
1439
1440 index.index_file(file1, "fn main() { println!(\"hello\"); }");
1441 index.index_file(file2, "pub fn hello() -> String { String::from(\"hello\") }");
1442 index.finalize();
1443
1444 index.write(&trigrams_path).unwrap();
1446
1447 assert!(trigrams_path.exists());
1449
1450 let metadata = std::fs::metadata(&trigrams_path).unwrap();
1452 assert!(metadata.len() > HEADER_SIZE as u64);
1453
1454 use std::io::Read;
1456 let mut file = File::open(&trigrams_path).unwrap();
1457 let mut magic = [0u8; 4];
1458 file.read_exact(&mut magic).unwrap();
1459 assert_eq!(&magic, MAGIC);
1460
1461 }
1464}