use anyhow::{Context, Result};
use rkyv::{Archive, Deserialize, Serialize};
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::{Path, PathBuf};
/// A trigram (three consecutive bytes) packed big-endian into the low 24 bits
/// of a `u32` — see `bytes_to_trigram` / `trigram_to_bytes`.
pub type Trigram = u32;
// Magic bytes identifying the on-disk trigram index file.
const MAGIC: &[u8; 4] = b"RFTG";
// On-disk format version; `load()` rejects any other version.
const VERSION: u32 = 3;
// Fixed header layout: magic (4) + version (4) + num_trigrams (8) + num_files (8).
#[allow(dead_code)]
const HEADER_SIZE: usize = 24;
/// Encode `value` as a LEB128-style varint: seven payload bits per byte,
/// continuation bit (0x80) set on every byte except the last.
fn write_varint(writer: &mut impl Write, mut value: u32) -> std::io::Result<()> {
    // Emit continuation bytes while more than seven bits remain.
    while value >= 0x80 {
        writer.write_all(&[(value as u8 & 0x7F) | 0x80])?;
        value >>= 7;
    }
    // Final byte: continuation bit clear.
    writer.write_all(&[value as u8])
}
/// Decode a LEB128-style varint from the front of `data`.
///
/// Returns the decoded value and the number of bytes consumed. Fails if the
/// data ends mid-varint ("Truncated varint") or a sixth continuation byte
/// would push the value past 32 bits ("Varint too large").
fn read_varint(data: &[u8]) -> Result<(u32, usize)> {
    let mut result: u32 = 0;
    let mut consumed = 0usize;
    for &byte in data {
        let shift = 7 * consumed as u32;
        consumed += 1;
        result |= ((byte & 0x7F) as u32) << shift;
        // A clear continuation bit marks the final byte.
        if byte & 0x80 == 0 {
            return Ok((result, consumed));
        }
        // Five continuation bytes already cover 35 bits > u32.
        if shift + 7 >= 32 {
            anyhow::bail!("Varint too large");
        }
    }
    anyhow::bail!("Truncated varint")
}
fn decompress_posting_list(
mmap: &[u8],
offset: u64,
size: u32,
) -> Result<Vec<FileLocation>> {
let start = offset as usize;
let end = start + size as usize;
if end > mmap.len() {
anyhow::bail!(
"Posting list out of bounds: offset={}, size={}, mmap_len={}",
offset,
size,
mmap.len()
);
}
let compressed_data = &mmap[start..end];
let mut locations = Vec::new();
let mut pos = 0;
let mut prev_file_id = 0u32;
let mut prev_line_no = 0u32;
let mut prev_byte_offset = 0u32;
while pos < compressed_data.len() {
let (file_id_delta, consumed) = read_varint(&compressed_data[pos..])?;
pos += consumed;
let (line_no_delta, consumed) = read_varint(&compressed_data[pos..])?;
pos += consumed;
let (byte_offset_delta, consumed) = read_varint(&compressed_data[pos..])?;
pos += consumed;
let file_id = prev_file_id.wrapping_add(file_id_delta);
let line_no = prev_line_no.wrapping_add(line_no_delta);
let byte_offset = prev_byte_offset.wrapping_add(byte_offset_delta);
locations.push(FileLocation {
file_id,
line_no,
byte_offset,
});
prev_file_id = file_id;
prev_line_no = line_no;
prev_byte_offset = byte_offset;
}
Ok(locations)
}
/// A single trigram occurrence: which file, which line, and the byte offset
/// of the trigram's first byte within that file's content.
///
/// Ordering is lexicographic over (file_id, line_no, byte_offset), which the
/// index relies on when sorting and deduplicating posting lists.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Archive, Serialize, Deserialize)]
pub struct FileLocation {
    // Index into `TrigramIndex::files`.
    pub file_id: u32,
    // 1-based line number (see extract_trigrams_with_locations).
    pub line_no: u32,
    // Byte offset of the trigram's first byte in the file content.
    pub byte_offset: u32,
}
impl FileLocation {
    /// Construct a location from its three components.
    pub fn new(file_id: u32, line_no: u32, byte_offset: u32) -> Self {
        Self {
            file_id,
            line_no,
            byte_offset,
        }
    }
}
/// Serializable snapshot of the in-memory index (trigram -> locations plus
/// file paths as strings).
///
/// NOTE(review): not referenced anywhere in this file — looks like a
/// leftover from an earlier rkyv-based persistence format; confirm against
/// the rest of the crate before removing.
#[derive(Archive, Serialize, Deserialize)]
struct TrigramData {
    index: Vec<(Trigram, Vec<FileLocation>)>,
    files: Vec<String>,
}
/// One entry of the on-disk directory: where a trigram's compressed posting
/// list lives inside the index file.
#[derive(Debug, Clone)]
struct DirectoryEntry {
    trigram: Trigram,
    // Absolute byte offset of the compressed posting list within the file.
    data_offset: u64,
    // Length in bytes of the compressed posting list.
    compressed_size: u32,
}
/// Trigram inverted index mapping each 3-byte sequence to the locations
/// where it occurs, with support for batched building and mmap-backed lazy
/// loading.
///
/// Lifecycle: build via `index_file()` (optionally spilling batches with
/// `enable_batch_flush` + `flush_batch`), then `finalize()` and `write()`;
/// query a written index via `load()` + `search()`.
pub struct TrigramIndex {
    // Sorted (trigram, posting list) pairs; used by the in-memory path.
    index: Vec<(Trigram, Vec<FileLocation>)>,
    // Indexed file paths; a FileLocation's file_id indexes into this.
    files: Vec<PathBuf>,
    // Accumulates postings while building; None after finalize()/load().
    temp_index: Option<HashMap<Trigram, Vec<FileLocation>>>,
    // Memory-mapped index file when opened lazily via load().
    mmap: Option<memmap2::Mmap>,
    // Per-trigram offsets into `mmap` (lazy-load path).
    directory: Vec<DirectoryEntry>,
    // Partial index files written by flush_batch(), merged during write().
    partial_indices: Vec<PathBuf>,
    // Directory holding the partial files when batch mode is enabled.
    temp_dir: Option<PathBuf>,
}
impl TrigramIndex {
/// Create an empty index in building mode (postings accumulate in memory).
pub fn new() -> Self {
    Self {
        index: Vec::new(),
        files: Vec::new(),
        temp_index: Some(HashMap::new()),
        mmap: None,
        directory: Vec::new(),
        partial_indices: Vec::new(),
        temp_dir: None,
    }
}
/// Switch the index into batch-flush mode: `flush_batch()` will spill the
/// in-memory postings into partial files under `temp_dir`, which `write()`
/// later stream-merges into the final index.
///
/// # Errors
/// Fails if `temp_dir` cannot be created.
pub fn enable_batch_flush(&mut self, temp_dir: PathBuf) -> Result<()> {
    std::fs::create_dir_all(&temp_dir)
        .context("Failed to create temp directory for batch flushing")?;
    self.temp_dir = Some(temp_dir);
    log::info!("Enabled batch-flush mode for trigram index");
    Ok(())
}
/// Spill the accumulated in-memory postings to a new partial index file in
/// the batch-flush temp directory and reset the in-memory accumulator.
///
/// # Errors
/// Fails if batch mode was never enabled or the partial file cannot be
/// written. On failure the accumulator is still reset, so subsequent
/// `index_file()` calls remain valid (they no longer panic on a taken
/// `temp_index`); the unflushed batch is dropped.
pub fn flush_batch(&mut self) -> Result<()> {
    let temp_dir = self.temp_dir.as_ref()
        .ok_or_else(|| anyhow::anyhow!("Batch flush not enabled - call enable_batch_flush() first"))?;
    let temp_map = self.temp_index.take()
        .ok_or_else(|| anyhow::anyhow!("No temp index to flush"))?;
    // Restore the accumulator immediately: the original code restored it
    // only after a successful write, so an I/O error left `temp_index` as
    // None and a later index_file() panicked even though the caller
    // handled the error.
    self.temp_index = Some(HashMap::new());
    if temp_map.is_empty() {
        return Ok(());
    }
    let mut partial_index: Vec<(Trigram, Vec<FileLocation>)> = temp_map.into_iter().collect();
    // Sort + dedup each posting list, then order by trigram so the partial
    // file can be k-way stream-merged later.
    for (_, list) in partial_index.iter_mut() {
        list.sort_unstable();
        list.dedup();
    }
    partial_index.sort_unstable_by_key(|(trigram, _)| *trigram);
    let partial_file = temp_dir.join(format!("partial_{}.bin", self.partial_indices.len()));
    self.write_partial_index(&partial_file, &partial_index)?;
    self.partial_indices.push(partial_file);
    log::debug!(
        "Flushed batch {} with {} trigrams to disk",
        self.partial_indices.len(),
        partial_index.len()
    );
    Ok(())
}
/// Serialize a sorted partial index to `path` in a simple fixed layout:
/// u64 entry count, then per trigram a u32 key, a u32 list length and one
/// 12-byte record (three u32s) per location. All integers little-endian.
fn write_partial_index(
    &self,
    path: &Path,
    index: &[(Trigram, Vec<FileLocation>)],
) -> Result<()> {
    use std::io::BufWriter;
    let out = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open(path)?;
    // Large buffer: partial files are written sequentially in bulk.
    let mut w = BufWriter::with_capacity(16 * 1024 * 1024, out);
    w.write_all(&(index.len() as u64).to_le_bytes())?;
    for (trigram, locations) in index {
        w.write_all(&trigram.to_le_bytes())?;
        w.write_all(&(locations.len() as u32).to_le_bytes())?;
        for loc in locations {
            // Pack the three fields into one write call.
            let mut record = [0u8; 12];
            record[0..4].copy_from_slice(&loc.file_id.to_le_bytes());
            record[4..8].copy_from_slice(&loc.line_no.to_le_bytes());
            record[8..12].copy_from_slice(&loc.byte_offset.to_le_bytes());
            w.write_all(&record)?;
        }
    }
    w.flush()?;
    Ok(())
}
/// Register a file path and return the id used as `FileLocation::file_id`.
pub fn add_file(&mut self, path: PathBuf) -> u32 {
    let file_id = self.files.len() as u32;
    self.files.push(path);
    file_id
}
/// Resolve a file id back to its path, or `None` if out of range.
pub fn get_file(&self, file_id: u32) -> Option<&PathBuf> {
    self.files.get(file_id as usize)
}
/// Number of files registered via `add_file`.
pub fn file_count(&self) -> usize {
    self.files.len()
}
/// Number of distinct trigrams, whichever representation is active: the
/// mmap-backed directory (after `load()`) or the in-memory index.
pub fn trigram_count(&self) -> usize {
    if self.directory.is_empty() {
        self.index.len()
    } else {
        self.directory.len()
    }
}
pub fn index_file(&mut self, file_id: u32, content: &str) {
let trigrams = extract_trigrams_with_locations(content, file_id);
if let Some(ref mut temp_map) = self.temp_index {
for (trigram, location) in trigrams {
temp_map
.entry(trigram)
.or_insert_with(Vec::new)
.push(location);
}
} else {
panic!("Cannot call index_file() after finalize(). Index is read-only.");
}
}
/// Replace the index contents with the given (trigram, location) pairs and
/// finalize immediately (sorting and deduplicating every posting list).
pub fn build_from_trigrams(&mut self, trigrams: Vec<(Trigram, FileLocation)>) {
    let mut grouped: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
    for (trigram, location) in trigrams {
        grouped.entry(trigram).or_default().push(location);
    }
    self.index = grouped.into_iter().collect();
    // Building mode is over; finalize() sorts and dedups.
    self.temp_index = None;
    self.finalize();
}
/// Freeze the index for querying: move accumulated postings into `index`,
/// sort and deduplicate each posting list, and sort the lists by trigram
/// (binary_search in search()/get_posting_list() depends on this order).
///
/// In batch-flush mode the heavy merge is deferred: any remaining in-memory
/// batch is flushed to disk here and the partial files are stream-merged
/// later by `write()`.
pub fn finalize(&mut self) {
    if !self.partial_indices.is_empty() {
        log::info!("Deferring finalization - will stream merge {} partial indices during write()",
        self.partial_indices.len());
        if let Some(ref temp_map) = self.temp_index {
            if !temp_map.is_empty() {
                // NOTE(review): panics on I/O failure — finalize() has no
                // Result for callers to observe.
                self.flush_batch().expect("Failed to flush final batch");
            }
        }
        return;
    }
    if let Some(temp_map) = self.temp_index.take() {
        self.index = temp_map.into_iter().collect();
    }
    for (_, list) in self.index.iter_mut() {
        list.sort_unstable();
        list.dedup();
    }
    self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
}
/// Stream-merge all partial index files into the final on-disk index at
/// `output_path`, using bounded memory (a k-way min-heap merge over the
/// sorted partial files).
///
/// Posting-list offsets are only known after the data is written, so the
/// file is produced in two passes: pass 1 writes header (with a placeholder
/// trigram count) + posting data + file table; pass 2 reads that payload
/// back and rewrites the file as header + directory + payload, shifting all
/// data offsets by the directory's size. Partial files and the temp dir are
/// removed (best-effort) on success.
fn merge_partial_indices_to_file(&mut self, output_path: &Path) -> Result<()> {
    use std::io::{BufReader, BufWriter, Read};
    use std::cmp::Ordering;
    use std::collections::BinaryHeap;
    log::info!("Streaming merge of {} partial indices to {:?}",
    self.partial_indices.len(), output_path);
    // Cursor over one partial file plus the most recently decoded record.
    struct PartialIndexReader {
        reader: BufReader<File>,
        current_trigram: Option<Trigram>,
        current_posting_list: Vec<FileLocation>,
        reader_id: usize,
    }
    let mut readers: Vec<PartialIndexReader> = Vec::new();
    for (idx, partial_path) in self.partial_indices.iter().enumerate() {
        let file = File::open(partial_path)
            .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
        let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
        // Skip the u64 entry-count header; the merge just reads until EOF.
        let mut buf = [0u8; 8];
        reader.read_exact(&mut buf)?;
        readers.push(PartialIndexReader {
            reader,
            current_trigram: None,
            current_posting_list: Vec::new(),
            reader_id: idx,
        });
    }
    // Decode the next (trigram, posting list) record into the reader's
    // cursor fields. Returns Ok(false) on clean EOF at a record boundary.
    fn read_next_trigram(reader: &mut PartialIndexReader) -> Result<bool> {
        let mut trigram_buf = [0u8; 4];
        match reader.reader.read_exact(&mut trigram_buf) {
            Ok(_) => {
                let trigram = u32::from_le_bytes(trigram_buf);
                let mut len_buf = [0u8; 4];
                reader.reader.read_exact(&mut len_buf)?;
                let list_len = u32::from_le_bytes(len_buf) as usize;
                let mut locations = Vec::with_capacity(list_len);
                for _ in 0..list_len {
                    // Fixed 12-byte record: file_id, line_no, byte_offset (LE).
                    let mut loc_buf = [0u8; 12];
                    reader.reader.read_exact(&mut loc_buf)?;
                    let file_id = u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
                    let line_no = u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
                    let byte_offset = u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
                    locations.push(FileLocation { file_id, line_no, byte_offset });
                }
                reader.current_trigram = Some(trigram);
                reader.current_posting_list = locations;
                Ok(true)
            }
            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
                reader.current_trigram = None;
                Ok(false)
            }
            Err(e) => Err(e.into()),
        }
    }
    // Prime every reader with its first record.
    for reader in &mut readers {
        read_next_trigram(reader)?;
    }
    // Heap entry with reversed comparisons so std's max-heap acts as a
    // min-heap keyed on (trigram, reader_id).
    #[derive(Eq, PartialEq)]
    struct HeapEntry {
        trigram: Trigram,
        reader_id: usize,
    }
    impl Ord for HeapEntry {
        fn cmp(&self, other: &Self) -> Ordering {
            other.trigram.cmp(&self.trigram)
                .then_with(|| other.reader_id.cmp(&self.reader_id))
        }
    }
    impl PartialOrd for HeapEntry {
        fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
            Some(self.cmp(other))
        }
    }
    let mut heap: BinaryHeap<HeapEntry> = BinaryHeap::new();
    for reader in &readers {
        if let Some(trigram) = reader.current_trigram {
            heap.push(HeapEntry {
                trigram,
                reader_id: reader.reader_id,
            });
        }
    }
    let file = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open(output_path)
        .with_context(|| format!("Failed to create {}", output_path.display()))?;
    let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
    // Pass 1 header: the trigram count is a placeholder (0); the real count
    // is only known after the merge and is written in pass 2.
    writer.write_all(MAGIC)?;
    writer.write_all(&VERSION.to_le_bytes())?;
    writer.write_all(&0u64.to_le_bytes())?;
    writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
    let mut directory: Vec<DirectoryEntry> = Vec::new();
    let mut num_trigrams = 0u64;
    let mut current_trigram: Option<Trigram> = None;
    let mut merged_locations: Vec<FileLocation> = Vec::new();
    // Pop records in ascending trigram order; a trigram present in several
    // partial files has its lists concatenated, then sorted + deduped when
    // the trigram changes.
    while let Some(entry) = heap.pop() {
        let reader = &mut readers[entry.reader_id];
        if current_trigram.is_some() && current_trigram != Some(entry.trigram) {
            // Trigram changed: flush the accumulated posting list.
            let trigram = current_trigram.unwrap();
            merged_locations.sort_unstable();
            merged_locations.dedup();
            let data_offset = writer.stream_position()?;
            let compressed_size = self.write_compressed_posting_list(&mut writer, &merged_locations)?;
            directory.push(DirectoryEntry {
                trigram,
                data_offset,
                compressed_size,
            });
            num_trigrams += 1;
            merged_locations.clear();
        }
        current_trigram = Some(entry.trigram);
        merged_locations.extend_from_slice(&reader.current_posting_list);
        // Refill this reader and re-insert it into the heap unless at EOF.
        if read_next_trigram(reader)? {
            if let Some(next_trigram) = reader.current_trigram {
                heap.push(HeapEntry {
                    trigram: next_trigram,
                    reader_id: entry.reader_id,
                });
            }
        }
    }
    // Flush the final trigram's accumulated posting list.
    if let Some(trigram) = current_trigram {
        merged_locations.sort_unstable();
        merged_locations.dedup();
        let data_offset = writer.stream_position()?;
        let compressed_size = self.write_compressed_posting_list(&mut writer, &merged_locations)?;
        directory.push(DirectoryEntry {
            trigram,
            data_offset,
            compressed_size,
        });
        num_trigrams += 1;
    }
    log::info!("Merged {} trigrams from {} partial indices", num_trigrams, self.partial_indices.len());
    let _data_end_pos = writer.stream_position()?;
    // File table: varint-length-prefixed UTF-8 paths in file_id order.
    for file_path in &self.files {
        let path_str = file_path.to_string_lossy();
        let path_bytes = path_str.as_bytes();
        write_varint(&mut writer, path_bytes.len() as u32)?;
        writer.write_all(path_bytes)?;
    }
    writer.flush()?;
    drop(writer);
    use std::io::{Seek, SeekFrom};
    // Pass 2: read back everything after the header, then rewrite the file
    // as header + directory + payload.
    let mut temp_data = Vec::new();
    {
        let mut file = File::open(output_path)?;
        file.seek(SeekFrom::Start(HEADER_SIZE as u64))?;
        file.read_to_end(&mut temp_data)?;
    }
    let file = OpenOptions::new().write(true).truncate(true).open(output_path)?;
    let mut writer = BufWriter::with_capacity(16 * 1024 * 1024, file);
    writer.write_all(MAGIC)?;
    writer.write_all(&VERSION.to_le_bytes())?;
    writer.write_all(&num_trigrams.to_le_bytes())?;
    writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
    for entry in &directory {
        writer.write_all(&entry.trigram.to_le_bytes())?;
        // Offsets recorded in pass 1 exclude the directory (16 bytes per
        // entry) that now sits between the header and the data — shift them.
        let adjusted_offset = entry.data_offset + (directory.len() * 16) as u64;
        writer.write_all(&adjusted_offset.to_le_bytes())?;
        writer.write_all(&entry.compressed_size.to_le_bytes())?;
    }
    writer.write_all(&temp_data)?;
    writer.flush()?;
    writer.get_ref().sync_all()?;
    // Best-effort cleanup of partial files and their directory.
    for partial_path in &self.partial_indices {
        let _ = std::fs::remove_file(partial_path);
    }
    if let Some(ref temp_dir) = self.temp_dir {
        let _ = std::fs::remove_dir(temp_dir);
    }
    log::info!("Wrote {} trigrams to {:?}", num_trigrams, output_path);
    Ok(())
}
fn write_compressed_posting_list(
&self,
writer: &mut impl Write,
locations: &[FileLocation],
) -> Result<u32> {
let mut compressed = Vec::new();
let mut prev_file_id = 0u32;
let mut prev_line_no = 0u32;
let mut prev_byte_offset = 0u32;
for loc in locations {
let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
write_varint(&mut compressed, file_id_delta)?;
write_varint(&mut compressed, line_no_delta)?;
write_varint(&mut compressed, byte_offset_delta)?;
prev_file_id = loc.file_id;
prev_line_no = loc.line_no;
prev_byte_offset = loc.byte_offset;
}
let compressed_size = compressed.len() as u32;
writer.write_all(&compressed)?;
Ok(compressed_size)
}
/// In-memory merge of all partial index files into `self.index`.
///
/// Reads every partial file fully into RAM — superseded by the streaming
/// `merge_partial_indices_to_file`, hence `dead_code`; kept for reference.
#[allow(dead_code)]
fn merge_partial_indices(&mut self) -> Result<()> {
    use std::io::{BufReader, Read};
    let mut all_entries: Vec<(Trigram, FileLocation)> = Vec::new();
    for partial_path in &self.partial_indices {
        let file = File::open(partial_path)
            .with_context(|| format!("Failed to open partial index: {:?}", partial_path))?;
        let mut reader = BufReader::with_capacity(16 * 1024 * 1024, file);
        // Partial-file layout: u64 count, then per trigram a u32 key,
        // u32 list length and 12 bytes per location (see write_partial_index).
        let mut buf = [0u8; 8];
        reader.read_exact(&mut buf)?;
        let num_trigrams = u64::from_le_bytes(buf) as usize;
        for _ in 0..num_trigrams {
            let mut trigram_buf = [0u8; 4];
            reader.read_exact(&mut trigram_buf)?;
            let trigram = u32::from_le_bytes(trigram_buf);
            let mut len_buf = [0u8; 4];
            reader.read_exact(&mut len_buf)?;
            let list_len = u32::from_le_bytes(len_buf) as usize;
            for _ in 0..list_len {
                let mut loc_buf = [0u8; 12];
                reader.read_exact(&mut loc_buf)?;
                let file_id = u32::from_le_bytes([loc_buf[0], loc_buf[1], loc_buf[2], loc_buf[3]]);
                let line_no = u32::from_le_bytes([loc_buf[4], loc_buf[5], loc_buf[6], loc_buf[7]]);
                let byte_offset = u32::from_le_bytes([loc_buf[8], loc_buf[9], loc_buf[10], loc_buf[11]]);
                all_entries.push((trigram, FileLocation { file_id, line_no, byte_offset }));
            }
        }
    }
    log::info!("Read {} total trigram entries from {} partial indices",
    all_entries.len(), self.partial_indices.len());
    // Group by trigram, then sort/dedup each list and order by trigram so
    // binary_search works on the result.
    let mut index_map: HashMap<Trigram, Vec<FileLocation>> = HashMap::new();
    for (trigram, location) in all_entries {
        index_map
            .entry(trigram)
            .or_insert_with(Vec::new)
            .push(location);
    }
    self.index = index_map.into_iter().collect();
    for (_, list) in self.index.iter_mut() {
        list.sort_unstable();
        list.dedup();
    }
    self.index.sort_unstable_by_key(|(trigram, _)| *trigram);
    // Best-effort cleanup of partial files and temp dir.
    for partial_path in &self.partial_indices {
        let _ = std::fs::remove_file(partial_path);
    }
    if let Some(ref temp_dir) = self.temp_dir {
        let _ = std::fs::remove_dir(temp_dir);
    }
    log::info!("Merged into final index with {} trigrams", self.index.len());
    Ok(())
}
/// Return all locations whose (file, line) contains *every* trigram of
/// `pattern`, using either the mmap-backed directory (after `load()`) or
/// the in-memory index.
///
/// Patterns shorter than three bytes cannot be trigram-matched and yield no
/// results. Requiring all trigrams means no false negatives for literal
/// substring search; callers must still confirm the actual match on the
/// line, since co-occurrence of trigrams is not a guarantee.
pub fn search(&self, pattern: &str) -> Vec<FileLocation> {
    if pattern.len() < 3 {
        return vec![];
    }
    let trigrams = extract_trigrams(pattern);
    if trigrams.is_empty() {
        return vec![];
    }
    if let Some(ref mmap) = self.mmap {
        // Lazy path: decompress each needed posting list from the mmap.
        let mut posting_lists: Vec<Vec<FileLocation>> = Vec::new();
        for trigram in &trigrams {
            match self.directory.binary_search_by_key(trigram, |e| e.trigram) {
                Ok(idx) => {
                    let entry = &self.directory[idx];
                    match decompress_posting_list(mmap, entry.data_offset, entry.compressed_size) {
                        Ok(locations) => posting_lists.push(locations),
                        Err(e) => {
                            // Corrupt data: log and treat as no match rather
                            // than propagate an error from search().
                            log::warn!("Failed to decompress posting list for trigram {}: {}", trigram, e);
                            return vec![];
                        }
                    }
                }
                Err(_) => {
                    // A pattern trigram absent from the index → no match.
                    return vec![];
                }
            }
        }
        if posting_lists.is_empty() || posting_lists.len() < trigrams.len() {
            return vec![];
        }
        // Intersect smallest lists first to shrink candidates early.
        posting_lists.sort_by_key(|list| list.len());
        intersect_by_file_owned(&posting_lists)
    } else {
        // In-memory path: borrow posting lists straight from `index`.
        let mut posting_lists: Vec<&Vec<FileLocation>> = trigrams
            .iter()
            .filter_map(|t| {
                self.index
                    .binary_search_by_key(t, |(trigram, _)| *trigram)
                    .ok()
                    .map(|idx| &self.index[idx].1)
            })
            .collect();
        if posting_lists.is_empty() {
            return vec![];
        }
        // Any missing trigram rules out the whole pattern.
        if posting_lists.len() < trigrams.len() {
            return vec![];
        }
        posting_lists.sort_by_key(|list| list.len());
        intersect_by_file(&posting_lists)
    }
}
/// Look up the in-memory posting list for `trigram`. Only consults the
/// in-memory `index` — the mmap-backed directory is not used here.
pub fn get_posting_list(&self, trigram: Trigram) -> Option<&Vec<FileLocation>> {
    match self.index.binary_search_by_key(&trigram, |(t, _)| *t) {
        Ok(idx) => Some(&self.index[idx].1),
        Err(_) => None,
    }
}
/// Persist the index to `path` in the lazy-loadable on-disk format:
/// 24-byte header, directory (16 bytes per trigram: u32 key, u64 offset,
/// u32 size), delta+varint-compressed posting lists, then
/// varint-length-prefixed file paths. All integers little-endian.
///
/// In batch-flush mode this delegates to the streaming merge instead of
/// serializing `self.index`.
pub fn write(&mut self, path: impl AsRef<Path>) -> Result<()> {
    let path = path.as_ref();
    if !self.partial_indices.is_empty() {
        log::info!("Using streaming merge to write {} partial indices", self.partial_indices.len());
        return self.merge_partial_indices_to_file(path);
    }
    let file = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open(path)
        .with_context(|| format!("Failed to create {}", path.display()))?;
    let mut writer = std::io::BufWriter::with_capacity(16 * 1024 * 1024, file);
    writer.write_all(MAGIC)?;
    writer.write_all(&VERSION.to_le_bytes())?;
    writer.write_all(&(self.index.len() as u64).to_le_bytes())?;
    writer.write_all(&(self.files.len() as u64).to_le_bytes())?;
    // Compress every posting list up front so each directory entry can
    // record its absolute data offset before anything past the header is
    // written.
    let mut directory: Vec<DirectoryEntry> = Vec::with_capacity(self.index.len());
    let directory_start = HEADER_SIZE as u64;
    let directory_size = self.index.len() * 16;
    let data_start = directory_start + directory_size as u64;
    let mut current_offset = data_start;
    let mut compressed_lists: Vec<(Trigram, Vec<u8>)> = Vec::with_capacity(self.index.len());
    for (trigram, locations) in &self.index {
        // Delta-encode against the previous location; wrapping_sub pairs
        // with the wrapping_add in decompress_posting_list.
        // NOTE(review): duplicates write_compressed_posting_list — consider
        // reusing it.
        let mut compressed = Vec::new();
        let mut prev_file_id = 0u32;
        let mut prev_line_no = 0u32;
        let mut prev_byte_offset = 0u32;
        for loc in locations {
            let file_id_delta = loc.file_id.wrapping_sub(prev_file_id);
            let line_no_delta = loc.line_no.wrapping_sub(prev_line_no);
            let byte_offset_delta = loc.byte_offset.wrapping_sub(prev_byte_offset);
            write_varint(&mut compressed, file_id_delta)?;
            write_varint(&mut compressed, line_no_delta)?;
            write_varint(&mut compressed, byte_offset_delta)?;
            prev_file_id = loc.file_id;
            prev_line_no = loc.line_no;
            prev_byte_offset = loc.byte_offset;
        }
        directory.push(DirectoryEntry {
            trigram: *trigram,
            data_offset: current_offset,
            compressed_size: compressed.len() as u32,
        });
        current_offset += compressed.len() as u64;
        compressed_lists.push((*trigram, compressed));
    }
    // Directory, then data in the same order so recorded offsets line up.
    for entry in &directory {
        writer.write_all(&entry.trigram.to_le_bytes())?;
        writer.write_all(&entry.data_offset.to_le_bytes())?;
        writer.write_all(&entry.compressed_size.to_le_bytes())?;
    }
    for (_, compressed) in &compressed_lists {
        writer.write_all(compressed)?;
    }
    // File table: varint-length-prefixed UTF-8 paths in file_id order.
    for file_path in &self.files {
        let path_str = file_path.to_string_lossy();
        let path_bytes = path_str.as_bytes();
        write_varint(&mut writer, path_bytes.len() as u32)?;
        writer.write_all(path_bytes)?;
    }
    writer.flush()?;
    writer.get_ref().sync_all()?;
    log::info!(
        "Wrote lazy-loadable trigram index: {} trigrams, {} files to {:?}",
        self.index.len(),
        self.files.len(),
        path
    );
    Ok(())
}
/// Open an index written by `write()` in lazy mode: the file is mmapped,
/// only the header, directory and file table are parsed eagerly, and
/// posting lists are decompressed on demand by `search()`.
///
/// # Errors
/// Fails on I/O problems, wrong magic bytes, version mismatch, or a
/// truncated / malformed file.
pub fn load(path: impl AsRef<Path>) -> Result<Self> {
    let path = path.as_ref();
    let file = File::open(path)
        .with_context(|| format!("Failed to open {}", path.display()))?;
    // SAFETY: mmap is unsafe because another process could mutate the file
    // while it is mapped; we assume the index file is not concurrently
    // modified.
    let mmap = unsafe {
        memmap2::Mmap::map(&file)
            .with_context(|| format!("Failed to mmap {}", path.display()))?
    };
    if mmap.len() < HEADER_SIZE {
        anyhow::bail!("trigrams.bin too small (expected at least {} bytes)", HEADER_SIZE);
    }
    if &mmap[0..4] != MAGIC {
        anyhow::bail!("Invalid trigrams.bin (wrong magic bytes)");
    }
    let version = u32::from_le_bytes([mmap[4], mmap[5], mmap[6], mmap[7]]);
    if version != VERSION {
        anyhow::bail!(
            "Unsupported trigrams.bin version: {} (expected {}). Please re-index with 'reflex index'.",
            version, VERSION
        );
    }
    let num_trigrams = u64::from_le_bytes([
        mmap[8], mmap[9], mmap[10], mmap[11],
        mmap[12], mmap[13], mmap[14], mmap[15],
    ]) as usize;
    let num_files = u64::from_le_bytes([
        mmap[16], mmap[17], mmap[18], mmap[19],
        mmap[20], mmap[21], mmap[22], mmap[23],
    ]) as usize;
    log::debug!("Loading lazy trigram index: {} trigrams, {} files", num_trigrams, num_files);
    // Parse the directory: 16 bytes per entry (u32 trigram, u64 absolute
    // data offset, u32 compressed size), directly after the header.
    let mut directory = Vec::with_capacity(num_trigrams);
    let mut pos = HEADER_SIZE;
    let directory_size = num_trigrams * 16;
    for _ in 0..num_trigrams {
        if pos + 16 > mmap.len() {
            anyhow::bail!("Truncated directory entry at pos={}", pos);
        }
        let trigram = u32::from_le_bytes([
            mmap[pos],
            mmap[pos + 1],
            mmap[pos + 2],
            mmap[pos + 3],
        ]);
        pos += 4;
        let data_offset = u64::from_le_bytes([
            mmap[pos],
            mmap[pos + 1],
            mmap[pos + 2],
            mmap[pos + 3],
            mmap[pos + 4],
            mmap[pos + 5],
            mmap[pos + 6],
            mmap[pos + 7],
        ]);
        pos += 8;
        let compressed_size = u32::from_le_bytes([
            mmap[pos],
            mmap[pos + 1],
            mmap[pos + 2],
            mmap[pos + 3],
        ]);
        pos += 4;
        directory.push(DirectoryEntry {
            trigram,
            data_offset,
            compressed_size,
        });
    }
    // search() binary-searches the directory; the writer already emits it
    // sorted, so this is defensive.
    directory.sort_unstable_by_key(|e| e.trigram);
    // The posting data is contiguous after the directory, so the file table
    // starts at header + directory + sum of compressed sizes.
    let data_section_size: u64 = directory.iter().map(|e| e.compressed_size as u64).sum();
    let files_section_offset = HEADER_SIZE + directory_size + data_section_size as usize;
    pos = files_section_offset;
    let mut files = Vec::with_capacity(num_files);
    for _ in 0..num_files {
        let (path_len, consumed) = read_varint(&mmap[pos..])?;
        pos += consumed;
        let path_len = path_len as usize;
        if pos + path_len > mmap.len() {
            anyhow::bail!("Truncated file path at pos={}", pos);
        }
        let path_bytes = &mmap[pos..pos + path_len];
        let path_str = std::str::from_utf8(path_bytes)
            .context("Invalid UTF-8 in file path")?;
        files.push(PathBuf::from(path_str));
        pos += path_len;
    }
    log::info!(
        "Loaded lazy trigram index: {} trigrams, {} files (directory: {} KB)",
        num_trigrams,
        num_files,
        directory_size / 1024
    );
    // Read-only: temp_index is None, so index_file() would panic.
    Ok(Self {
        index: Vec::new(),
        files,
        temp_index: None,
        mmap: Some(mmap),
        directory,
        partial_indices: Vec::new(),
        temp_dir: None,
    })
}
}
impl Default for TrigramIndex {
    /// Equivalent to `TrigramIndex::new()`: an empty index in building mode.
    fn default() -> Self {
        Self::new()
    }
}
pub fn extract_trigrams(text: &str) -> Vec<Trigram> {
let bytes = text.as_bytes();
let mut trigrams = Vec::new();
for i in 0..bytes.len().saturating_sub(2) {
let trigram = bytes_to_trigram(&bytes[i..i + 3]);
trigrams.push(trigram);
}
trigrams
}
/// Extract every trigram of `text` together with its `FileLocation`
/// (1-based line number and byte offset) for file `file_id`.
pub fn extract_trigrams_with_locations(text: &str, file_id: u32) -> Vec<(Trigram, FileLocation)> {
    let bytes = text.as_bytes();
    let mut out = Vec::new();
    let mut line = 1u32;
    for (offset, &b) in bytes.iter().enumerate() {
        // The counter is bumped before the trigram starting at this byte is
        // emitted, so a trigram beginning with '\n' reports the next line.
        if b == b'\n' {
            line += 1;
        }
        if let Some(window) = bytes.get(offset..offset + 3) {
            out.push((
                bytes_to_trigram(window),
                FileLocation::new(file_id, line, offset as u32),
            ));
        }
    }
    out
}
/// Pack exactly three bytes big-endian into the low 24 bits of a `Trigram`.
#[inline]
fn bytes_to_trigram(bytes: &[u8]) -> Trigram {
    debug_assert_eq!(bytes.len(), 3);
    u32::from_be_bytes([0, bytes[0], bytes[1], bytes[2]])
}
/// Inverse of `bytes_to_trigram`: unpack the low 24 bits into three bytes.
#[allow(dead_code)]
fn trigram_to_bytes(trigram: Trigram) -> [u8; 3] {
    let be = trigram.to_be_bytes();
    [be[1], be[2], be[3]]
}
fn intersect_by_file(lists: &[&Vec<FileLocation>]) -> Vec<FileLocation> {
if lists.is_empty() {
return vec![];
}
use std::collections::HashSet;
let mut candidates: HashSet<(u32, u32)> = lists[0]
.iter()
.map(|loc| (loc.file_id, loc.line_no))
.collect();
for &list in &lists[1..] {
let list_pairs: HashSet<(u32, u32)> = list
.iter()
.map(|loc| (loc.file_id, loc.line_no))
.collect();
candidates.retain(|pair| list_pairs.contains(pair));
}
let mut result = Vec::new();
for &(file_id, line_no) in &candidates {
if let Some(loc) = lists[0]
.iter()
.find(|loc| loc.file_id == file_id && loc.line_no == line_no)
{
result.push(*loc);
}
}
result.sort_unstable();
result
}
fn intersect_by_file_owned(lists: &[Vec<FileLocation>]) -> Vec<FileLocation> {
if lists.is_empty() {
return vec![];
}
use std::collections::HashSet;
let mut candidates: HashSet<(u32, u32)> = lists[0]
.iter()
.map(|loc| (loc.file_id, loc.line_no))
.collect();
for list in &lists[1..] {
let list_pairs: HashSet<(u32, u32)> = list
.iter()
.map(|loc| (loc.file_id, loc.line_no))
.collect();
candidates.retain(|pair| list_pairs.contains(pair));
}
let mut result = Vec::new();
for &(file_id, line_no) in &candidates {
if let Some(loc) = lists[0]
.iter()
.find(|loc| loc.file_id == file_id && loc.line_no == line_no)
{
result.push(*loc);
}
}
result.sort_unstable();
result
}
#[cfg(test)]
mod tests {
    use super::*;
    // One trigram per overlapping 3-byte window, in order.
    #[test]
    fn test_extract_trigrams() {
        let text = "hello";
        let trigrams = extract_trigrams(text);
        assert_eq!(trigrams.len(), 3);
        let expected = vec![
            bytes_to_trigram(b"hel"),
            bytes_to_trigram(b"ell"),
            bytes_to_trigram(b"llo"),
        ];
        assert_eq!(trigrams, expected);
    }
    // Inputs shorter than three bytes produce no trigrams.
    #[test]
    fn test_extract_trigrams_short() {
        assert_eq!(extract_trigrams("ab").len(), 0);
        assert_eq!(extract_trigrams("abc").len(), 1);
    }
    // Packing is deterministic and distinguishes distinct byte triples.
    #[test]
    fn test_bytes_to_trigram() {
        let trigram1 = bytes_to_trigram(b"abc");
        let trigram2 = bytes_to_trigram(b"abc");
        let trigram3 = bytes_to_trigram(b"xyz");
        assert_eq!(trigram1, trigram2);
        assert_ne!(trigram1, trigram3);
    }
    // bytes -> trigram -> bytes is lossless.
    #[test]
    fn test_trigram_roundtrip() {
        let original = b"foo";
        let trigram = bytes_to_trigram(original);
        let recovered = trigram_to_bytes(trigram);
        assert_eq!(original, &recovered);
    }
    // Locations carry 1-based line numbers and byte offsets; a trigram
    // starting after the newline reports line 2.
    #[test]
    fn test_extract_with_locations() {
        let text = "hello\nworld";
        let locs = extract_trigrams_with_locations(text, 0);
        assert_eq!(locs.len(), 9);
        assert_eq!(locs[0].1.line_no, 1);
        let world_start = text.find("world").unwrap();
        let world_trigram_idx = locs
            .iter()
            .position(|(_, loc)| loc.byte_offset as usize == world_start)
            .unwrap();
        assert_eq!(locs[world_trigram_idx].1.line_no, 2);
    }
    // End-to-end: index, finalize, then search for hits and a miss.
    #[test]
    fn test_trigram_index_basic() {
        let mut index = TrigramIndex::new();
        let file_id = index.add_file(PathBuf::from("test.txt"));
        index.index_file(file_id, "hello world");
        index.finalize();
        let results = index.search("hello");
        assert!(!results.is_empty());
        let results = index.search("world");
        assert!(!results.is_empty());
        let results = index.search("goodbye");
        assert!(results.is_empty());
    }
    // A pattern present in two files reports both file ids.
    #[test]
    fn test_search_multifile() {
        let mut index = TrigramIndex::new();
        let file1 = index.add_file(PathBuf::from("file1.txt"));
        let file2 = index.add_file(PathBuf::from("file2.txt"));
        index.index_file(file1, "extract_symbols is here");
        index.index_file(file2, "extract_symbols is also here");
        index.finalize();
        let results = index.search("extract_symbols");
        assert_eq!(results.len(), 2);
        let file_ids: Vec<u32> = results.iter().map(|loc| loc.file_id).collect();
        assert!(file_ids.contains(&file1));
        assert!(file_ids.contains(&file2));
    }
    // write() produces a file that starts with the magic bytes and is
    // larger than the fixed header.
    #[test]
    fn test_persistence_write() {
        use tempfile::TempDir;
        let temp = TempDir::new().unwrap();
        let trigrams_path = temp.path().join("trigrams.bin");
        let mut index = TrigramIndex::new();
        let file1 = index.add_file(PathBuf::from("src/main.rs"));
        let file2 = index.add_file(PathBuf::from("src/lib.rs"));
        index.index_file(file1, "fn main() { println!(\"hello\"); }");
        index.index_file(file2, "pub fn hello() -> String { String::from(\"hello\") }");
        index.finalize();
        index.write(&trigrams_path).unwrap();
        assert!(trigrams_path.exists());
        let metadata = std::fs::metadata(&trigrams_path).unwrap();
        assert!(metadata.len() > HEADER_SIZE as u64);
        use std::io::Read;
        let mut file = File::open(&trigrams_path).unwrap();
        let mut magic = [0u8; 4];
        file.read_exact(&mut magic).unwrap();
        assert_eq!(&magic, MAGIC);
    }
}
}