moeix 0.2.7

High-performance trigram code search for humans and AI agents.
Documentation
//! Index reader — the mmap-based query-time interface.
//!
//! Fast, zero-copy access to the index data.

use crate::bloom::BloomFilter;
use crate::error::{Error, Result};
use crate::format::*;
use crate::posting::PostingList;
use crate::string_pool::StringPoolReader;
use crate::trigram::Trigram;
use memmap2::Mmap;
use std::fs::File;
use std::path::{Path, PathBuf};
use std::time::UNIX_EPOCH;

#[cfg(unix)]
use std::os::unix::fs::MetadataExt;

/// Summary counters for one index shard, copied out of its header.
///
/// Produced by `Reader::metadata`.
#[derive(Debug, Clone, Copy)]
pub struct ShardMetadata {
    /// The header's `created_at` value, copied verbatim.
    // NOTE(review): unit/epoch of `created_at` is not visible in this file — confirm.
    pub shard_timestamp: u64,
    /// Number of file-table entries recorded in the header.
    pub file_count: u32,
    /// Number of trigram-table entries recorded in the header.
    pub trigram_count: u32,
}

/// Query-time handle on a single memory-mapped index shard.
///
/// The reader owns the mapping and hands out data decoded from it: trigram
/// table entries, posting lists, file-table entries, and bloom filter probes.
///
/// # Field order
///
/// Rust drops struct fields in declaration order. `string_pool` borrows bytes
/// that live inside `mmap` (with the lifetime erased to `'static`), so it is
/// declared *before* `mmap` to guarantee the borrower is gone before the
/// mapping is unmapped. Do not reorder these two fields.
pub struct Reader {
    /// View over the string pool section of `mmap`. The `'static` lifetime is
    /// a fiction: the data is only valid while `mmap` is alive, so this field
    /// must never be exposed by value or outlive the `Reader`.
    string_pool: StringPoolReader<'static>,
    /// Parsed copy of the shard header.
    pub header: Header,
    /// Read-only mapping of the entire shard file.
    mmap: Mmap,
    /// Inode of the shard file at open time (`None` on non-Unix targets);
    /// consulted by `is_stale`.
    inode: Option<u64>,
}

/// Decoded trigram-table entry, as returned by `Reader::get_trigram`.
#[derive(Debug)]
pub struct TrigramInfo {
    /// Byte offset of the posting list from the start of the mapped file.
    /// Stored on disk as a 6-byte little-endian integer.
    pub posting_offset: u64,
    /// Length of the encoded posting list in bytes.
    pub posting_length: u32,
    /// Value of the entry's doc-frequency field.
    // NOTE(review): presumably the number of files containing the trigram — confirm with the writer.
    pub doc_frequency: u32,
}

/// Decoded file-table entry, as returned by `Reader::get_file`.
#[derive(Debug)]
pub struct FileInfo {
    /// Index of the entry in the file table (the id the caller asked for).
    pub file_id: u32,
    /// Path resolved from the shard's string pool.
    pub path: PathBuf,
    /// Status decoded from the entry's status byte.
    pub status: FileStatus,
    /// Modification time stored in the entry.
    // NOTE(review): the name implies nanoseconds since the Unix epoch — confirm with the writer.
    pub mtime_ns: u64,
    /// Size field stored in the entry (presumably the file size in bytes).
    pub size_bytes: u64,
    /// Content hash stored in the entry; the hash function is not visible here.
    pub content_hash: u64,
}

impl Reader {
    pub fn open(path: &Path) -> Result<Self> {
        let file = File::open(path)?;

        // SAFETY: Mmap::map wraps the mmap(2) syscall. The file handle is kept alive
        // by Mmap's internal Arc<File>, ensuring the underlying data remains valid
        // for the lifetime of the mmap.
        let mmap = unsafe { Mmap::map(&file)? };

        if mmap.len() < HEADER_SIZE {
            return Err(Error::IndexTooSmall);
        }

        let header = Header::parse(&mmap[0..HEADER_SIZE])?;
        header.validate_bounds(mmap.len() as u64)?;

        #[cfg(unix)]
        let inode = Some(file.metadata()?.ino());

        #[cfg(not(unix))]
        let inode = None;

        // SAFETY: We transmute the slice lifetime to 'static. This is sound because:
        // INVARIANT: Reader owns the Mmap, which owns the underlying memory.
        // INVARIANT: Mmap's data remains valid for the entire lifetime of Reader.
        // INVARIANT: No mutable access to mmap occurs after construction.
        // INVARIANT: StringPoolReader<'static> cannot outlive Reader (it's a field).
        // This is the standard pattern for self-referential mmap structs in Rust.
        let string_pool_data: &'static [u8] = unsafe {
            let start = header.string_pool_offset as usize;
            let end = (header.string_pool_offset + header.string_pool_size) as usize;
            std::mem::transmute::<&[u8], &'static [u8]>(&mmap[start..end])
        };
        let string_pool = StringPoolReader::new(string_pool_data)?;

        Ok(Self {
            mmap,
            header,
            string_pool,
            inode,
        })
    }

    pub fn get_last_modified(root: &Path) -> Result<u64> {
        let mut last_modified = 0u64;
        let walker = ignore::WalkBuilder::new(root)
            .hidden(false)
            .git_ignore(true)
            .require_git(false)
            .add_custom_ignore_filename(".ixignore")
            .filter_entry(move |entry| {
                let path = entry.path();
                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");

                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
                    && matches!(
                        name,
                        "lost+found"
                            | ".git"
                            | "node_modules"
                            | "target"
                            | "__pycache__"
                            | ".tox"
                            | ".venv"
                            | "venv"
                            | ".ix"
                    )
                {
                    return false;
                }

                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
                    if matches!(
                        ext,
                        "so" | "o"
                            | "dylib"
                            | "a"
                            | "dll"
                            | "exe"
                            | "pyc"
                            | "jpg"
                            | "png"
                            | "gif"
                            | "mp4"
                            | "mp3"
                            | "pdf"
                            | "zip"
                            | "7z"
                            | "rar"
                            | "sqlite"
                            | "db"
                            | "bin"
                    ) || name.ends_with(".tar.gz")
                    {
                        return false;
                    }
                }
                true
            })
            .build();

        for result in walker {
            match result {
                Ok(entry) => {
                    if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
                        let metadata =
                            entry.metadata().map_err(|e| Error::Config(e.to_string()))?;
                        let mtime = metadata
                            .modified()
                            .and_then(|t| {
                                t.duration_since(UNIX_EPOCH)
                                    .map_err(|_| std::io::Error::other("time went backwards"))
                            })
                            .map(|d| d.as_micros() as u64)
                            .unwrap_or(0);
                        if mtime > last_modified {
                            last_modified = mtime;
                        }
                    }
                }
                Err(e) => {
                    eprintln!("ix: warning: stale check skipping path: {}", e);
                }
            }
        }
        Ok(last_modified)
    }

    /// Looks up `trigram` in the trigram table by binary search on the
    /// little-endian `u32` key at the start of each entry.
    ///
    /// Entry layout as read here: key at bytes `0..4`, posting offset as a
    /// 6-byte little-endian integer at `4..10`, posting length (`u32`) at
    /// `10..14`, doc frequency (`u32`) at `14..18`.
    ///
    /// Returns `None` when the trigram is absent or when any read falls
    /// outside the mapping.
    pub fn get_trigram(&self, trigram: Trigram) -> Option<TrigramInfo> {
        let table_base = self.header.trigram_table_offset as usize;
        let le_u32 =
            |bytes: &[u8]| -> Option<u32> { bytes.try_into().ok().map(u32::from_le_bytes) };

        let (mut lo, mut hi) = (0usize, self.header.trigram_count as usize);
        while lo < hi {
            let probe = lo + (hi - lo) / 2;
            let at = table_base + probe * TRIGRAM_ENTRY_SIZE;
            let key = le_u32(self.mmap.get(at..at + 4)?)?;

            if key < trigram {
                lo = probe + 1;
                continue;
            }
            if key != trigram {
                hi = probe;
                continue;
            }

            // Exact hit: decode the remaining fields of the entry.
            let record = self.mmap.get(at..at + TRIGRAM_ENTRY_SIZE)?;

            // Widen the 48-bit on-disk offset to a u64.
            let mut wide = [0u8; 8];
            wide[..6].copy_from_slice(&record[4..10]);

            return Some(TrigramInfo {
                posting_offset: u64::from_le_bytes(wide),
                posting_length: le_u32(record.get(10..14)?)?,
                doc_frequency: le_u32(record.get(14..18)?)?,
            });
        }

        None
    }

    pub fn decode_postings(&self, info: &TrigramInfo) -> Result<PostingList> {
        let start = info.posting_offset as usize;
        let end = start + info.posting_length as usize;
        if end > self.mmap.len() {
            return Err(Error::PostingOutOfBounds);
        }
        PostingList::decode(&self.mmap[start..end])
    }

    pub fn get_file(&self, file_id: u32) -> Result<FileInfo> {
        if file_id >= self.header.file_count {
            return Err(Error::FileIdOutOfBounds(file_id));
        }

        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
        let entry = self
            .mmap
            .get(entry_off..entry_off + FILE_ENTRY_SIZE)
            .ok_or(Error::SectionOutOfBounds {
                section: "file_entry",
                offset: entry_off as u64,
                size: FILE_ENTRY_SIZE as u64,
                file_len: self.mmap.len() as u64,
            })?;

        let path_off = u32::from_le_bytes(
            entry[4..8]
                .try_into()
                .map_err(|_| Error::Config("invalid path offset".into()))?,
        );
        let status = FileStatus::from_u8(entry[10]);
        let mtime_ns = u64::from_le_bytes(
            entry[12..20]
                .try_into()
                .map_err(|_| Error::Config("invalid mtime".into()))?,
        );
        let size_bytes = u64::from_le_bytes(
            entry[20..28]
                .try_into()
                .map_err(|_| Error::Config("invalid size".into()))?,
        );
        let content_hash = u64::from_le_bytes(
            entry[28..36]
                .try_into()
                .map_err(|_| Error::Config("invalid hash".into()))?,
        );

        let path = self.string_pool.resolve(path_off)?;

        Ok(FileInfo {
            file_id,
            path: PathBuf::from(path),
            status,
            mtime_ns,
            size_bytes,
            content_hash,
        })
    }

    /// Probes the per-file bloom filter: `false` means the filter reports the
    /// trigram as absent from file `file_id`, `true` means it may be present.
    ///
    /// The answer is always the conservative `true` when the shard has no
    /// bloom section, when `file_id` is not a valid file-table index, or when
    /// any part of the filter lies outside the mapping.
    ///
    /// Layout as read here: the file entry holds a `u32` little-endian offset
    /// at bytes `40..44`, relative to the header's `bloom_offset`. The filter
    /// record starts with a `u16` little-endian byte size, has the hash count
    /// in the byte at offset 2, and its bit array starts at offset 4.
    pub fn bloom_may_contain(&self, file_id: u32, trigram: Trigram) -> bool {
        if !self.header.has_bloom() {
            return true;
        }

        // Same bound as `get_file`: without it an out-of-range id would read
        // bytes past the file table and interpret them as a bloom offset,
        // which could yield a bogus `false`.
        if file_id >= self.header.file_count {
            return true;
        }

        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
        let Some(bloom_bytes) = self.mmap.get(entry_off + 40..entry_off + 44) else {
            return true;
        };

        let bloom_rel_off = u32::from_le_bytes(
            bloom_bytes
                .try_into()
                .expect("bloom_bytes is exactly 4 bytes"),
        );
        let bloom_abs_off = self.header.bloom_offset as usize + bloom_rel_off as usize;

        let Some(size_bytes) = self.mmap.get(bloom_abs_off..bloom_abs_off + 2) else {
            return true;
        };
        let size = u16::from_le_bytes(
            size_bytes
                .try_into()
                .expect("size_bytes is exactly 2 bytes"),
        ) as usize;

        // If this byte is out of range the bit slice below is too, so the
        // fallback value of 0 is never actually used for a lookup.
        let num_hashes = self.mmap.get(bloom_abs_off + 2).copied().unwrap_or(0);
        let Some(bits) = self.mmap.get(bloom_abs_off + 4..bloom_abs_off + 4 + size) else {
            return true;
        };

        BloomFilter::slice_contains(bits, num_hashes, trigram)
    }

    /// Returns the shard's creation stamp and table sizes, copied from the
    /// parsed header.
    pub fn metadata(&self) -> ShardMetadata {
        let header = &self.header;
        ShardMetadata {
            file_count: header.file_count,
            trigram_count: header.trigram_count,
            shard_timestamp: header.created_at,
        }
    }

    /// Detect whether the shard file on disk has been rebuilt under this live mmap.
    ///
    /// Returns `true` if the inode or file size differs, or if the file no longer exists.
    /// A stale reader should be dropped and reopened.
    ///
    /// On Unix: uses inode comparison (inode changes on atomic rename).
    /// On non-Unix: uses file size comparison only (Windows file locking prevents
    /// rebuild under live mmap, so size-only detection is sufficient).
    pub fn is_stale(&self, path: &Path) -> bool {
        let current = match std::fs::metadata(path) {
            Ok(m) => m,
            Err(_) => return true,
        };

        if current.len() as usize != self.mmap.len() {
            return true;
        }

        #[cfg(unix)]
        {
            if let Some(stored_inode) = self.inode
                && current.ino() != stored_inode
            {
                return true;
            }
        }

        false
    }
}