use std::fs;
use std::io::Read;
use std::path::{Component, Path, PathBuf};
#[cfg(feature = "fs2")]
use fs2::FileExt;
#[cfg(feature = "rayon")]
use rayon::prelude::*;
use roaring::RoaringBitmap;
use xxhash_rust::xxh64::xxh64;
use crate::index::manifest::{Manifest, SegmentRef};
use crate::index::segment::SegmentWriter;
#[cfg(feature = "ignore")]
use crate::index::walk::enumerate_files;
use crate::index::walk::is_binary;
use crate::index::walk::{split_batches, FileRecord};
use crate::tokenizer::build_all;
use crate::{Config, IndexError};
/// Target amount of raw file content per indexing batch (256 MiB); each
/// batch is written out as one on-disk segment.
pub(super) const BATCH_SIZE_BYTES: u64 = 256 * 1024 * 1024;
/// A file to index that was supplied by an external caller rather than
/// discovered by walking the repository.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct ExternalFileRecord {
    // Full on-disk path used to open and read the file.
    pub absolute_path: PathBuf,
    // Path relative to the repo root; rejected during normalization if it
    // could escape the repo (`..`, root, or prefix components).
    pub relative_path: PathBuf,
    // Size in bytes, used to split the file list into content-sized batches.
    pub size_bytes: u64,
}
/// Estimates the query-planner threshold (fraction of candidate docs) above
/// which scanning files directly beats intersecting posting lists.
///
/// Times two micro-benchmarks on this machine — reading a sample of the
/// indexed files, and intersecting two synthetic roaring bitmaps — and
/// derives the break-even fraction from the per-doc and per-entry costs.
/// Falls back to a conservative default whenever either measurement is
/// unusable; the result is always clamped to `[0.01, 0.50]`.
pub(super) fn calibrate_threshold(indexed_paths: &[PathBuf]) -> f64 {
    const DEFAULT: f64 = 0.10;
    const SCAN_SAMPLE: usize = 100;
    const BITMAP_ENTRIES: u32 = 10_000;
    const BITMAP_REPS: u32 = 100;

    let total = indexed_paths.len();
    if total == 0 {
        return DEFAULT;
    }

    // Pick up to SCAN_SAMPLE paths, evenly strided across the whole list.
    let sample_count = SCAN_SAMPLE.min(total);
    let stride = (total / sample_count).max(1);
    let samples: Vec<&Path> = (0..sample_count)
        .map(|i| indexed_paths[(i * stride).min(total - 1)].as_path())
        .collect();

    // Warm-up pass: populate the OS page cache so the timed pass below
    // measures steady-state (cached) read cost, not cold-disk latency.
    for path in &samples {
        let _ = std::fs::read(path);
    }

    // Timed pass: accumulate wall time for every successful read.
    let mut docs_read = 0usize;
    let mut scan_elapsed_ns = 0u128;
    for path in &samples {
        let start = std::time::Instant::now();
        if std::fs::read(path).is_ok() {
            docs_read += 1;
            scan_elapsed_ns += start.elapsed().as_nanos();
        }
    }
    if docs_read == 0 || scan_elapsed_ns == 0 {
        return DEFAULT;
    }
    let scan_ns_per_doc = (scan_elapsed_ns / docs_read as u128) as u64;

    // Posting benchmark: intersect a dense bitmap with an even-only bitmap
    // of the same cardinality, repeated to get a measurable duration.
    let dense: RoaringBitmap = (0..BITMAP_ENTRIES).collect();
    let evens: RoaringBitmap = (0..BITMAP_ENTRIES * 2).step_by(2).collect();
    let bench_start = std::time::Instant::now();
    for _ in 0..BITMAP_REPS {
        let _ = &dense & &evens;
    }
    let posting_elapsed_ns = bench_start.elapsed().as_nanos();
    // Both operands contribute entries, hence the factor of 2.
    let total_entries_processed = u64::from(BITMAP_ENTRIES) * u64::from(BITMAP_REPS) * 2;
    if posting_elapsed_ns == 0 {
        return DEFAULT;
    }
    let posting_ns_per_entry = (posting_elapsed_ns / u128::from(total_entries_processed)) as u64;
    if posting_ns_per_entry == 0 {
        // Intersection is effectively free here; favor postings maximally.
        return 0.50;
    }
    let threshold = scan_ns_per_doc as f64 / (scan_ns_per_doc + posting_ns_per_entry) as f64;
    threshold.clamp(0.01, 0.50)
}
/// Builds a full index for `config` using the default batch size, walking
/// the repository to discover candidate files.
#[cfg(feature = "ignore")]
pub(super) fn build_index(config: Config) -> Result<super::Index, IndexError> {
    let candidates = enumerate_files(&config)?;
    build_index_from_file_list(config, candidates, BATCH_SIZE_BYTES)
}
/// Test-only variant of [`build_index`] that lets a caller control the
/// batch size, so segment-splitting behavior can be exercised cheaply.
#[cfg(all(test, feature = "ignore"))]
pub(super) fn build_index_with_batch_size(
    config: Config,
    batch_size_bytes: u64,
) -> Result<super::Index, IndexError> {
    let candidates = enumerate_files(&config)?;
    build_index_from_file_list(config, candidates, batch_size_bytes)
}
/// Builds an index from caller-supplied file records instead of a repo
/// walk. Records are validated and normalized first; a record whose
/// relative path could escape the repo aborts the build.
pub(super) fn build_index_from_external_records(
    config: Config,
    records: Vec<ExternalFileRecord>,
) -> Result<super::Index, IndexError> {
    let file_list = normalize_external_records(records)?;
    build_index_from_file_list(config, file_list, BATCH_SIZE_BYTES)
}
/// Converts external records into the internal `FileRecord` form.
///
/// Rejects any relative path containing `..`, a root, or a drive prefix
/// (it could point outside the repo), normalizes separators to forward
/// slashes, and sorts the result by normalized relative path. Returns
/// `IndexError::PathOutsideRepo` on the first offending record.
fn normalize_external_records(
    records: Vec<ExternalFileRecord>,
) -> Result<Vec<FileRecord>, IndexError> {
    let mut normalized: Vec<FileRecord> = records
        .into_iter()
        .map(|record| {
            let escapes_repo = record.relative_path.components().any(|component| {
                matches!(
                    component,
                    Component::ParentDir | Component::RootDir | Component::Prefix(_)
                )
            });
            if escapes_repo {
                return Err(IndexError::PathOutsideRepo(record.relative_path));
            }
            Ok((
                record.absolute_path,
                crate::path_util::normalize_to_forward_slashes(record.relative_path),
                record.size_bytes,
            ))
        })
        .collect::<Result<_, _>>()?;
    // Deterministic ordering keeps doc-id assignment stable across runs.
    normalized.sort_unstable_by(|left, right| left.1.cmp(&right.1));
    Ok(normalized)
}
/// Core indexing pipeline: splits `file_list` into content-sized batches,
/// tokenizes each batch (in parallel when the `rayon` feature is on),
/// writes one segment per non-empty batch, saves a manifest, optionally
/// builds the symbol database, then opens the finished index.
///
/// Holds an exclusive directory lock for the whole build and downgrades it
/// to a shared lock before handing it to the opened index.
fn build_index_from_file_list(
    config: Config,
    file_list: Vec<FileRecord>,
    batch_size_bytes: u64,
) -> Result<super::Index, IndexError> {
    fs::create_dir_all(&config.index_dir)?;
    // On Unix, restrict the index directory to the owner — it contains a
    // copy of repository content.
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        fs::set_permissions(&config.index_dir, fs::Permissions::from_mode(0o700))?;
    }
    // Take the directory lock exclusively; a concurrent builder gets a
    // LockConflict instead of corrupting segments.
    let lock_file = super::helpers::open_dir_lock_file(&config.index_dir)?;
    lock_file
        .try_lock_exclusive()
        .map_err(|_| IndexError::LockConflict(config.index_dir.clone()))?;
    let write_lock = super::helpers::acquire_writer_lock(&config.index_dir)?;
    // Best-effort cleanup of segments orphaned by a previous interrupted
    // build; failure here only warns, it never aborts the build.
    if let Ok(prev_manifest) = Manifest::load(&config.index_dir) {
        if let Err(e) = prev_manifest.gc_orphan_segments(&config.index_dir) {
            if config.verbose {
                eprintln!("syntext: startup gc: {e}");
            }
        }
    }
    let total_candidate = file_list.len();
    if config.verbose {
        eprintln!("syntext: indexing {} candidate files", total_candidate);
    }
    // `.max(1)` guards against a zero batch size producing no progress.
    let batches = split_batches(&file_list, batch_size_bytes.max(1));
    let mut seg_refs: Vec<SegmentRef> = Vec::new();
    // (absolute, relative) path pairs for every file actually indexed, in
    // doc-id order.
    let mut indexed_files: Vec<(PathBuf, PathBuf)> = Vec::with_capacity(total_candidate);
    // Doc ids are assigned sequentially across all batches.
    let mut next_doc_id: u32 = 0;
    for batch in &batches {
        let verbose = config.verbose;
        // Per-file map step: read and tokenize one file. Returns
        // `Some((content_hash, gram_hashes))`, or `None` to skip the file
        // (unreadable, binary, or failed the fd/stat consistency check).
        let map_fn = |(abs_path, _, _): &(PathBuf, PathBuf, u64)| -> Option<(u64, Vec<u64>)> {
            // Stat without following symlinks first, so the opened fd can
            // be verified against it below (TOCTOU guard).
            let pre_meta = match std::fs::symlink_metadata(abs_path) {
                Ok(m) => m,
                Err(e) => {
                    if verbose {
                        eprintln!("syntext: skipping {}: stat: {e}", abs_path.display());
                    }
                    return None;
                }
            };
            let mut file = match super::open_readonly_nofollow(abs_path) {
                Ok(f) => f,
                Err(e) => {
                    if verbose {
                        eprintln!("syntext: skipping {}: open: {e}", abs_path.display());
                    }
                    return None;
                }
            };
            // Skip the file if it was swapped out between stat and open.
            #[cfg(any(unix, windows))]
            if !super::verify_fd_matches_stat(&file, &pre_meta) {
                return None;
            }
            // On other platforms the pre-open stat is unused; silence the
            // unused-variable warning.
            #[cfg(not(any(unix, windows)))]
            let _ = &pre_meta;
            let mut raw = Vec::new();
            if let Err(e) = file.read_to_end(&mut raw) {
                if verbose {
                    eprintln!("syntext: skipping {}: read: {e}", abs_path.display());
                }
                return None;
            }
            let content = crate::index::normalize_encoding(&raw, config.verbose);
            // Binary files are not indexed.
            if is_binary(&content) {
                return None;
            }
            // Content hash (xxh64, seed 0) identifies the file version;
            // build_all produces the gram hashes to post.
            let hash = xxh64(content.as_ref(), 0);
            Some((hash, build_all(content.as_ref())))
        };
        // Parallel map over the batch when rayon is enabled; results stay
        // in batch order either way.
        #[cfg(feature = "rayon")]
        let results: Vec<Option<(u64, Vec<u64>)>> = batch.par_iter().map(map_fn).collect();
        #[cfg(not(feature = "rayon"))]
        let results: Vec<Option<(u64, Vec<u64>)>> = batch.iter().map(map_fn).collect();
        let batch_start_doc_id = next_doc_id;
        // NOTE(review): 120 looks like an estimated grams-per-doc capacity
        // hint — confirm against SegmentWriter::with_capacity.
        let mut writer = SegmentWriter::with_capacity(batch.len(), 120);
        for ((abs_path, rel_path, size), result) in batch.iter().zip(results.iter()) {
            if let Some((content_hash, grams)) = result {
                let doc_id = next_doc_id;
                // u32 doc-id space is a hard limit; overflow is an error,
                // not a wrap.
                next_doc_id = next_doc_id
                    .checked_add(1)
                    .ok_or(IndexError::DocIdOverflow {
                        base_doc_count: doc_id,
                        overlay_docs: 0,
                    })?;
                writer.add_document(doc_id, rel_path, *content_hash, *size);
                for &gram_hash in grams {
                    writer.add_gram_posting(gram_hash, doc_id);
                }
                indexed_files.push((abs_path.clone(), rel_path.clone()));
            } else {
                // File was skipped; nothing to record.
                let _ = abs_path;
            }
        }
        // Don't write an empty segment if every file in the batch was
        // skipped.
        if writer.doc_count() == 0 {
            continue;
        }
        let meta = writer.write_to_dir(&config.index_dir)?;
        // Sum of the content bytes that actually made it into the segment.
        let content_size: u64 = batch
            .iter()
            .zip(results.iter())
            .filter_map(|((_, _, size), r)| r.as_ref().map(|_| size))
            .sum();
        let dict_size = fs::metadata(config.index_dir.join(&meta.dict_filename))
            .map(|m| m.len())
            .unwrap_or(0);
        let post_size = fs::metadata(config.index_dir.join(&meta.post_filename))
            .map(|m| m.len())
            .unwrap_or(0);
        let seg_size = dict_size + post_size;
        // Heuristic sanity warning: a segment larger than half its source
        // content suggests pathological input.
        if config.verbose && seg_size > content_size / 2 && content_size > 0 {
            eprintln!(
                "syntext: warning: segment is {seg_size} bytes for {content_size} bytes content"
            );
        }
        let mut seg_ref: SegmentRef = meta.into();
        // Record where this segment's doc ids start in the global space.
        seg_ref.base_doc_id = Some(batch_start_doc_id);
        seg_refs.push(seg_ref);
    }
    let total_indexed = next_doc_id;
    // Calibrate the scan-vs-postings threshold against the files we just
    // indexed and persist it in the manifest.
    let scan_paths: Vec<PathBuf> = indexed_files.iter().map(|(abs, _)| abs.clone()).collect();
    let scan_threshold = calibrate_threshold(&scan_paths);
    if config.verbose {
        eprintln!("syntext: calibrated scan threshold: {:.3}", scan_threshold);
    }
    let mut manifest = Manifest::new(seg_refs, total_indexed);
    manifest.base_commit = super::helpers::current_repo_head(&config.repo_root)?;
    manifest.scan_threshold_fraction = Some(scan_threshold);
    manifest.save(&config.index_dir)?;
    // With the new manifest saved, segments from the previous generation
    // are now orphans and can be collected.
    manifest.gc_orphan_segments(&config.index_dir)?;
    if config.verbose {
        eprintln!(
            "syntext: indexed {} files into {} segment(s)",
            total_indexed,
            manifest.segments.len()
        );
    }
    // Optional symbol database: rebuilt from scratch each time; any
    // failure here is logged but never fails the index build.
    #[cfg(feature = "symbols")]
    {
        let db_path = config.index_dir.join("symbols.db");
        let _ = fs::remove_file(&db_path);
        match crate::symbol::SymbolIndex::open(&db_path) {
            Ok(sym_idx) => {
                // Position in `indexed_files` equals the doc id, since both
                // were assigned in the same order above.
                for (file_id, (abs_path, rel_path)) in indexed_files.iter().enumerate() {
                    let file_id = file_id as u32;
                    let Ok(raw) = fs::read(abs_path) else {
                        continue;
                    };
                    let content = crate::index::normalize_encoding(&raw, config.verbose);
                    if is_binary(&content) {
                        continue;
                    }
                    let rel_path_str = rel_path.to_string_lossy();
                    if let Err(e) = sym_idx.index_file(file_id, &rel_path_str, content.as_ref()) {
                        if config.verbose {
                            eprintln!(
                                "syntext: warning: symbol index failed for {}: {e}",
                                rel_path.display()
                            );
                        }
                    }
                }
                if config.verbose {
                    eprintln!("syntext: symbol index built");
                }
            }
            Err(e) => {
                if config.verbose {
                    eprintln!("syntext: warning: could not build symbol index: {e}");
                }
            }
        }
    }
    // Downgrade: release the exclusive lock, then re-acquire shared for
    // reading. NOTE(review): there is a brief window between unlock and
    // the shared lock where another process could take the lock — confirm
    // this race is acceptable.
    lock_file
        .unlock()
        .map_err(|e| IndexError::CorruptIndex(format!("failed to unlock dir lock: {e}")))?;
    lock_file
        .try_lock_shared()
        .map_err(|_| IndexError::LockConflict(config.index_dir.clone()))?;
    drop(write_lock);
    super::Index::open_with_lock(config, lock_file)
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    // With no paths to sample, calibration must fall back to the default.
    #[test]
    fn calibrate_threshold_empty_paths_returns_default() {
        let threshold = calibrate_threshold(&[]);
        assert_eq!(
            threshold, 0.10,
            "empty path list must return default threshold 0.10, got {threshold}"
        );
    }

    // With real files on disk, the calibrated value must land inside the
    // clamp range.
    #[test]
    fn calibrate_threshold_returns_clamped_value() {
        let repo = TempDir::new().unwrap();
        let absolute_paths: Vec<PathBuf> = (0..5)
            .map(|i| {
                let abs = repo.path().join(format!("f{i}.rs"));
                std::fs::write(&abs, format!("fn test_{i}() {{}}\n")).unwrap();
                abs
            })
            .collect();
        let threshold = calibrate_threshold(&absolute_paths);
        assert!(
            (0.01..=0.50).contains(&threshold),
            "threshold {threshold} outside [0.01, 0.50]"
        );
    }
}