// syntext 1.1.1
//
// Hybrid code search index for agent workflows
// Documentation
use std::collections::HashMap;
use std::path::{Path, PathBuf};

use roaring::RoaringBitmap;
use tempfile::TempDir;

use super::*;
use crate::index::overlay::OverlayView;
use crate::index::segment::MmapSegment;
use crate::index::snapshot::{new_snapshot, BaseSegments};
use crate::path::PathIndex;

/// Builds a snapshot fixture backed by real segment files in a temporary directory.
///
/// `segments` holds one `Vec` per segment to write; every tuple is
/// `(doc_id, path, size_bytes)` and is forwarded to `SegmentWriter::add_document`
/// (the third argument passed there is simply `doc_id as u64`). While writing and
/// re-opening each segment, the helper also assembles the bookkeeping handed to
/// `new_snapshot`:
///
/// * the first doc id of every segment (or the running doc count for an empty one),
/// * a doc-id -> path table and a path -> doc-ids map,
/// * a `PathIndex` over the sorted, de-duplicated paths,
/// * a doc-id -> file-id table whose unmapped slots stay `u32::MAX`.
///
/// Returns the `TempDir` holding the segment files (returned so the files outlive
/// this call; `TempDir` removes its directory on drop), the snapshot, and one
/// `SegmentRef` per written segment, in input order.
///
/// # Panics
///
/// Panics if the temporary directory cannot be created or a segment cannot be
/// written or re-opened.
fn build_snapshot(
    segments: &[Vec<(u32, &'static str, u64)>],
    overlay: OverlayView,
    delete_set: RoaringBitmap,
) -> (TempDir, IndexSnapshot, Vec<SegmentRef>) {
    let dir = TempDir::new().unwrap();

    let mut opened_segments = Vec::new();
    let mut written_refs = Vec::new();
    let mut first_doc_ids = Vec::new();
    let mut paths_by_doc: Vec<Option<PathBuf>> = Vec::new();
    let mut docs_by_path: HashMap<PathBuf, Vec<u32>> = HashMap::new();
    let mut seen_paths = Vec::new();
    let mut doc_count = 0u32;

    for (seg_idx, docs) in segments.iter().enumerate() {
        let mut writer = SegmentWriter::new();

        // An empty segment takes the doc count accumulated so far as its base id.
        first_doc_ids.push(match docs.first() {
            Some(first) => first.0,
            None => doc_count,
        });

        for &(doc_id, path, size_bytes) in docs {
            let slot = doc_id as usize;
            let owned_path = PathBuf::from(path);

            writer.add_document(doc_id, Path::new(path), doc_id as u64, size_bytes);

            // Doc ids may be sparse, so grow the table on demand.
            if slot >= paths_by_doc.len() {
                paths_by_doc.resize(slot + 1, None);
            }
            paths_by_doc[slot] = Some(owned_path.clone());
            docs_by_path
                .entry(owned_path.clone())
                .or_default()
                .push(doc_id);
            seen_paths.push(owned_path);
            doc_count = doc_count.max(doc_id.saturating_add(1));
        }

        let meta = match writer.write_to_dir(dir.path()) {
            Ok(meta) => meta,
            Err(_) => panic!("failed to write segment {seg_idx}"),
        };
        let dict_path = dir.path().join(&meta.dict_filename);
        let post_path = dir.path().join(&meta.post_filename);

        let seg_ref: SegmentRef = meta.clone().into();
        written_refs.push(seg_ref);
        opened_segments.push(MmapSegment::open_split(&dict_path, &post_path).unwrap());
    }

    // `PathIndex::build` is given each path exactly once, in sorted order.
    seen_paths.sort_unstable();
    seen_paths.dedup();
    let path_index = PathIndex::build(&seen_paths);

    // Slots without a known path or file id keep the `u32::MAX` placeholder.
    let mut base_doc_to_file_id = vec![u32::MAX; doc_count as usize];
    for (global_doc_id, maybe_path) in paths_by_doc.iter().enumerate() {
        let resolved = maybe_path
            .as_ref()
            .and_then(|doc_path| path_index.file_id(doc_path));
        if let Some(file_id) = resolved {
            base_doc_to_file_id[global_doc_id] = file_id;
        }
    }

    let base = BaseSegments {
        segments: opened_segments,
        base_ids: first_doc_ids,
        base_doc_paths: paths_by_doc,
        path_doc_ids: docs_by_path,
    };
    // NOTE(review): the trailing `HashMap::new()` and `0.10` are passed through
    // unchanged; 0.10 is presumably the overlay-ratio threshold — confirm against
    // `new_snapshot`.
    let snapshot = new_snapshot(
        Arc::new(base),
        overlay,
        delete_set,
        path_index,
        Arc::new(base_doc_to_file_id),
        HashMap::new(),
        0.10,
    );

    (dir, snapshot, written_refs)
}

/// Three single-document segments with `max_segments: 2` must yield a
/// `SegmentLimit` plan that starts at segment 1, targets one segment, and
/// reports a batch size of 900_000_000 bytes (the combined size of `b.rs` and
/// `c.rs`).
#[test]
fn plan_uses_segment_limit_and_snapshot_sizes() {
    let layout: [Vec<(u32, &'static str, u64)>; 3] = [
        vec![(0, "a.rs", 300_000_000)],
        vec![(1, "b.rs", 400_000_000)],
        vec![(2, "c.rs", 500_000_000)],
    ];
    // `_dir` stays bound so the segment files live for the whole test.
    let (_dir, snapshot, _seg_refs) =
        build_snapshot(&layout, OverlayView::empty(), RoaringBitmap::new());
    let config = Config {
        max_segments: 2,
        ..Config::default()
    };

    let chosen = plan(&snapshot, &config).unwrap();

    assert_eq!(chosen.reason, CompactionReason::SegmentLimit);
    assert_eq!(chosen.suffix_start, 1);
    assert_eq!(chosen.target_segments, 1);
    assert_eq!(chosen.batch_size_bytes, 900_000_000);
}

/// With doc 0 in the delete set and `max_segments: 1`, the plan must start at
/// segment 0 and size the batch at 500_000_000 bytes, i.e. only the size of the
/// surviving `b.rs`.
#[test]
fn plan_ignores_deleted_base_docs_when_sizing() {
    let layout: [Vec<(u32, &'static str, u64)>; 2] = [
        vec![(0, "a.rs", 300_000_000)],
        vec![(1, "b.rs", 500_000_000)],
    ];
    // Mark the only document of the first segment as deleted.
    let mut deleted_docs = RoaringBitmap::new();
    deleted_docs.insert(0);

    let (_dir, snapshot, _seg_refs) = build_snapshot(&layout, OverlayView::empty(), deleted_docs);
    let config = Config {
        max_segments: 1,
        ..Config::default()
    };

    let chosen = plan(&snapshot, &config).unwrap();

    assert_eq!(chosen.reason, CompactionReason::SegmentLimit);
    assert_eq!(chosen.suffix_start, 0);
    assert_eq!(chosen.batch_size_bytes, 500_000_000);
}

/// Ten single-document base segments plus a two-file overlay, with
/// `max_segments: 20`, must produce an `OverlayRatio` plan: `suffix_start` is 10,
/// one target segment, and the batch size equals `build::BATCH_SIZE_BYTES`.
#[test]
fn plan_prioritizes_overlay_ratio_trigger() {
    // One path per base segment; the position in the table is the doc id.
    const BASE_PATHS: [&str; 10] = [
        "base_0.rs",
        "base_1.rs",
        "base_2.rs",
        "base_3.rs",
        "base_4.rs",
        "base_5.rs",
        "base_6.rs",
        "base_7.rs",
        "base_8.rs",
        "base_9.rs",
    ];

    let dirty_files = vec![
        (
            PathBuf::from("dirty_1.rs"),
            Arc::from(&b"fn dirty_1() {}\n"[..]),
        ),
        (
            PathBuf::from("dirty_2.rs"),
            Arc::from(&b"fn dirty_2() {}\n"[..]),
        ),
    ];
    // NOTE(review): the leading 10 presumably is the first overlay doc id (the
    // base docs use ids 0..=9) — confirm against `OverlayView::build`.
    let overlay = OverlayView::build(10, dirty_files).unwrap();

    // Every base segment holds exactly one 10-byte document.
    let layout: Vec<Vec<(u32, &'static str, u64)>> = (0u32..)
        .zip(BASE_PATHS)
        .map(|(doc_id, path)| vec![(doc_id, path, 10)])
        .collect();
    let (_dir, snapshot, _seg_refs) = build_snapshot(&layout, overlay, RoaringBitmap::new());

    // 20 is above the ten segments built here, so the segment limit is
    // presumably not what triggers the plan.
    let config = Config {
        max_segments: 20,
        ..Config::default()
    };

    let chosen = plan(&snapshot, &config).unwrap();

    assert_eq!(chosen.reason, CompactionReason::OverlayRatio);
    assert_eq!(chosen.suffix_start, 10);
    assert_eq!(chosen.target_segments, 1);
    assert_eq!(
        chosen.batch_size_bytes,
        super::super::build::BATCH_SIZE_BYTES
    );
}

/// With doc 1 (the middle segment's only document) deleted and `max_segments: 10`,
/// `forced_plan` must return an `ExplicitRequest` plan that starts at segment 1
/// and targets a single segment.
#[test]
fn forced_plan_rewrites_from_earliest_deleted_segment() {
    let layout: [Vec<(u32, &'static str, u64)>; 3] = [
        vec![(0, "a.rs", 10)],
        vec![(1, "b.rs", 10)],
        vec![(2, "c.rs", 10)],
    ];
    // Only the document in the middle segment is deleted.
    let mut deleted_docs = RoaringBitmap::new();
    deleted_docs.insert(1);

    let (_dir, snapshot, _seg_refs) = build_snapshot(&layout, OverlayView::empty(), deleted_docs);
    let config = Config {
        max_segments: 10,
        ..Config::default()
    };

    let forced = forced_plan(&snapshot, &config).unwrap();

    assert_eq!(forced.reason, CompactionReason::ExplicitRequest);
    assert_eq!(forced.suffix_start, 1);
    assert_eq!(forced.target_segments, 1);
}

/// Saves a manifest whose first segment claims `base_doc_id = Some(7)` while the
/// snapshot was built with base id 0, then checks that `compact_index` fails with
/// `IndexError::CorruptIndex` carrying the expected divergence message.
#[test]
fn compact_rejects_snapshot_manifest_base_id_divergence() {
    let repo = TempDir::new().unwrap();
    let layout: [Vec<(u32, &'static str, u64)>; 1] = [vec![(0, "a.rs", 10)]];
    let (index_dir, snapshot, mut seg_refs) =
        build_snapshot(&layout, OverlayView::empty(), RoaringBitmap::new());

    // Make the on-disk manifest disagree with the snapshot's base id.
    seg_refs[0].base_doc_id = Some(7);
    let divergent_manifest = Manifest::new(seg_refs, 1);
    divergent_manifest.save(index_dir.path()).unwrap();

    let config = Config {
        index_dir: index_dir.path().to_path_buf(),
        repo_root: repo.path().to_path_buf(),
        ..Config::default()
    };
    let requested = CompactionPlan {
        reason: CompactionReason::ExplicitRequest,
        suffix_start: 0,
        batch_size_bytes: 1,
        target_segments: 1,
    };

    // `compact_index` takes the writer lock by value, so acquire it first.
    let write_lock = crate::index::helpers::acquire_writer_lock(&config.index_dir)
        .expect("acquire write lock for test");

    let err = match compact_index(config, Arc::new(snapshot), requested, write_lock) {
        Ok(_) => panic!("expected CorruptIndex for base-id divergence, got Ok(_)"),
        Err(IndexError::CorruptIndex(msg)) => msg,
        Err(other) => panic!("expected CorruptIndex for base-id divergence, got {other}"),
    };

    let reports_divergence =
        err.contains("snapshot base_id[0]=0 diverges from manifest base[0]=7");
    assert!(reports_divergence, "unexpected error: {err}");
}