// syntext 1.1.1 - Docs.rs

//! Ripgrep correctness oracle tests.
//!
//! These tests compare syntext search results against `rg` (ripgrep) for
//! the same patterns on the same fixture corpus. For compatibility with `rg`,
//! stable cases use exact equality over `(path, line_number, line_content)`.
//! A small number of documented cases remain subset checks where syntext's
//! semantics are intentionally narrower than rg's oracle semantics.
//!
//! # Running
//!
//! ```text
//! cargo test --test correctness
//! ```
//!
//! Requires `rg` on PATH. The oracle is pinned to `ripgrep 15.1.0`, and tests
//! are skipped (not failed) if `rg` is absent.
//!
//! # Test Pattern Set (T011)
//!
//! - Exact literal: `parse_query`
//! - Exact literal: `process_batch`
//! - Literal with punctuation: `parse_query(` (the `(` is part of the literal)
//! - Regex alternation: `parse_query|process_batch`
//! - Regex character class: `parse_quer[yi]`
//! - Case-insensitive literal: `ParseQuery` (matches parseQuery, PARSE_QUERY, parsequery, ...)
//! - No-match pattern: `xyzzy_no_match_sentinel_42`
//! - Unicode content: `café` (non-ASCII identifier)
//! - Optional prefix: `(foo)?bar` -- CORRECT: index emits Grams("bar") only,
//!   NOT And(Grams("foo"), Grams("bar")). foo is optional so requiring it
//!   would be a false negative. The verifier filters candidates.
//! - Indexed regex repetition: `(fn_parse_filter_query)+` -- the required
//!   repetition preserves grams from the inner literal, so this should narrow.
//! - Dot-star fallback: `parse.*batch` -- no extractable grams spanning the
//!   `.*`; query router must fall back to full scan. syntext must still match
//!   rg after verification.
//! - Path filter: `parse_query` restricted to `*.py` files only
//! - Gitignore: `parse_query` must NOT find `build/output.txt`

use std::collections::BTreeSet;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::{Mutex, OnceLock};

/// One search hit, the unit of comparison between `rg` and syntext.
///
/// The derived `Ord` compares fields in declaration order (path, then line
/// number, then content), which is the order `BTreeSet<OracleMatch>` iterates in.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
struct OracleMatch {
    /// File path as text. `parse_rg_output` strips the corpus prefix when the
    /// reported path starts with it; syntext paths are stored as returned.
    path: String,
    /// Line number as reported by the search tool.
    line_number: u32,
    /// Bytes of the matching line after `normalize_oracle_line_content`.
    line_content: Vec<u8>,
}

/// Drop a single trailing carriage return from a matched line, if present.
///
/// ripgrep's raw stdout keeps the `\r` of a CRLF-terminated line, whereas
/// syntext reports logical line content. Removing exactly one trailing CR
/// keeps exact comparisons from failing on line-ending noise; any further
/// `\r` bytes are treated as real content and left alone.
fn normalize_oracle_line_content(mut line_content: Vec<u8>) -> Vec<u8> {
    if line_content.ends_with(b"\r") {
        let without_cr = line_content.len() - 1;
        line_content.truncate(without_cr);
    }
    line_content
}

// ---------------------------------------------------------------------------
// Oracle helpers
// ---------------------------------------------------------------------------

/// Exact first line that `rg --version` must print for the oracle to be
/// trusted; compared verbatim by `assert_rg_version_pinned`.
const EXPECTED_RG_VERSION: &str = "ripgrep 15.1.0";

/// Run `rg` on the corpus and return exact match tuples.
///
/// `extra_flags` are passed through verbatim (e.g. `-F`, `-i`, `--glob=*.py`)
/// and are placed before the `--` separator so the pattern can never be
/// mistaken for a flag.
///
/// Returns an empty set if the pattern matches nothing (rg exit code 1).
/// Panics if `rg` cannot be spawned (tests skip via the `rg_available()`
/// guard before calling this) or if it exits with any other failure status.
fn rg_matches(corpus: &Path, pattern: &str, extra_flags: &[&str]) -> BTreeSet<OracleMatch> {
    let mut cmd = Command::new("rg");
    // `--no-config` keeps a developer's personal ripgrep configuration
    // (RIPGREP_CONFIG_PATH) from silently changing what the oracle reports.
    cmd.args(["--no-config", "--line-number", "--no-heading", "--color=never"]);

    // Extra flags must come before `--` and the positional arguments.
    cmd.args(extra_flags);

    // No flag is needed for ignore handling: rg applies `.gitignore` rules by
    // default. NOTE(review): rg only honors `.gitignore` inside a git work tree
    // unless `--no-require-git` is passed -- confirm the corpus is always
    // searched from within a checkout.
    cmd.arg("--").arg(pattern).arg(corpus);

    let output = cmd.output().expect("rg invocation failed");
    // rg exit code 1 = no matches (not an error)
    if !output.status.success() && output.status.code() != Some(1) {
        panic!(
            "rg failed with status {:?}: {}",
            output.status,
            String::from_utf8_lossy(&output.stderr)
        );
    }

    parse_rg_output(&output.stdout, corpus)
}

/// Parse `rg --line-number --no-heading` output into exact match tuples.
///
/// Each non-empty line is expected to look like `path:line_number:content`.
/// Lines that do not fit that shape (no separator, non-numeric line number)
/// are skipped. The corpus prefix is stripped from the path when present, and
/// the content is passed through `normalize_oracle_line_content`.
///
/// The search for the `:` that ends the path starts *after* the corpus prefix
/// whenever the line begins with it. Without that, a corpus path that itself
/// contains a colon (for example a Windows drive prefix such as `C:`) would be
/// split in the wrong place and every line would be dropped.
fn parse_rg_output(stdout: &[u8], corpus: &Path) -> BTreeSet<OracleMatch> {
    let corpus_text = corpus.to_string_lossy();
    let corpus_bytes = corpus_text.as_bytes();

    let mut out = BTreeSet::new();
    for line in stdout.split(|&b| b == b'\n') {
        if line.is_empty() {
            continue;
        }

        // Format: /abs/path/to/file.rs:42:matched content
        // Colons inside the corpus prefix are part of the path, not separators.
        let scan_from = if line.starts_with(corpus_bytes) {
            corpus_bytes.len()
        } else {
            0
        };
        let Some(path_end) = line[scan_from..]
            .iter()
            .position(|&b| b == b':')
            .map(|offset| scan_from + offset)
        else {
            continue;
        };
        let path_bytes = &line[..path_end];

        // Only split once more so colons inside the matched content survive.
        let mut fields = line[path_end + 1..].splitn(2, |&b| b == b':');
        let Some(line_number) = fields
            .next()
            .and_then(|digits| std::str::from_utf8(digits).ok())
            .and_then(|digits| digits.parse::<u32>().ok())
        else {
            continue;
        };
        let Some(content) = fields.next() else {
            continue;
        };

        let abs = PathBuf::from(String::from_utf8_lossy(path_bytes).into_owned());
        let rel = abs
            .strip_prefix(corpus)
            .unwrap_or(&abs)
            .to_string_lossy()
            .into_owned();
        out.insert(OracleMatch {
            path: rel,
            line_number,
            line_content: normalize_oracle_line_content(content.to_vec()),
        });
    }
    out
}

/// Report whether an `rg` executable can be spawned from PATH.
///
/// Only the ability to launch `rg --version` is checked; its exit status and
/// output are ignored here.
fn rg_available() -> bool {
    let mut probe = Command::new("rg");
    probe.arg("--version");
    matches!(probe.output(), Ok(_))
}

/// Fail the calling test unless the installed `rg` is the pinned release.
///
/// The oracle is deliberately tied to one ripgrep version so that a new
/// release with different search semantics cannot change expected results
/// unnoticed. Panics if `rg --version` cannot be run, exits unsuccessfully,
/// prints non-UTF-8 output, or its first line differs from
/// `EXPECTED_RG_VERSION`.
fn assert_rg_version_pinned() {
    let mut probe = Command::new("rg");
    probe.arg("--version");
    let output = probe.output().expect("rg --version failed");

    let status = output.status;
    assert!(
        status.success(),
        "rg --version failed with status {:?}",
        status
    );

    let stdout = String::from_utf8(output.stdout).expect("rg --version output is not UTF-8");
    // Only the first line carries the version; later lines list build features.
    let first_line = match stdout.lines().next() {
        Some(line) => line,
        None => "",
    };
    assert_eq!(
        first_line, EXPECTED_RG_VERSION,
        "correctness oracle pinned to {EXPECTED_RG_VERSION}, found {first_line}"
    );
}

/// Locate the fixture corpus under the crate's manifest directory.
///
/// Reads `CARGO_MANIFEST_DIR` at runtime and appends `tests/fixtures/corpus`.
/// Panics when the variable is unset, i.e. when not run through cargo.
fn corpus_path() -> PathBuf {
    let mut root = PathBuf::from(
        std::env::var("CARGO_MANIFEST_DIR")
            .expect("CARGO_MANIFEST_DIR not set; run via cargo test"),
    );
    root.push("tests/fixtures/corpus");
    root
}

/// Process-wide mutex that the tests in this file take before doing any work,
/// so that they run one at a time. Created lazily on first use.
fn correctness_build_lock() -> &'static Mutex<()> {
    static BUILD_LOCK: OnceLock<Mutex<()>> = OnceLock::new();
    BUILD_LOCK.get_or_init(Mutex::default)
}

// ---------------------------------------------------------------------------
// syntext helpers
// ---------------------------------------------------------------------------

use syntext::index::Index;
use syntext::{Config, SearchOptions};

/// Index `corpus` into a fresh temporary directory.
///
/// Returns the temporary directory together with the index handle; the
/// caller must keep the directory value alive for as long as the index is in
/// use, since dropping it removes the index files. Panics if the directory
/// cannot be created or the build fails.
fn build_test_index(corpus: &Path) -> (tempfile::TempDir, Index) {
    let scratch = tempfile::TempDir::new().expect("tempdir");
    let index_dir = scratch.path().to_path_buf();
    let repo_root = corpus.to_path_buf();
    let built = Index::build(Config {
        index_dir,
        repo_root,
        ..Config::default()
    })
    .expect("Index::build failed");
    (scratch, built)
}

/// Run a syntext search and return exact match tuples.
///
/// `path_glob` is translated into syntext's two filter options: a glob that
/// starts with `*.` becomes a `file_type` (the text after the `*.` prefix),
/// anything else is passed through as `path_filter`, and `None` sets neither.
/// Line content goes through `normalize_oracle_line_content` so results are
/// comparable with the rg side. Panics if the search itself fails.
fn syntext_matches(
    index: &Index,
    _corpus: &Path,
    pattern: &str,
    case_insensitive: bool,
    path_glob: Option<&str>,
) -> BTreeSet<OracleMatch> {
    // Decide once whether the glob is the simple "*.ext" form.
    let extension_glob = path_glob.filter(|glob| glob.starts_with("*."));
    let file_type = extension_glob.map(|glob| glob.trim_start_matches("*.").to_string());
    let path_filter = match extension_glob {
        Some(_) => None,
        None => path_glob.map(String::from),
    };

    let opts = SearchOptions {
        case_insensitive,
        path_filter,
        file_type,
        ..SearchOptions::default()
    };

    let mut found = BTreeSet::new();
    for hit in index.search(pattern, &opts).expect("search failed") {
        found.insert(OracleMatch {
            path: hit.path.to_string_lossy().into_owned(),
            line_number: hit.line_number,
            line_content: normalize_oracle_line_content(hit.line_content),
        });
    }
    found
}

/// Assert that every match tuple rg reported is also reported by syntext.
///
/// This is the weaker, one-directional check: extra syntext results are
/// tolerated, missing ones are not. Reserve it for cases where exact parity
/// with rg's CLI semantics is intentionally not expected, such as the `*.py`
/// path-filter test.
///
/// On failure the panic message reports how many tuples were missed, the
/// first missed tuple, and both result counts.
#[allow(dead_code)]
fn assert_no_false_negatives(
    corpus: &Path,
    rg_result: &BTreeSet<OracleMatch>,
    syntext_result: &BTreeSet<OracleMatch>,
    pattern: &str,
) {
    let missed: Vec<&OracleMatch> = rg_result
        .iter()
        .filter(|hit| !syntext_result.contains(*hit))
        .collect();
    if missed.is_empty() {
        return;
    }
    panic!(
        "syntext missed {} matches for pattern {:?} on corpus {:?}.\n\
         First missed: {:?}\n\
         rg found {} total, syntext found {}.",
        missed.len(),
        pattern,
        corpus,
        missed.first(),
        rg_result.len(),
        syntext_result.len(),
    );
}

/// Assert exact equality between rg and syntext results.
///
/// Both sets must contain the same `(path, line_number, line_content)`
/// tuples. `pattern` and `corpus` are used only to label the failure message;
/// `assert_eq!` additionally prints both sets (rg on the left, syntext on the
/// right).
// NOTE(review): this helper is called by tests in this file, so the
// `dead_code` allowance may no longer be needed -- confirm before removing.
#[allow(dead_code)]
fn assert_exact_match(
    corpus: &Path,
    rg_result: &BTreeSet<OracleMatch>,
    syntext_result: &BTreeSet<OracleMatch>,
    pattern: &str,
) {
    assert_eq!(
        rg_result, syntext_result,
        "syntext results differ from rg for pattern {:?} on corpus {:?}",
        pattern, corpus
    );
}

// ---------------------------------------------------------------------------
// T011: Test pattern set
//
// These tests define the correctness contract. They are skipped if rg is
// unavailable, and otherwise pin the oracle to a specific ripgrep version.
// ---------------------------------------------------------------------------

/// Exact literal: `parse_query` appears in 3+ files.
///
/// Skipped when `rg` is not on PATH; otherwise requires the pinned rg version
/// and exact parity between rg and syntext.
#[test]
fn literal_parse_query() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let rg_result = rg_matches(&corpus, "parse_query", &[]);
    // `rg_result` holds one entry per matching line, but the invariant is
    // about files, so count distinct paths.
    let file_count = rg_result
        .iter()
        .map(|m| m.path.as_str())
        .collect::<BTreeSet<_>>()
        .len();
    assert!(
        file_count >= 3,
        "fixture invariant: parse_query must appear in >=3 files, got {}",
        file_count
    );
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "parse_query", false, None);
    // Compatibility with rg: exact literal matches should agree on path, line,
    // and visible line content, not just candidate inclusion.
    assert_exact_match(&corpus, &rg_result, &syntext_result, "parse_query");
    drop(index);
}

/// Exact literal: `process_batch` appears in 2+ files.
///
/// Skipped when `rg` is not on PATH; otherwise requires the pinned rg version
/// and exact parity between rg and syntext.
#[test]
fn literal_process_batch() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let rg_result = rg_matches(&corpus, "process_batch", &[]);
    // `rg_result` holds one entry per matching line, but the invariant is
    // about files, so count distinct paths.
    let file_count = rg_result
        .iter()
        .map(|m| m.path.as_str())
        .collect::<BTreeSet<_>>()
        .len();
    assert!(
        file_count >= 2,
        "fixture invariant: process_batch must appear in >=2 files, got {}",
        file_count
    );
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "process_batch", false, None);
    // Compatibility with rg: exact literal matches should agree on path, line,
    // and visible line content, not just candidate inclusion.
    assert_exact_match(&corpus, &rg_result, &syntext_result, "process_batch");
    drop(index);
}

/// Literal with punctuation: `parse_query(` -- the `(` is part of the literal.
///
/// rg is run in fixed-string mode (`-F`), while syntext receives the
/// regex-escaped form `parse_query\(`; both must yield the same matches.
#[test]
fn literal_with_punctuation() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    // rg treats this as a fixed string with -F
    let rg_result = rg_matches(&corpus, "parse_query(", &["-F"]);
    assert!(
        !rg_result.is_empty(),
        "fixture invariant: parse_query( must appear in at least 1 file"
    );
    // parse_query( contains '(' which is a regex metacharacter, so syntext
    // treats it as regex. Use the escaped form for the regex engine.
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, r"parse_query\(", false, None);
    // Compatibility with rg: escaped punctuation should still produce the same
    // final match set and visible line content as rg's fixed-string mode here.
    assert_exact_match(&corpus, &rg_result, &syntext_result, "parse_query(");
    drop(index);
}

/// Regex alternation: `parse_query|process_batch`.
///
/// Skipped when `rg` is not on PATH; otherwise requires the pinned rg version
/// and exact parity between rg and syntext.
#[test]
fn regex_alternation() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let rg_result = rg_matches(&corpus, "parse_query|process_batch", &[]);
    assert!(
        !rg_result.is_empty(),
        "fixture invariant: alternation must match at least one file"
    );
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "parse_query|process_batch", false, None);
    // Compatibility with rg: this indexed regex is expected to produce exact
    // parity with rg on the fixture corpus.
    assert_exact_match(
        &corpus,
        &rg_result,
        &syntext_result,
        "parse_query|process_batch",
    );
    drop(index);
}

/// Regex character class: `parse_quer[yi]` (matches parse_query and parse_queri).
///
/// Skipped when `rg` is not on PATH; otherwise requires the pinned rg version
/// and exact parity between rg and syntext.
#[test]
fn regex_character_class() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let rg_result = rg_matches(&corpus, "parse_quer[yi]", &[]);
    assert!(
        !rg_result.is_empty(),
        "fixture invariant: character class must match at least 1 file"
    );
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "parse_quer[yi]", false, None);
    // Compatibility with rg: simple indexed character-class regexes should be exact.
    assert_exact_match(&corpus, &rg_result, &syntext_result, "parse_quer[yi]");
    drop(index);
}

/// Indexed regex repetition: `(fn_parse_filter_query)+`.
///
/// Skipped when `rg` is not on PATH; otherwise requires the pinned rg version
/// and exact parity between rg and syntext.
#[test]
fn indexed_regex_repetition() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let rg_result = rg_matches(&corpus, "(fn_parse_filter_query)+", &[]);
    assert!(
        !rg_result.is_empty(),
        "fixture invariant: repetition pattern must match at least 1 file"
    );
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "(fn_parse_filter_query)+", false, None);
    // Compatibility with rg: required repetition preserves literal grams, so
    // this remains an exact oracle check instead of only a subset check.
    assert_exact_match(
        &corpus,
        &rg_result,
        &syntext_result,
        "(fn_parse_filter_query)+",
    );
    drop(index);
}

/// Case-insensitive literal: `-i ParseQuery` matches parseQuery, PARSE_QUERY, etc.
///
/// Also sanity-checks the oracle: the case-insensitive rg run must find at
/// least as many matches as the case-sensitive one.
#[test]
fn case_insensitive_literal() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let rg_ci = rg_matches(&corpus, "ParseQuery", &["-i"]);
    let rg_cs = rg_matches(&corpus, "ParseQuery", &[]);
    assert!(
        rg_ci.len() >= rg_cs.len(),
        "case-insensitive must find at least as many matches as case-sensitive"
    );
    assert!(
        !rg_ci.is_empty(),
        "fixture invariant: case-insensitive ParseQuery must match at least 1 file"
    );
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "ParseQuery", true, None);
    // Compatibility with rg: case-folded literal search should agree exactly
    // on the final visible matches for this UTF-8 fixture corpus.
    assert_exact_match(
        &corpus,
        &rg_ci,
        &syntext_result,
        "ParseQuery (case-insensitive)",
    );
    drop(index);
}

/// No-match pattern: must return empty result set.
///
/// The sentinel must be absent from the fixtures according to rg, and syntext
/// must likewise return nothing for it.
#[test]
fn no_match_pattern() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let rg_result = rg_matches(&corpus, "xyzzy_no_match_sentinel_42", &[]);
    assert!(
        rg_result.is_empty(),
        "sentinel pattern must not appear in any fixture file"
    );
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result =
        syntext_matches(&index, &corpus, "xyzzy_no_match_sentinel_42", false, None);
    assert!(
        syntext_result.is_empty(),
        "syntext must return empty for no-match sentinel, got {:?}",
        syntext_result
    );
    drop(index);
}

/// Unicode content: `café` (non-ASCII in a Python identifier).
///
/// Skipped when `rg` is not on PATH; otherwise requires the pinned rg version
/// and exact parity between rg and syntext.
#[test]
fn unicode_identifier() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let rg_result = rg_matches(&corpus, "café", &[]);
    assert!(
        !rg_result.is_empty(),
        "fixture invariant: unicode_identifiers.py must contain 'café'"
    );
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "café", false, None);
    // Compatibility with rg: UTF-8 literal search should be exact on supported text files.
    assert_exact_match(&corpus, &rg_result, &syntext_result, "café");
    drop(index);
}

/// Optional prefix pattern: `(foo)?bar`
///
/// IMPORTANT: syntext's HIR walker correctly extracts `Grams("bar")` only,
/// NOT `And(Grams("foo"), Grams("bar"))`.
///
/// Rationale: `foo` is optional. Requiring it in the gram query would
/// produce false negatives for inputs like "bazbar" or "quxbar". The index
/// must return all candidates containing "bar"; the verifier then confirms
/// that each candidate actually matches `(foo)?bar` as a regex.
///
/// Do NOT "fix" this to also require `foo`. That would be a correctness bug.
/// This test explicitly verifies that behavior does not regress.
#[test]
fn optional_prefix_pattern() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    // rg with regex `(foo)?bar` finds all lines containing `bar` or `foobar`
    let rg_result = rg_matches(&corpus, "(foo)?bar", &[]);

    // Oracle sanity: any line matching the narrower patterns contains "bar"
    // and therefore must also be reported for `(foo)?bar`. These runs were
    // previously computed and discarded; asserting on them makes them count.
    // NOTE(review): whether the fixtures contain a bare `bar` (so that the
    // optional branch really fires) is not asserted here -- confirm against
    // the corpus before adding a non-empty check.
    let bar_only = rg_matches(&corpus, r"\bbar\b", &[]);
    let foobar = rg_matches(&corpus, "foobar", &[]);
    assert!(
        bar_only.is_subset(&rg_result),
        "oracle sanity: every line matching \\bbar\\b must also match (foo)?bar"
    );
    assert!(
        foobar.is_subset(&rg_result),
        "oracle sanity: every line matching foobar must also match (foo)?bar"
    );

    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "(foo)?bar", false, None);
    // Compatibility with rg: optional prefixes are the key correctness trap
    // here, so this should match rg exactly after verification.
    assert_exact_match(&corpus, &rg_result, &syntext_result, "(foo)?bar");
    drop(index);
}

/// `parse.*batch`: the `.*` contributes All which simplifies away, leaving
/// And(Grams("parse"), Grams("batch")) as an indexed regex query.
/// syntext must still match rg exactly after verification.
///
/// NOTE(review): the test name and the module-level pattern list describe a
/// full-scan fallback for this pattern, while the paragraph above describes
/// an indexed query -- confirm which routing is current. The assertion below
/// only checks the final result set, so it holds under either routing.
#[test]
fn dot_star_fallback_to_scan() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let rg_result = rg_matches(&corpus, "parse.*batch", &[]);
    assert!(
        !rg_result.is_empty(),
        "fixture invariant: parse.*batch must match at least 1 file \
         (long_line.txt has 'parse_query' and 'process_batch' on the same line)"
    );
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "parse.*batch", false, None);
    // Compatibility with rg: even when routing falls back to a broader scan,
    // the final verified results should still be exact.
    assert_exact_match(&corpus, &rg_result, &syntext_result, "parse.*batch");
    drop(index);
}

/// Path filter: `parse_query` restricted to `*.py` files.
///
/// Must only return Python files, not Rust, Go, etc. The rg side uses
/// `--glob=*.py`; the syntext side is checked for inclusion of every rg
/// result rather than for exact equality.
#[test]
fn path_filter_py_only() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let rg_all = rg_matches(&corpus, "parse_query", &[]);
    let rg_py = rg_matches(&corpus, "parse_query", &["--glob=*.py"]);

    assert!(
        rg_py.len() < rg_all.len(),
        "py filter must return fewer results than unfiltered"
    );
    // All py results must have .py extension
    for m in &rg_py {
        assert!(
            m.path.ends_with(".py"),
            "path filter returned non-.py file: {}",
            m.path
        );
    }
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "parse_query", false, Some("*.py"));
    // Compatibility with rg: keep this as an inclusion check because syntext's
    // file_type/path_filter split is intentionally narrower than rg's glob engine.
    for m in &syntext_result {
        assert!(
            m.path.ends_with(".py"),
            "syntext path filter returned non-.py file: {}",
            m.path
        );
    }
    assert_no_false_negatives(
        &corpus,
        &rg_py,
        &syntext_result,
        "parse_query (*.py filter)",
    );
    drop(index);
}

/// Gitignore: `build/output.txt` must not appear in results.
///
/// The corpus `.gitignore` ignores `build/`. The indexer must respect it.
/// Both the rg oracle and syntext are checked for paths under `build/`.
#[test]
fn gitignore_excludes_build_dir() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    // rg respects .gitignore by default; build/ should be excluded
    let rg_result = rg_matches(&corpus, "parse_query", &[]);
    for m in &rg_result {
        assert!(
            !m.path.starts_with("build/") && !m.path.contains("/build/"),
            "rg returned gitignored file: {}",
            m.path
        );
    }
    let (_tmp, index) = build_test_index(&corpus);
    let syntext_result = syntext_matches(&index, &corpus, "parse_query", false, None);
    for m in &syntext_result {
        assert!(
            !m.path.starts_with("build/") && !m.path.contains("/build/"),
            "syntext returned gitignored file: {}",
            m.path
        );
    }
    drop(index);
}

/// Size guard: a file that grew beyond max_file_size after indexing must be
/// skipped entirely rather than verified against truncated content.
///
/// Policy: skip oversized files. Searching only the first max_file_size bytes
/// would silently miss matches in the remaining content (false negatives), which
/// is unacceptable for use cases such as secret scanning. Matches that were in
/// the original indexed content are an acceptable sacrifice for correctness;
/// the caller should re-index or increase max_file_size.
///
/// This test does not use the rg oracle, so it runs even without `rg`.
#[test]
fn oversized_file_after_growth_returns_no_results() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    let repo = tempfile::TempDir::new().unwrap();
    let index_dir = tempfile::TempDir::new().unwrap();

    let file = repo.path().join("big.rs");
    std::fs::write(&file, "fn unique_sentinel_xyzzy() {}\n").unwrap();

    let config = Config {
        index_dir: index_dir.path().to_path_buf(),
        repo_root: repo.path().to_path_buf(),
        max_file_size: 1024,
        ..Config::default()
    };
    let index = Index::build(config.clone()).unwrap();

    // Baseline: found before growth.
    let opts = SearchOptions::default();
    let results = index.search("unique_sentinel_xyzzy", &opts).unwrap();
    assert_eq!(
        results.len(),
        1,
        "baseline: must find match in original file"
    );
    drop(index);

    // Bloat the file past max_file_size; the sentinel is still near the start
    // but the file is now oversized so the resolver must skip it entirely.
    let mut bloated = String::from("fn unique_sentinel_xyzzy() {}\n");
    bloated.push_str(&"x".repeat(2048));
    std::fs::write(&file, &bloated).unwrap();

    // Re-open index (fresh snapshot); oversized file must be skipped.
    let index2 = Index::open(config).unwrap();
    let results2 = index2.search("unique_sentinel_xyzzy", &opts).unwrap();
    assert_eq!(
        results2.len(),
        0,
        "oversized file must be skipped entirely, not verified against truncated content"
    );
    drop(index2);
}

/// Symlink escape: a symlink replacing an indexed file must not leak content
/// from outside the repo root.
///
/// Unix-only because it relies on `std::os::unix::fs::symlink`. This test
/// does not use the rg oracle, so it runs even without `rg`.
#[cfg(unix)]
#[test]
fn search_does_not_follow_symlink_outside_repo() {
    use std::os::unix::fs::symlink;

    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    let repo = tempfile::TempDir::new().unwrap();
    let index_dir = tempfile::TempDir::new().unwrap();
    let outside = tempfile::TempDir::new().unwrap();

    // Create a real file in the repo and index it.
    let real_file = repo.path().join("data.rs");
    std::fs::write(&real_file, "fn real_content() {}\n").unwrap();

    let config = Config {
        index_dir: index_dir.path().to_path_buf(),
        repo_root: repo.path().to_path_buf(),
        ..Config::default()
    };
    // Build index, then drop to release exclusive lock before re-opening.
    drop(Index::build(config.clone()).unwrap());

    // Create a secret file outside the repo. Its content is identical to the
    // indexed file so that a search following the link would produce a hit.
    let secret = outside.path().join("secret.txt");
    std::fs::write(&secret, "fn real_content() {}\n").unwrap();

    // Replace the indexed file with a symlink to the secret.
    std::fs::remove_file(&real_file).unwrap();
    symlink(&secret, &real_file).unwrap();

    // Re-open index. The segment still references "data.rs" but the
    // file is now a symlink escaping the repo root.
    let index2 = Index::open(config).unwrap();
    let opts = SearchOptions::default();
    let results = index2.search("real_content", &opts).unwrap();

    // Should NOT return results from the symlinked file.
    assert!(
        results.is_empty(),
        "search should not follow symlinks that escape repo root"
    );
}

/// Verify the rg oracle itself produces consistent results across two runs.
///
/// This is a meta-test: if rg is non-deterministic on this corpus, the
/// oracle harness is unreliable and we need to investigate.
#[test]
fn oracle_is_deterministic() {
    // A test that fails while holding this lock poisons it. The guarded value
    // is `()`, so recover the guard instead of failing with a PoisonError that
    // would hide this test's own result.
    let _guard = correctness_build_lock()
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    if !rg_available() {
        eprintln!("SKIP: rg not on PATH");
        return;
    }
    assert_rg_version_pinned();
    let corpus = corpus_path();
    let run1 = rg_matches(&corpus, "parse_query", &[]);
    let run2 = rg_matches(&corpus, "parse_query", &[]);
    assert_eq!(
        run1, run2,
        "rg produced different results on two consecutive runs"
    );
}