haz-query 0.1.0

Query evaluator over haz task DAGs.
Documentation
//! Sound glob-against-glob intersection emptiness check.
//!
//! Per `QRY-003`, the `--inputs` and `--outputs` filters require
//! a non-empty intersection between two path patterns. The
//! literal arms (literal/literal byte-equal, literal/glob via
//! `globset::Glob`) are decidable in constant or near-constant
//! time. The remaining glob/glob arm is the regular-language
//! emptiness check on the product of two regular languages,
//! which the spec calls out as decidable and which this module
//! implements.
//!
//! The approach: translate each glob to its anchored regex form
//! (via [`globset::Glob::regex`]), build a byte-level
//! [`regex_automata::dfa::dense::DFA`] for each side, and run a
//! breadth-first traversal of the product state space. A pair
//! of states `(s1, s2)` accepts iff both DFAs reach a match
//! state under the end-of-input transition; if any such pair is
//! reachable from the start pair, the intersection is non-empty.
//!
//! The traversal visits each product state at most once, so its
//! cost is bounded by `|S1| x |S2|` for the two glob-derived
//! DFAs. For typical user-written patterns the state counts
//! stay small.

use std::collections::{HashSet, VecDeque};

use regex_automata::dfa::Automaton;
use regex_automata::dfa::dense;
use regex_automata::util::primitives::StateID;
use regex_automata::util::syntax;
use regex_automata::{Anchored, Input};
use snafu::Snafu;

/// Failure encountered while testing two globs for intersection.
///
/// Both variants reflect internal regex-automata limits; for any
/// glob pattern accepted by `globset` 0.4 the DFA build succeeds.
/// We surface the failure rather than panic so that pathological
/// patterns degrade gracefully into a typed CLI error.
#[derive(Debug, Clone, PartialEq, Eq, Snafu)]
pub enum GlobIntersectError {
    /// The regex-automata DFA builder rejected one of the
    /// glob-derived regex strings.
    #[snafu(display("could not build DFA for glob '{glob}': {message}"))]
    BuildDfa {
        /// The offending glob's canonical text.
        glob: String,
        /// A stringified `regex_automata::dfa::dense::BuildError`.
        /// Kept as a `String` because the typed error is large
        /// (152 bytes) and inflates the enclosing `Result`.
        message: String,
    },

    /// The DFA's start-state lookup failed (rare; would imply
    /// regex-automata cannot resolve the anchored start for one
    /// of the input patterns).
    #[snafu(display("could not derive start state for glob '{glob}': {message}"))]
    StartState {
        /// The offending glob's canonical text.
        glob: String,
        /// A human-readable reason from the underlying lookup.
        message: String,
    },
}

/// Returns `true` iff there exists at least one path that
/// matches both `lhs` and `rhs`.
///
/// The check is byte-level and tracks the product of two
/// anchored DFAs; it agrees with `globset`'s own matcher on
/// every individual side, and the breadth-first traversal is
/// sound (no false positives, no false negatives) over the
/// regular languages encoded by the glob patterns.
///
/// # Errors
///
/// Returns [`GlobIntersectError`] when the DFA construction or
/// anchored-start lookup fails for one of the patterns. These
/// failures are not expected for well-formed `globset` patterns
/// in practice; they MAY occur if a pattern exceeds
/// regex-automata's default state-count budget.
pub fn glob_intersect_non_empty(
    lhs: &globset::Glob,
    rhs: &globset::Glob,
) -> Result<bool, GlobIntersectError> {
    let dfa_lhs = build_dfa(lhs)?;
    let dfa_rhs = build_dfa(rhs)?;
    bfs_product(&dfa_lhs, &dfa_rhs, lhs, rhs)
}

fn build_dfa(glob: &globset::Glob) -> Result<dense::DFA<Vec<u32>>, GlobIntersectError> {
    // globset emits byte-level regexes (with the `(?-u)` flag)
    // whose `.*` can match non-UTF-8 bytes. The DFA syntax
    // translator rejects that unless `utf8(false)` is set on
    // the syntax config.
    dense::Builder::new()
        .syntax(syntax::Config::new().utf8(false))
        .build(glob.regex())
        .map_err(|e| GlobIntersectError::BuildDfa {
            glob: glob.glob().to_owned(),
            message: format!("{e:?}"),
        })
}

fn bfs_product(
    lhs: &dense::DFA<Vec<u32>>,
    rhs: &dense::DFA<Vec<u32>>,
    lhs_glob: &globset::Glob,
    rhs_glob: &globset::Glob,
) -> Result<bool, GlobIntersectError> {
    let empty_input = Input::new(b"").anchored(Anchored::Yes);

    let start_lhs =
        lhs.start_state_forward(&empty_input)
            .map_err(|e| GlobIntersectError::StartState {
                glob: lhs_glob.glob().to_owned(),
                message: e.to_string(),
            })?;
    let start_rhs =
        rhs.start_state_forward(&empty_input)
            .map_err(|e| GlobIntersectError::StartState {
                glob: rhs_glob.glob().to_owned(),
                message: e.to_string(),
            })?;

    let mut visited: HashSet<(StateID, StateID)> = HashSet::new();
    let mut queue: VecDeque<(StateID, StateID)> = VecDeque::new();
    visited.insert((start_lhs, start_rhs));
    queue.push_back((start_lhs, start_rhs));

    while let Some((cur_lhs, cur_rhs)) = queue.pop_front() {
        if accepts_at_eoi(lhs, cur_lhs) && accepts_at_eoi(rhs, cur_rhs) {
            return Ok(true);
        }

        if lhs.is_dead_state(cur_lhs) || rhs.is_dead_state(cur_rhs) {
            continue;
        }

        for byte in 0u8..=255u8 {
            let next_lhs = lhs.next_state(cur_lhs, byte);
            let next_rhs = rhs.next_state(cur_rhs, byte);
            if lhs.is_dead_state(next_lhs) || rhs.is_dead_state(next_rhs) {
                continue;
            }
            let pair = (next_lhs, next_rhs);
            if visited.insert(pair) {
                queue.push_back(pair);
            }
        }
    }

    Ok(false)
}

fn accepts_at_eoi(dfa: &dense::DFA<Vec<u32>>, state: StateID) -> bool {
    if dfa.is_match_state(state) {
        return true;
    }
    let after_eoi = dfa.next_eoi_state(state);
    dfa.is_match_state(after_eoi)
}

#[cfg(test)]
mod tests {
    use super::*;

    fn glob(pattern: &str) -> globset::Glob {
        globset::GlobBuilder::new(pattern)
            .literal_separator(true)
            .case_insensitive(false)
            .build()
            .expect("test pattern must compile")
    }

    fn intersects(a: &str, b: &str) -> bool {
        glob_intersect_non_empty(&glob(a), &glob(b)).expect("DFA build must succeed")
    }

    #[test]
    fn qry_003_identical_globs_intersect() {
        assert!(intersects("src/**/*.rs", "src/**/*.rs"));
    }

    #[test]
    fn qry_003_overlapping_globs_intersect() {
        // Any path matching `src/lib/*.rs` also matches
        // `src/**/*.rs` (e.g. `src/lib/foo.rs`).
        assert!(intersects("src/**/*.rs", "src/lib/*.rs"));
    }

    #[test]
    fn qry_003_overlapping_globs_intersect_other_direction() {
        assert!(intersects("src/lib/*.rs", "src/**/*.rs"));
    }

    #[test]
    fn qry_003_disjoint_glob_prefixes_do_not_intersect() {
        // No path can begin with both `src/` and `lib/`.
        assert!(!intersects("src/**/*.rs", "lib/**/*.rs"));
    }

    #[test]
    fn qry_003_disjoint_extensions_do_not_intersect() {
        assert!(!intersects("**/*.rs", "**/*.js"));
    }

    #[test]
    fn qry_003_char_class_intersection_is_decidable() {
        // `src/[abc].rs` and `src/b.rs` overlap on `b`.
        assert!(intersects("src/[abc].rs", "src/[bd].rs"));
    }

    #[test]
    fn qry_003_disjoint_char_classes_do_not_intersect() {
        assert!(!intersects("src/[ab].rs", "src/[cd].rs"));
    }

    #[test]
    fn qry_003_question_mark_overlap_decidable() {
        // `src/?.rs` matches single-char names; `src/a.rs`
        // matches one specific single-char name; they
        // intersect on `src/a.rs`.
        assert!(intersects("src/?.rs", "src/*.rs"));
    }

    #[test]
    fn qry_003_double_star_absorbs_single_star_prefix() {
        assert!(intersects("src/**/*.rs", "src/*/*.rs"));
    }

    #[test]
    fn qry_003_double_star_does_not_overflow_into_disjoint_root() {
        assert!(!intersects("src/**/*.rs", "lib/**/*.rs"));
    }

    #[test]
    fn qry_003_alternation_overlap_decidable() {
        // `src/{a,b}.rs` overlaps `src/{b,c}.rs` on `src/b.rs`.
        assert!(intersects("src/{a,b}.rs", "src/{b,c}.rs"));
    }

    #[test]
    fn qry_003_alternation_with_no_overlap_is_disjoint() {
        assert!(!intersects("src/{a,b}.rs", "src/{c,d}.rs"));
    }

    #[test]
    fn qry_003_workspace_absolute_vs_workspace_absolute() {
        assert!(intersects("/lib_core/src/**/*.rs", "/lib_core/src/foo.rs",));
    }

    #[test]
    fn qry_003_workspace_absolute_disjoint_roots() {
        assert!(!intersects("/lib_core/**/*.rs", "/web/**/*.rs"));
    }

    #[test]
    fn qry_003_literal_glob_vs_literal_glob_byte_equal() {
        // No wildcards on either side; the "globs" are
        // literals. Equal literals intersect.
        assert!(intersects("src/main.rs", "src/main.rs"));
    }

    #[test]
    fn qry_003_literal_glob_vs_literal_glob_byte_inequal() {
        assert!(!intersects("src/main.rs", "src/lib.rs"));
    }
}