ccd-cli 1.0.0-beta.4

Bootstrap and validate Continuous Context Development repositories
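/// File extensions that mark a prose or `key_files` token as a
/// plausible repo file reference.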
pub(crate) const SURFACED_REFERENCE_EXTENSIONS: &[&str] = &[
    ".rs", ".md", ".toml", ".json", ".sh", ".yml", ".yaml", ".py", ".ts", ".tsx", ".go", ".txt",
    ".sql", ".rb", ".java", ".c", ".cpp", ".h", ".hpp", ".html", ".css", ".lock",
];

// NOTE: `CHANGELOG` was removed from this list (see ccd#585). Prose
// references to "the changelog" are almost always references to
// `CHANGELOG.md`, not to a bare extensionless file, and the bare form
// produced high-volume false positives at session start.
pub(crate) const SURFACED_EXTENSIONLESS_FILENAMES: &[&str] = &[
    "Dockerfile",
    "Makefile",
    "LICENSE",
    "COPYING",
    "NOTICE",
    "README",
    "CONTRIBUTING",
    "AUTHORS",
    "MAINTAINERS",
    "CODEOWNERS",
    "Gemfile",
    "Rakefile",
    "Procfile",
];

/// Repo path prefixes that authors use in prose to name an in-repo
/// location (`src/commands/host.rs`, `.github/workflows/...`). Used by
/// the strict unquoted-prose detector: unquoted slashed tokens must
/// begin with one of these (or an absolute / relative anchor) to be
/// treated as a path candidate.
const REPO_PATH_PREFIXES: &[&str] = &[
    "src/",
    "docs/",
    "tests/",
    "test/",
    "scripts/",
    "skills/",
    "specs/",
    "templates/",
    "examples/",
    "crates/",
    "benches/",
    "assets/",
    "migrations/",
    "lib/",
    "bin/",
    "cmd/",
    "pkg/",
    "internal/",
    "app/",
    "config/",
    ".github/",
    ".claude/",
    ".gemini/",
    ".agents/",
    ".ccd/",
    ".ccd-hosts/",
];

fn has_anchor_or_repo_prefix(token: &str) -> bool {
    if token.starts_with('/') || token.starts_with("./") || token.starts_with("../") {
        return true;
    }
    REPO_PATH_PREFIXES
        .iter()
        .any(|prefix| token.starts_with(prefix))
}

fn ends_with_known_extension(token: &str) -> bool {
    SURFACED_REFERENCE_EXTENSIONS
        .iter()
        .any(|ext| token.ends_with(ext))
}

/// Reject bare extension tokens (`.rs`, `.md`) that carry no filename
/// stem. Such tokens drop out of prose like "touch .rs files" and
/// were previously flagged as missing references.
fn has_nonempty_stem_before_extension(token: &str) -> bool {
    for ext in SURFACED_REFERENCE_EXTENSIONS {
        if let Some(stem) = token.strip_suffix(ext) {
            return !stem.is_empty() && stem.chars().any(|c| c != '/');
        }
    }
    true
}

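/// Detect unresolved template placeholders in a token: literal `YYYY`,
/// `MM-DD`, or `XXXX`; an `NNN` run bounded by filename separators; and
/// a separator-bounded `XX` segment when the token also carries digits
/// (`2026-04-XX-...`). Placeholder tokens name files that do not exist
/// yet, so the detectors reject them.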
fn contains_unresolved_placeholder(token: &str) -> bool {
    if token.contains("YYYY") || token.contains("MM-DD") {
        return true;
    }
    if token.contains("XXXX") {
        return true;
    }
    if contains_repeat_run_in_filename_context(token, b'N', 3) {
        return true;
    }
    let bytes = token.as_bytes();
    let digit_context = bytes.iter().any(|b| b.is_ascii_digit());
    if !digit_context {
        return false;
    }
    let mut i = 0;
    while i + 2 <= bytes.len() {
        if &bytes[i..i + 2] == b"XX" {
            let before = i.checked_sub(1).map(|j| bytes[j]);
            let after = bytes.get(i + 2).copied();
            let sep_before = matches!(before, Some(b'-') | Some(b'_'));
            let sep_after = matches!(after, Some(b'-') | Some(b'_') | Some(b'.') | Some(b'/'));
            if sep_before && sep_after {
                return true;
            }
        }
        i += 1;
    }
    false
}

/// Detect a run of exactly `target` repeated placeholder chars (e.g.
/// `NNN`) that is bounded by filename separators. Used to catch
/// `NNN-short-slug.md` while leaving words like "inn" or "running"
/// alone.
fn contains_repeat_run_in_filename_context(token: &str, ch: u8, target: usize) -> bool {
    let bytes = token.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] != ch {
            i += 1;
            continue;
        }
        let mut j = i;
        while j < bytes.len() && bytes[j] == ch {
            j += 1;
        }
        let run_len = j - i;
        if run_len == target {
            let before = i.checked_sub(1).map(|k| bytes[k]);
            let after = bytes.get(j).copied();
            let sep_before =
                before.is_none() || matches!(before, Some(b'-') | Some(b'_') | Some(b'/'));
            let sep_after = matches!(
                after,
                Some(b'-') | Some(b'_') | Some(b'.') | Some(b'/') | None
            );
            if sep_before && sep_after {
                return true;
            }
        }
        i = j;
    }
    false
}

/// Reject `/namespace:command` slash-command identifiers (`/codex:review`,
/// `/codex:rescue`). These start with `/` so they trip the absolute-path
/// branch, but the `:` inside the first segment marks them as commands,
/// not filesystem paths.
fn looks_like_slash_command(token: &str) -> bool {
    let Some(rest) = token.strip_prefix('/') else {
        return false;
    };
    let first_segment = match rest.split_once('/') {
        Some((seg, _)) => seg,
        None => rest,
    };
    let Some((ns, cmd)) = first_segment.split_once(':') else {
        return false;
    };
    let valid = |seg: &str| {
        !seg.is_empty()
            && seg
                .chars()
                .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
    };
    valid(ns) && valid(cmd)
}

/// Permissive path-reference detector. Use for tokens the author has
/// marked as a reference — `key_files` entries and backtick-quoted
/// prose tokens. Quoting is an explicit authoring signal; the
/// stem-length and placeholder guards still apply to block obviously
/// malformed references (`.rs`, `YYYY-MM-DD-template.md`).
fn looks_like_surfaced_reference(token: &str) -> bool {
    if token.len() < 2 {
        return false;
    }
    if token.starts_with("http://") || token.starts_with("https://") {
        return false;
    }
    if contains_unresolved_placeholder(token) {
        return false;
    }
    let has_known_extension = ends_with_known_extension(token);
    if has_known_extension && !has_nonempty_stem_before_extension(token) {
        return false;
    }
    if token.contains('/') {
        return has_known_extension || has_anchor_or_repo_prefix(token);
    }
    if has_known_extension {
        return true;
    }
    SURFACED_EXTENSIONLESS_FILENAMES.contains(&token)
}

/// Strict path-reference detector for **unquoted** prose tokens.
///
/// Authors write slashed and extension-bearing phrases in prose
/// (`retire/rename/defend`, `Host/vendor names`, `radar/context_check.rs`,
/// `2026-04-16-my-note.md`) that are not repo path references. Only
/// accept a token as a candidate when the intent is unambiguous:
///
/// - absolute or relative path prefix (`/`, `./`, `../`)
/// - known repo-local prefix (`src/`, `docs/`, `.claude/`, ...)
/// - unslashed bare token from the narrow extensionless filename
///   allowlist (`Dockerfile`, `Makefile`, `LICENSE`, ...)
///
/// Bare extensioned filenames (`backlog.md`, `2026-04-16-my-note.md`)
/// and unprefixed slashed paths (`radar/context_check.rs`,
/// `vendor/cli/README`) are rejected here. Authors who want those
/// validated should quote them in backticks or list them in
/// `key_files`; both paths use the permissive detector.
fn looks_like_unquoted_prose_path(token: &str) -> bool {
    if token.len() < 2 {
        return false;
    }
    if token.starts_with("http://") || token.starts_with("https://") {
        return false;
    }
    if contains_unresolved_placeholder(token) {
        return false;
    }
    if looks_like_slash_command(token) {
        return false;
    }
    if ends_with_known_extension(token) && !has_nonempty_stem_before_extension(token) {
        return false;
    }
    if has_anchor_or_repo_prefix(token) {
        return true;
    }
    if token.contains('/') {
        return false;
    }
    SURFACED_EXTENSIONLESS_FILENAMES.contains(&token)
}

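/// Strip one trailing sentence period when it follows an alphanumeric
/// or a closing quote/bracket, so `backlog.md.` at the end of a
/// sentence normalizes to `backlog.md` while bare `.` and `..` pass
/// through unchanged.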
fn strip_sentence_period(text: &str) -> &str {
    let mut chars = text.char_indices().rev();
    let Some((last_idx, last_ch)) = chars.next() else {
        return text;
    };
    if last_ch != '.' {
        return text;
    }
    match chars.next() {
        Some((_, prev))
            if prev.is_alphanumeric() || matches!(prev, '`' | '"' | '\'' | ')' | ']' | '}') =>
        {
            &text[..last_idx]
        }
        _ => text,
    }
}

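/// Peel wrapping quotes and trailing punctuation from a prose token,
/// looping until the token stabilizes so alternating wrappers (e.g. a
/// backtick-quoted token followed by a sentence period) fully unwrap.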
fn normalize_prose_token(token: &str) -> &str {
    let mut current = token;
    loop {
        let stripped = current
            .trim_matches(|c: char| matches!(c, '`' | '"' | '\'' | ',' | ';' | ':' | '!' | '?'));
        let stripped = strip_sentence_period(stripped);
        if stripped == current {
            return current;
        }
        current = stripped;
    }
}

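/// Collect path-reference candidates from a prose line in two passes:
/// backtick-quoted spans run through the permissive detector
/// ([`looks_like_surfaced_reference`]), remaining unquoted tokens
/// through the strict one ([`looks_like_unquoted_prose_path`]).
/// Matches are appended to `out`, quoted spans first.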
pub(crate) fn collect_prose_path_candidates(text: &str, out: &mut Vec<String>) {
    // Pass 1: backtick-quoted tokens are an explicit authoring signal.
    // Use the permissive detector so `` `backlog.md` `` and
    // `` `Dockerfile` `` round-trip, even though their unquoted
    // equivalents would be rejected as ambiguous prose.
    let mut cursor = 0usize;
    while let Some(open_rel) = text[cursor..].find('`') {
        let start = cursor + open_rel + 1;
        let Some(close_rel) = text[start..].find('`') else {
            break;
        };
        let end = start + close_rel;
        let quoted = &text[start..end];
        let normalized = normalize_prose_token(quoted);
        if looks_like_surfaced_reference(normalized) {
            out.push(normalized.to_owned());
        }
        cursor = end + 1;
    }

    // Pass 2: unquoted tokens require stronger signals to avoid
    // flagging natural-language slashes, bare extensioned filenames,
    // and Rust module notation.
    for raw in text.split(|c: char| {
        c.is_whitespace() || matches!(c, ',' | ';' | '(' | ')' | '[' | ']' | '{' | '}')
    }) {
        if raw.contains('`') {
            // Token overlaps a backtick pair and has already been
            // considered by Pass 1.
            continue;
        }
        if raw.contains("::") {
            // Rust module notation (`radar/context_check.rs::build_X`)
            // is a code locator, not a filesystem path.
            continue;
        }
        let trimmed = normalize_prose_token(raw);
        if looks_like_unquoted_prose_path(trimmed) {
            out.push(trimmed.to_owned());
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn collect(text: &str) -> Vec<String> {
        let mut out: Vec<String> = Vec::new();
        collect_prose_path_candidates(text, &mut out);
        out
    }

    // Regressions: genuine prose path references should still be collected.
    #[test]
    fn collects_known_root_with_extension() {
        assert_eq!(
            collect("see src/state/consistency.rs"),
            vec!["src/state/consistency.rs"]
        );
    }

    #[test]
    fn collects_docs_path_with_extension() {
        assert_eq!(
            collect("mentioned in docs/dev/2026-04-15-review.md"),
            vec!["docs/dev/2026-04-15-review.md"]
        );
    }

    #[test]
    fn collects_leading_dot_slash_anchor() {
        assert_eq!(
            collect("run ./scripts/build.sh"),
            vec!["./scripts/build.sh"]
        );
    }

    #[test]
    fn collects_bare_known_extensionless_filename() {
        assert_eq!(collect("update the Dockerfile now"), vec!["Dockerfile"]);
    }

    #[test]
    fn collects_backtick_quoted_bare_extensioned_filename() {
        assert_eq!(collect("check `backlog.md` first"), vec!["backlog.md"]);
    }

    #[test]
    fn collects_backtick_quoted_unprefixed_slashed_path() {
        assert_eq!(
            collect("see `radar/context_check.rs` in review"),
            vec!["radar/context_check.rs"]
        );
    }

    // Existing fixes for #526: slash-in-prose and placeholder-date false positives.
    #[test]
    fn skips_slash_separated_identifier_without_extension_or_root() {
        assert!(collect("state, paths, session, start/dispatch, protected_write").is_empty());
    }

    #[test]
    fn skips_natural_language_slash_noun_phrase() {
        assert!(collect("Host/vendor names vary").is_empty());
    }

    #[test]
    fn skips_slash_separated_verb_listing() {
        assert!(collect("retire/rename/defend the surface").is_empty());
    }

    #[test]
    fn skips_slash_separated_identifier_listing() {
        assert!(collect("execution_gates/lease/escalation flow").is_empty());
    }

    #[test]
    fn skips_bare_extensioned_filename_in_prose() {
        assert!(collect("write 2026-04-16-my-note.md later").is_empty());
    }

    #[test]
    fn skips_unresolved_xx_day_placeholder_in_dated_filename() {
        assert!(collect("write docs/dev/2026-04-XX-kernel-review-summary.md later").is_empty());
    }

    #[test]
    fn skips_xxxx_xx_xx_placeholder() {
        assert!(collect("docs/dev/XXXX-XX-XX-template.md placeholder").is_empty());
    }

    #[test]
    fn skips_yyyy_mm_dd_placeholder() {
        assert!(collect("docs/dev/YYYY-MM-DD-template.md").is_empty());
    }

    // #585: new regression guards.
    #[test]
    fn skips_bare_extension_token() {
        assert!(collect("touch .rs files in the tree").is_empty());
        assert!(collect(".md").is_empty());
    }

    #[test]
    fn skips_bare_extension_token_even_when_backticked() {
        assert!(collect("we added `.rs` files").is_empty());
    }

    #[test]
    fn skips_bare_changelog_reference() {
        // Previously emitted because `CHANGELOG` was in the extensionless
        // allowlist; prose almost always means CHANGELOG.md.
        assert!(collect("update the CHANGELOG now").is_empty());
    }

    #[test]
    fn still_collects_backticked_changelog_md() {
        assert_eq!(
            collect("update `CHANGELOG.md` for the release"),
            vec!["CHANGELOG.md"]
        );
    }

    #[test]
    fn skips_nnn_placeholder_in_dated_filename() {
        assert!(collect("write docs/dev/NNN-short-slug.md later").is_empty());
    }

    #[test]
    fn skips_nnn_placeholder_bare() {
        assert!(collect("template NNN-slug.md is a placeholder").is_empty());
    }

    #[test]
    fn does_not_treat_nn_or_nnnn_as_nnn_placeholder() {
        // Two Ns ("CNN") or four Ns should not trigger the NNN guard.
        assert_eq!(
            collect("docs/dev/CNN-report.md exists"),
            vec!["docs/dev/CNN-report.md"]
        );
        assert_eq!(
            collect("docs/dev/NNNN-beta.md exists"),
            vec!["docs/dev/NNNN-beta.md"]
        );
    }

    #[test]
    fn skips_rust_module_notation() {
        assert!(
            collect("trace radar/context_check.rs::build_context_check_decision for detail")
                .is_empty()
        );
    }

    #[test]
    fn skips_unprefixed_slashed_rust_path_in_unquoted_prose() {
        // Per issue #585: `radar/context_check.rs` mentioned bare in
        // prose should not be treated as a repo path (real file lives
        // at src/state/radar/context_check.rs). Authors who want the
        // bare path validated can backtick it.
        assert!(collect("see radar/context_check.rs for detail").is_empty());
    }

    #[test]
    fn skips_slash_command_identifier() {
        assert!(collect("run /codex:review before merging").is_empty());
        assert!(collect("run /codex:rescue if blocked").is_empty());
    }

    #[test]
    fn does_not_confuse_slash_command_with_absolute_path() {
        // A real absolute path (no `:` in first segment) must still be
        // treated as a path candidate.
        assert_eq!(
            collect("ship /usr/local/bin/ccd with the package"),
            vec!["/usr/local/bin/ccd"]
        );
    }
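
    // A sketch covering the token-normalization path end to end:
    // wrapping punctuation peels off before detection. The test name
    // and sample path are new here; the behavior follows from
    // `normalize_prose_token` above.
    #[test]
    fn strips_trailing_sentence_period_before_matching() {
        assert_eq!(collect("finish src/cli/run.rs."), vec!["src/cli/run.rs"]);
    }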
}

/// Inflected forms of the "create-action" verb families recognized by
/// the verb walk in [`clause_verb_kind_before`]. When a handoff prose
/// line contains any of
/// these as a whole word, the drift detectors treat path references in
/// that line as **forward-references** (expected outputs of the session)
/// rather than stale inputs. The list is intentionally explicit rather
/// than rule-driven so that irregular forms (`wrote`, `written`) are
/// covered without risking false positives from generic stemming.
///
/// See ccd#597 — forward-references to not-yet-authored deliverables
/// were firing the existence-audit drift signal and flipping
/// `session_boundary.action` to `stop` on clean in-progress sessions.
pub(crate) const CREATE_VERB_FORMS: &[&str] = &[
    // author
    "author",
    "authors",
    "authored",
    "authoring", //
    // write
    "write",
    "writes",
    "writing",
    "wrote",
    "written", //
    // publish
    "publish",
    "publishes",
    "publishing",
    "published", //
    // create
    "create",
    "creates",
    "creating",
    "created", //
    // produce
    "produce",
    "produces",
    "producing",
    "produced", //
    // draft
    "draft",
    "drafts",
    "drafting",
    "drafted", //
    // generate
    "generate",
    "generates",
    "generating",
    "generated", //
    // emit
    "emit",
    "emits",
    "emitting",
    "emitted", //
    // compose
    "compose",
    "composes",
    "composing",
    "composed", //
    // land (idiomatic in this repo: "land PR", "land the slice")
    "land",
    "lands",
    "landing",
    "landed", //
    // open (idiomatic: "open a PR", "open the doc")
    "open",
    "opens",
    "opening",
    "opened", //
];

/// Inflected forms of the "input-action" verb families. When one of
/// these governs a path reference, that path is an **input** the session
/// depends on — the existence audit must still fire so missing
/// prerequisites block resume. Mirrors [`CREATE_VERB_FORMS`].
pub(crate) const INPUT_VERB_FORMS: &[&str] = &[
    // read
    "read",
    "reads",
    "reading", //
    // inspect
    "inspect",
    "inspects",
    "inspecting",
    "inspected", //
    // review
    "review",
    "reviews",
    "reviewing",
    "reviewed", //
    // check
    "check",
    "checks",
    "checking",
    "checked", //
    // analyze
    "analyze",
    "analyzes",
    "analyzing",
    "analyzed", //
    // examine
    "examine",
    "examines",
    "examining",
    "examined", //
    // load
    "load",
    "loads",
    "loading",
    "loaded", //
    // fetch
    "fetch",
    "fetches",
    "fetching",
    "fetched", //
    // ingest
    "ingest",
    "ingests",
    "ingesting",
    "ingested", //
    // update / modify / edit / patch — the file must already exist
    "update",
    "updates",
    "updating",
    "updated", //
    "modify",
    "modifies",
    "modifying",
    "modified", //
    "edit",
    "edits",
    "editing",
    "edited", //
    "patch",
    "patches",
    "patching",
    "patched", //
];

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum VerbKind {
    Create,
    Input,
}

/// Characters that can appear inside a path token. Used by
/// [`find_token_at_path_boundary`] to avoid matching a token at an
/// occurrence where it is actually the suffix of a larger path.
fn is_path_char(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'/' | b'.' | b'-' | b'#')
}

/// Locate `token` inside `text` at an occurrence whose preceding byte
/// is not a path character. This treats `input.md` in
/// `Generate docs/generated-input.md after reading input.md` as a
/// standalone token match (preceded by a space), not as a suffix of
/// the create-output path (preceded by `-`).
fn find_token_at_path_boundary(text: &str, token: &str) -> Option<usize> {
    if token.is_empty() {
        return None;
    }
    let bytes = text.as_bytes();
    let mut search_start = 0usize;
    while search_start + token.len() <= text.len() {
        let rel = text[search_start..].find(token)?;
        let abs = search_start + rel;
        let boundary_before = abs == 0 || !is_path_char(bytes[abs - 1]);
        if boundary_before {
            return Some(abs);
        }
        // Advance past the current match along a char boundary: the
        // char at `abs` is the first char of `token` and may be
        // multi-byte, so a bare `abs + 1` could land mid-char.
        let char_len = text[abs..]
            .chars()
            .next()
            .map(|c| c.len_utf8())
            .unwrap_or(1);
        search_start = abs + char_len;
    }
    None
}

/// Returns true when the path `token` in `text` is governed by a
/// create-action verb and should be treated as a **forward-reference**
/// (planned output) rather than a missing input.
///
/// Analyzes `text` at clause granularity — splitting on `. `, `; `, `, `,
/// ` and `, ` but `, ` then ` — so mixed lines like
/// `Read docs/input.md and publish docs/output.md` still flag the input
/// path while suppressing the output path. When a clause has no
/// recognized verb, it inherits the verb kind from the prior clause so
/// enumerations like `Publish docs/a.md and docs/b.md` treat both items
/// as outputs.
///
/// Returns false when no create-verb governs the token (including the
/// "no verb found" case) so the drift signal keeps firing for missing
/// prerequisites.
pub(crate) fn token_is_forward_reference(text: &str, token: &str) -> bool {
    let clauses = split_clauses(text);
    if clauses.is_empty() {
        return false;
    }

    // Find which clause contains the token. `token` came from the
    // prose-candidate tokenizer of the same text, so a byte-level find
    // is reliable. Locate it at a path boundary so `input.md` audited
    // from a line like `Generate docs/generated-input.md after reading
    // input.md` maps to the standalone occurrence rather than the
    // suffix of the create-output path; without that guard the verb
    // scan would run at the wrong position and inherit the create verb
    // for an unrelated audit. If the locate fails anyway, fall back to
    // a conservative whole-text check: suppress only when some clause
    // without a forced kind resolves to a create verb.
    let Some(token_pos) = find_token_at_path_boundary(text, token) else {
        return clauses.iter().any(|c| {
            c.forced_kind.is_none() && clause_verb_kind_anywhere(c.text) == Some(VerbKind::Create)
        });
    };

    let mut cumulative = 0usize;
    let mut last_verb: Option<VerbKind> = None;
    let mut token_verb: Option<VerbKind> = None;
    for clause in &clauses {
        let clause_end = cumulative + clause.text.len();
        let contains_token = token_pos >= cumulative && token_pos < clause_end;

        // Dependency prepositions (` from `, ` using `, ` based on `, ...)
        // reset the clause kind to Input regardless of the enclosing
        // verb: `Generate docs/out.md from docs/in.md` flags the input
        // while still suppressing the create output.
        if let Some(forced) = clause.forced_kind {
            last_verb = Some(forced);
        }

        if contains_token {
            // Scan verbs only up to the token's position within this
            // clause so later input verbs can override earlier create
            // verbs for the token: `Generate docs/out.md after
            // reviewing docs/missing.md` flags the input path.
            let token_offset_in_clause = token_pos - cumulative;
            if let Some(kind) = clause_verb_kind_before(clause.text, token_offset_in_clause) {
                last_verb = Some(kind);
            }
            token_verb = last_verb;
            break;
        }

        // For clauses that do not contain the token, inherit based on
        // the full-clause verb kind so enumerations propagate correctly.
        if let Some(kind) = clause_verb_kind_anywhere(clause.text) {
            last_verb = Some(kind);
        }
        cumulative = clause_end;
    }
    matches!(token_verb, Some(VerbKind::Create))
}

/// Latest verb kind recognized within the first `upto_bytes` of
/// `clause`. Scanning up to the token's byte offset (rather than the
/// whole clause) lets later input verbs override earlier create verbs
/// *before the token*, so prose like
/// `Generate docs/out.md after reviewing docs/missing-source.md` flags
/// the input path — the `reviewing` verb appears between `generate`
/// and the input token and wins by virtue of being closer.
fn clause_verb_kind_before(clause: &str, upto_bytes: usize) -> Option<VerbKind> {
    let upper = upto_bytes.min(clause.len());
    let prefix = &clause[..upper];
    let lower = prefix.to_ascii_lowercase();
    let mut latest: Option<VerbKind> = None;
    for word in lower.split(|c: char| !c.is_ascii_alphabetic()) {
        if word.is_empty() {
            continue;
        }
        if CREATE_VERB_FORMS.contains(&word) {
            latest = Some(VerbKind::Create);
        } else if INPUT_VERB_FORMS.contains(&word) {
            latest = Some(VerbKind::Input);
        }
    }
    latest
}

/// Latest verb kind recognized anywhere in the clause. Used for
/// enumeration inheritance when a subsequent clause has no verb of its
/// own: `Publish docs/a.md and docs/b.md` sees `b.md` in clause 2 with
/// no local verb, inheriting Create from clause 1.
fn clause_verb_kind_anywhere(clause: &str) -> Option<VerbKind> {
    clause_verb_kind_before(clause, clause.len())
}

/// A clause returned by [`split_clauses`]. `forced_kind` captures the
/// semantic reset implied by the preceding separator: dependency
/// prepositions like ` from `, ` using `, ` based on ` introduce input
/// paths regardless of the enclosing verb, so the walker treats the
/// clause as if it had an implicit Input verb. Neutral separators
/// (`. `, `; `, `, `, ` and `, ` but `, ` then `) leave `forced_kind`
/// unset so the clause inherits the last seen verb kind.
struct Clause<'a> {
    text: &'a str,
    forced_kind: Option<VerbKind>,
}

/// Split prose into clauses while preserving total byte length so
/// callers can correlate byte offsets from the original text back to
/// individual clauses. The separator text stays attached to the end of
/// the preceding clause. Dependency prepositions are checked before
/// neutral separators; no separator is a prefix of another, so scan
/// order within each list is immaterial today. Keep longer forms first
/// if overlapping forms (` based on ` vs ` on `) are ever added.
fn split_clauses(text: &str) -> Vec<Clause<'_>> {
    const NEUTRAL_SEPARATORS: &[&str] = &[". ", "; ", ", ", " and ", " but ", " then "];
    // Explicit dependency prepositions only. Ambiguous English
    // prepositions (`in`, `with`, `for`) are intentionally excluded:
    // `Create docs/out.md in docs/target-dir/` usually names the
    // target location (a forward reference), not a missing input, and
    // forcing Input there would reintroduce the original false-positive
    // class this fix is meant to close. Known trade-off: a missing
    // dependency after one of those ambiguous prepositions may still
    // be suppressed when a create verb is active. Explicit input verbs
    // (`read`, `review`, ...) and explicit dependency prepositions
    // below cover the cases where intent is clear.
    const INPUT_PREPOSITION_SEPARATORS: &[&str] = &[
        " based on ",
        " out of ",
        " from ",
        " using ",
        " against ",
        " via ",
    ];

    let mut clauses: Vec<Clause<'_>> = Vec::new();
    let mut start = 0usize;
    let mut pending_forced_kind: Option<VerbKind> = None;
    let mut idx = 0usize;
    while idx < text.len() {
        let tail = &text[idx..];
        let mut matched: Option<(usize, Option<VerbKind>)> = None;
        for sep in INPUT_PREPOSITION_SEPARATORS {
            if tail.starts_with(sep) {
                matched = Some((sep.len(), Some(VerbKind::Input)));
                break;
            }
        }
        if matched.is_none() {
            for sep in NEUTRAL_SEPARATORS {
                if tail.starts_with(sep) {
                    matched = Some((sep.len(), None));
                    break;
                }
            }
        }
        if let Some((sep_len, next_forced)) = matched {
            let clause_text = &text[start..idx + sep_len];
            clauses.push(Clause {
                text: clause_text,
                forced_kind: pending_forced_kind,
            });
            start = idx + sep_len;
            idx += sep_len;
            pending_forced_kind = next_forced;
        } else {
            // Advance by one full UTF-8 character so slicing at `idx`
            // always lands on a char boundary. Byte-level `idx += 1`
            // panics when the current char is multi-byte (em dashes,
            // non-ASCII punctuation).
            let char_len = tail.chars().next().map(|c| c.len_utf8()).unwrap_or(1);
            idx += char_len;
        }
    }
    if start < text.len() {
        clauses.push(Clause {
            text: &text[start..],
            forced_kind: pending_forced_kind,
        });
    }
    if clauses.is_empty() && !text.is_empty() {
        clauses.push(Clause {
            text,
            forced_kind: None,
        });
    }
    clauses
}

#[cfg(test)]
mod forward_reference_tests {
    use super::*;

    #[test]
    fn suppresses_output_and_flags_input_in_mixed_clause_line() {
        let text = "Read docs/input.md and publish docs/output.md with the numbers.";
        assert!(!token_is_forward_reference(text, "docs/input.md"));
        assert!(token_is_forward_reference(text, "docs/output.md"));
    }

    #[test]
    fn inherits_create_verb_across_enumeration_clauses() {
        let text = "Publish docs/a.md and docs/b.md with the release notes.";
        assert!(token_is_forward_reference(text, "docs/a.md"));
        assert!(token_is_forward_reference(text, "docs/b.md"));
    }

    #[test]
    fn inherits_input_verb_across_enumeration_clauses() {
        let text = "Read docs/a.md and docs/b.md before committing.";
        assert!(!token_is_forward_reference(text, "docs/a.md"));
        assert!(!token_is_forward_reference(text, "docs/b.md"));
    }

    #[test]
    fn no_verb_means_not_a_forward_reference() {
        let text = "docs/orphan.md stays untouched.";
        assert!(!token_is_forward_reference(text, "docs/orphan.md"));
    }

    #[test]
    fn create_then_input_in_separate_sentences() {
        let text = "Publish docs/out.md. Then read docs/input.md before wrap-up.";
        assert!(token_is_forward_reference(text, "docs/out.md"));
        assert!(!token_is_forward_reference(text, "docs/input.md"));
    }

    #[test]
    fn dependency_preposition_from_flags_source_as_input() {
        let text = "Generate docs/report.md from docs/missing-source.md before wrap-up.";
        assert!(token_is_forward_reference(text, "docs/report.md"));
        assert!(!token_is_forward_reference(text, "docs/missing-source.md"));
    }

    #[test]
    fn dependency_preposition_using_flags_source_as_input() {
        let text = "Publish docs/out.md using docs/lib-missing.md and the template.";
        assert!(token_is_forward_reference(text, "docs/out.md"));
        assert!(!token_is_forward_reference(text, "docs/lib-missing.md"));
    }

    #[test]
    fn dependency_preposition_based_on_flags_source_as_input() {
        let text = "Create docs/new.md based on docs/reference-missing.md.";
        assert!(token_is_forward_reference(text, "docs/new.md"));
        assert!(!token_is_forward_reference(
            text,
            "docs/reference-missing.md"
        ));
    }

    #[test]
    fn input_verb_after_create_verb_wins_for_later_token() {
        // Single clause, no separator between the two verbs: the input
        // verb (`reviewing`) sits between the create verb and the input
        // token. Token-local verb scanning picks the closer verb.
        let text = "Generate docs/out.md after reviewing docs/missing-source.md.";
        assert!(token_is_forward_reference(text, "docs/out.md"));
        assert!(!token_is_forward_reference(text, "docs/missing-source.md"));
    }

    #[test]
    fn input_verb_after_create_verb_wins_without_preposition() {
        let text = "Publish docs/out.md once you read docs/input-missing.md.";
        assert!(token_is_forward_reference(text, "docs/out.md"));
        assert!(!token_is_forward_reference(text, "docs/input-missing.md"));
    }

    #[test]
    fn target_location_in_preposition_inherits_create_verb() {
        // `in <path>` most often names a target location under a create
        // verb (`Create docs/out.md in docs/target-dir/`). The path
        // should be treated as a forward-reference so a clean resume
        // does not spuriously flag the target directory as drift.
        let text = "Create docs/out.md in docs/target-dir/index.md.";
        assert!(token_is_forward_reference(text, "docs/out.md"));
        assert!(token_is_forward_reference(text, "docs/target-dir/index.md"));
    }

    #[test]
    fn non_ascii_prose_does_not_panic() {
        // Em dashes and other multi-byte characters show up in real
        // handoff prose. The clause walker must advance along UTF-8
        // char boundaries so slicing never lands mid-char.
        let text = "Publish docs/out.md — then read docs/missing-input.md.";
        assert!(token_is_forward_reference(text, "docs/out.md"));
        assert!(!token_is_forward_reference(text, "docs/missing-input.md"));
    }

    #[test]
    fn substring_of_earlier_output_does_not_steal_verb() {
        // `input.md` appears twice: as the suffix of
        // `docs/generated-input.md` (a create output) and as a
        // standalone read dependency. The audit must run against the
        // standalone occurrence so `input.md` stays classified as Input.
        let text = "Generate docs/generated-input.md after reading input.md.";
        assert!(token_is_forward_reference(text, "docs/generated-input.md"));
        assert!(!token_is_forward_reference(text, "input.md"));
    }
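
    // A sketch of the offset-preservation invariant that the clause
    // walker in `token_is_forward_reference` relies on; the test name
    // is new here.
    #[test]
    fn split_clauses_preserves_total_byte_length() {
        let text = "Publish docs/a.md and read docs/b.md from docs/c.md.";
        let clauses = split_clauses(text);
        let total: usize = clauses.iter().map(|c| c.text.len()).sum();
        assert_eq!(total, text.len());
    }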
}

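/// Parse a `key_files` entry into path candidates. A leading
/// backtick-quoted span wins outright; otherwise the quote-trimmed
/// entry is offered, and when a ` - ` annotation follows the path
/// (`src/main.rs - entry point`) the prefix before the annotation is
/// offered as a second candidate.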
pub(crate) fn extract_key_files_candidates(text: &str) -> Vec<String> {
    let trimmed = text.trim();
    let mut candidates: Vec<String> = Vec::new();

    if let Some(rest) = trimmed.strip_prefix('`') {
        if let Some(end) = rest.find('`') {
            let path = &rest[..end];
            if !path.is_empty() {
                candidates.push(path.to_owned());
                return candidates;
            }
        }
    }

    let stripped = trimmed.trim_matches(|c: char| matches!(c, '`' | '"' | '\''));
    if !stripped.is_empty() {
        candidates.push(stripped.to_owned());
    }

    if let Some((prefix, _)) = stripped.split_once(" - ") {
        let prefix = prefix.trim();
        if !prefix.is_empty() && prefix != stripped {
            candidates.push(prefix.to_owned());
        }
    }

    candidates
}
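
// Illustrative tests for `extract_key_files_candidates`, derived from
// the function body above; the module and test names are new here.
#[cfg(test)]
mod key_files_tests {
    use super::*;

    #[test]
    fn backticked_entry_yields_only_the_quoted_path() {
        assert_eq!(
            extract_key_files_candidates("`src/main.rs` - entry point"),
            vec!["src/main.rs"]
        );
    }

    #[test]
    fn annotated_entry_yields_full_text_and_path_prefix() {
        assert_eq!(
            extract_key_files_candidates("src/main.rs - entry point"),
            vec!["src/main.rs - entry point", "src/main.rs"]
        );
    }
}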