#!/usr/bin/env bash
# contrib/validation/doc-rust-samples.sh — validate Rust samples in Synta docs
#
# Extracts every fenced Rust code block from the documentation, wraps each
# one in a compilable translation unit, and reports failures with the
# originating file and line number.
#
# Delegates all markdown parsing, classification, and source-file wrapping to
# the companion script doc-rust-samples.py, then type-checks each generated
# file by writing it to a temporary Cargo project and running `cargo check`.
#
# Usage:
#   ./contrib/validation/doc-rust-samples.sh [OPTIONS] [FILE.md ...]
#
# If neither FILE.md arguments nor --docs-dir are given, every .md file in
# the workspace is processed (build directories and external test repos
# under tests/vectors/ are excluded automatically).  Explicit FILE.md
# arguments take precedence: when any are given, --docs-dir is ignored.
#
# Options:
#   --docs-dir DIR     Search for .md files in DIR instead of the whole workspace
#   --features-serde   Also compile serde_json blocks (requires synta serde feature)
#   --verbose, -v      Print a line for every block, not just failures
#   --help, -h         Show this message and exit
#
# Exit status: 0 if all samples compiled successfully, 1 if any failed.

# Require bash 4+
# `${BASH_VERSINFO:-0}` expands to the first element of BASH_VERSINFO (the
# major version) under bash, and to 0 when the variable is unset, so the
# check below also rejects shells that do not define BASH_VERSINFO.
# On failure a message is written to stderr and the script exits with 1.
if [ "${BASH_VERSINFO:-0}" -lt 4 ]; then
    echo "error: bash 4 or later is required (you have ${BASH_VERSION:-unknown})" >&2
    exit 1
fi

# Strict mode: abort on unhandled command failures (-e), on expansion of
# unset variables (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# ── Colour helpers (honour NO_COLOR) ─────────────────────────────────────────
# Following the NO_COLOR convention (https://no-color.org), colour output is
# disabled whenever NO_COLOR is set to ANY non-empty value ("1", "true", …),
# not only when it equals "1".  An unset or empty NO_COLOR keeps colours on.
#
# The values are stored as literal backslash sequences; they only turn into
# real escape codes because they are interpolated into printf FORMAT strings.
if [[ -n "${NO_COLOR:-}" ]]; then
    RED=''; GREEN=''; YELLOW=''; CYAN=''; BOLD=''; NC=''
else
    RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
    CYAN='\033[0;36m'; BOLD='\033[1m'; NC='\033[0m'
fi

# ── Locate repo root ─────────────────────────────────────────────────────────
# SCRIPT_DIR is the absolute directory containing this script; REPO_ROOT is
# the directory two levels above it, made absolute via cd + pwd.
SCRIPT_DIR="$(
    cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
REPO_ROOT="$(
    cd "${SCRIPT_DIR}/../.." && pwd
)"

# ── Defaults ─────────────────────────────────────────────────────────────────
# Overridden by the command-line options parsed further down.
declare -a EXTRA_MD_FILES=()   # markdown files named explicitly on the command line
DOCS_DIR=                      # empty = workspace-wide scan; set via --docs-dir
FEATURES_SERDE=0               # 1 when --features-serde is given
VERBOSE=0                      # 1 when --verbose / -v is given

# ── Helpers ──────────────────────────────────────────────────────────────────
# die MESSAGE...
#   Writes "error: MESSAGE" (all arguments joined by single spaces) to stderr,
#   with the "error:" prefix wrapped in the RED/NC colour codes, then ends the
#   script with exit status 1.
die() {
    printf "${RED}error:${NC} %s\n" "$*" >&2
    exit 1
}
# info MESSAGE...
#   Writes MESSAGE (all arguments joined by single spaces) to stdout, wrapped
#   in the CYAN/NC colour codes and followed by a newline.
info() {
    local message="$*"
    printf "${CYAN}%s${NC}\n" "$message"
}

# usage
#   Prints the help text to stdout: the comment block at the top of this
#   file (everything after the shebang up to the first line that is not a
#   comment), with the leading "#" and one optional space removed.
#   Stopping at the end of that block keeps the internal section comments
#   found further down the script out of the --help output.
usage() {
    awk 'NR == 1 && /^#!/ { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' \
        "${BASH_SOURCE[0]}"
}

# ── Argument parsing ─────────────────────────────────────────────────────────
# Options and FILE.md arguments may appear in any order.  Anything that is
# neither a known option nor a path ending in .md is rejected via die.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --docs-dir)
            # Check the operand explicitly: with `set -u` a bare "$2" would
            # abort with an opaque "unbound variable" message when DIR is
            # missing, and a non-existent DIR would only surface later as
            # "No markdown files found".
            [[ $# -ge 2 ]] || die "--docs-dir requires a DIR argument"
            [[ -d "$2" ]]  || die "--docs-dir: not a directory: $2"
            DOCS_DIR="$2"
            shift 2
            ;;
        --features-serde)
            FEATURES_SERDE=1
            shift
            ;;
        --verbose|-v)
            VERBOSE=1
            shift
            ;;
        --help|-h)
            usage
            exit 0
            ;;
        *.md)
            EXTRA_MD_FILES+=("$1")
            shift
            ;;
        *)
            die "Unknown option: $1"
            ;;
    esac
done

# ── Preflight checks ─────────────────────────────────────────────────────────
# Both external tools must be resolvable by the shell, and the companion
# extractor must sit in the same directory as this script; otherwise the
# run is aborted via die before any work is done.
for required_tool in python3 cargo; do
    if ! command -v "$required_tool" >/dev/null 2>&1; then
        die "$required_tool is required but not found"
    fi
done

PYTHON_SCRIPT="$SCRIPT_DIR/doc-rust-samples.py"
if [[ ! -f "$PYTHON_SCRIPT" ]]; then
    die "doc-rust-samples.py not found next to this script (expected: $PYTHON_SCRIPT)"
fi

# ── Temp workspace ───────────────────────────────────────────────────────────
# Everything generated during the run lives under WORK_DIR, a fresh
# directory from mktemp; the EXIT trap deletes it when the script exits.
remove_work_dir() {
    rm -rf "$WORK_DIR"
}
WORK_DIR="$(mktemp -d)"
trap remove_work_dir EXIT

# ── Collect markdown files ───────────────────────────────────────────────────
# MD_FILES ends up holding the documents to scan, chosen by precedence:
#   1. files named on the command line, used exactly as given;
#   2. otherwise every *.md file below --docs-dir, sorted;
#   3. otherwise every *.md file below the repository root, sorted, minus
#      build/cache directories and external repositories cloned into
#      tests/vectors/.
MD_FILES=()
if [[ ${#EXTRA_MD_FILES[@]} -ne 0 ]]; then
    MD_FILES=("${EXTRA_MD_FILES[@]}")
elif [[ -n "$DOCS_DIR" ]]; then
    mapfile -t MD_FILES < <(find "$DOCS_DIR" -name '*.md' -type f | sort)
else
    # Each pattern below becomes one "! -path PATTERN" test for find.
    md_find_args=("$REPO_ROOT" -name '*.md' -type f)
    for excluded_path in \
        '*/target/*' \
        '*/.cargo/*' \
        '*/bench-data/*' \
        '*/.pytest_cache/*' \
        '*/.claude/*' \
        '*/tests/vectors/cryptography/*' \
        '*/tests/vectors/dilithium-certificates/*' \
        '*/tests/vectors/kyber-certificates/*' \
        '*/tests/vectors/mozilla-ca/*' \
        '*/tests/vectors/ccadb/*'
    do
        md_find_args+=('!' -path "$excluded_path")
    done
    mapfile -t MD_FILES < <(find "${md_find_args[@]}" | sort)
fi

if [[ ${#MD_FILES[@]} -eq 0 ]]; then
    die "No markdown files found"
fi

# ── Banner ───────────────────────────────────────────────────────────────────
# Prints a short run summary to stdout: title, repository root, number of
# markdown sources, and whether serde_json blocks will be compiled.
printf '\n'
info "Synta documentation Rust sample validator"
info "========================================="
printf 'Repo root: %s\n'                 "$REPO_ROOT"
printf 'Sources  : %d markdown file(s)\n' "${#MD_FILES[@]}"
case "$FEATURES_SERDE" in
    1) printf 'Serde    : enabled (serde_json blocks will be compiled)\n' ;;
    *) printf 'Serde    : disabled (pass --features-serde to enable)\n' ;;
esac
printf '\n'

# ── Run Python extractor ─────────────────────────────────────────────────────
# Invokes doc-rust-samples.py with WORK_DIR and the markdown files.  The
# script's stdout is captured in PY_OUTPUT; the manifest of compilable
# blocks is expected at $WORK_DIR/manifest.tsv afterwards.
MANIFEST="$WORK_DIR/manifest.tsv"
PY_SERDE_FLAG=()
if [[ $FEATURES_SERDE -eq 1 ]]; then
    PY_SERDE_FLAG=(--features-serde)
fi
# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array.  A plain
# "${PY_SERDE_FLAG[@]}" is treated as an unbound variable under `set -u` by
# bash 4.0–4.3, which would abort every run that does not pass
# --features-serde on those versions.
PY_OUTPUT=$(python3 "$PYTHON_SCRIPT" ${PY_SERDE_FLAG[@]+"${PY_SERDE_FLAG[@]}"} "$WORK_DIR" "${MD_FILES[@]}")
# Python emits "total<TAB>skipped" on stdout.
# ${var%%pattern}  removes the longest suffix match  → keeps everything before the first tab.
# ${var##pattern}  removes the longest prefix match  → keeps everything after the last tab.
TOTAL_BLOCKS="${PY_OUTPUT%%$'\t'*}"
SKIP="${PY_OUTPUT##*$'\t'}"
printf 'Extracted %s Rust code block(s) (%s skipped)\n\n' \
    "$TOTAL_BLOCKS" "$SKIP"

# A missing or empty manifest means there is nothing to compile; that is
# reported as a success (exit 0), not as a failure.
if [[ ! -s "$MANIFEST" ]]; then
    printf "${YELLOW}No compilable Rust code blocks found in the specified files.${NC}\n"
    exit 0
fi

# ── Build the temporary Cargo check crate ───────────────────────────────────
# Snippets are type-checked through src/lib.rs of one throw-away Cargo
# project that pulls in synta (and the companion crates) as path
# dependencies on the working tree.  Because the project shares the repo's
# target directory, the dependencies compiled by a normal workspace build
# are reused and each check only has to re-analyse the small lib.rs.
CRATE_DIR="$WORK_DIR/crate"
mkdir -p "$CRATE_DIR/src"

# With --features-serde the synta "serde" feature is enabled and serde_json
# is added as a dependency; otherwise the extra dependency line stays empty.
if [[ $FEATURES_SERDE -eq 1 ]]; then
    SYNTA_FEATURES='"std", "derive", "serde"'
    SERDE_DEP='serde_json = "1"'
else
    SYNTA_FEATURES='"std", "derive"'
    SERDE_DEP=''
fi

# Unquoted heredoc delimiter: $REPO_ROOT, $SYNTA_FEATURES and $SERDE_DEP are
# expanded into the manifest text.
cat <<EOF > "$CRATE_DIR/Cargo.toml"
[package]
name    = "synta-doc-check"
version = "0.1.0"
edition = "2021"
publish = false

[lib]
path = "src/lib.rs"

[dependencies]
synta = { path = "$REPO_ROOT", features = [$SYNTA_FEATURES] }
synta-certificate = { path = "$REPO_ROOT/synta-certificate", features = ["openssl"] }
synta-x509-verification = { path = "$REPO_ROOT/synta-x509-verification", features = ["openssl"] }
$SERDE_DEP
EOF

# Share the workspace's target directory with the temporary crate so cargo
# can reuse the already-built synta rlib and its transitive dependencies
# (smallvec, synta-derive, …) instead of compiling them again.
CARGO_TARGET_DIR="$REPO_ROOT/target"
export CARGO_TARGET_DIR

# cargo check rejects a crate whose source file is missing, so create an
# empty src/lib.rs now; it is overwritten with snippet code later.
touch "$CRATE_DIR/src/lib.rs"

# Warm-up check: builds synta and all dependencies, or confirms they are up
# to date.  A failure at this point indicates a problem with synta itself
# rather than with any documentation snippet.
#
# --offline is deliberately NOT used for this first run.  The temporary
# crate has no Cargo.lock and may depend on crates (e.g. serde_json) that
# are only dev-dependencies of the workspace and hence were not downloaded
# by 'cargo build'.  This run creates Cargo.lock and fetches whatever is
# missing; the later checks then run with --offline because everything is
# available in CARGO_TARGET_DIR and the registry cache.
info "Building synta and dependencies (first-time setup may take a moment)…"
cargo check --manifest-path "$CRATE_DIR/Cargo.toml" --quiet ||
    die "Initial cargo check failed — synta may have compilation errors"
printf '\n'

# ── Check helpers ────────────────────────────────────────────────────────────

# check_src SRC_RS ERRFILE
#   Copies SRC_RS to the crate's src/lib.rs and runs an offline cargo check.
#   Compiler output (stderr) is captured in ERRFILE.
#   Returns the cargo check exit code.  If the copy itself fails, cargo is
#   not run: the cp diagnostic is written to ERRFILE and cp's non-zero
#   status is returned.
check_src() {
    local src="$1" errfile="$2"
    # Callers use this function as an `if` condition, where `set -e` is
    # suspended; without the explicit `|| return` a failed copy would be
    # ignored and cargo would silently check whatever lib.rs held before.
    cp "$src" "$CRATE_DIR/src/lib.rs" 2>"$errfile" || return
    cargo check --offline --manifest-path "$CRATE_DIR/Cargo.toml" --quiet 2>"$errfile"
}

# show_errors ERRFILE SRC_PATH REL_DOC START_LINE
#   Pretty-prints the compiler output stored in ERRFILE to stderr.  Every
#   occurrence of the generated crate source path ($CRATE_DIR/src/lib.rs)
#   and of the internal crate name (synta_doc_check) is replaced by the
#   reference "<REL_DOC:START_LINE>", and each line is indented by nine
#   spaces.
#   SRC_PATH is accepted for call compatibility but is not used.
#
#   Both replacements are literal string substitutions.  The values are not
#   fed through a sed program, so characters such as '&', '|', '.' or '\'
#   in the document path or the temp directory cannot alter the result.
show_errors() {
    local errfile="$1" rel_doc="$3" start_line="$4"
    local ref="<${rel_doc}:${start_line}>"
    local lib_path="$CRATE_DIR/src/lib.rs"
    local line
    # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        # Quoting the pattern makes it literal; quoting the replacement
        # keeps '&' literal on bash versions with patsub_replacement.
        line=${line//"$lib_path"/"$ref"}
        line=${line//synta_doc_check/"$ref"}
        printf '         %s\n' "$line"
    done < "$errfile" >&2
}

# ── Load the manifest ────────────────────────────────────────────────────────
# Manifest columns (tab-separated, written by doc-rust-samples.py):
#   doc_file  start_line  lang  src_file  kind  raw_file  combined_file
#
# Only compilable blocks are listed; doc-rust-samples.py leaves skipped
# blocks out so that no row has an empty field in the middle (bash would
# collapse the adjacent tabs and shift the following columns).
# combined_file, the last column, is empty when no combined version exists.
#
# The rows are stored in parallel arrays indexed by snippet number, because
# later steps need random access by index to map compiler error locations
# back to snippets.  raw_file is read only to keep the columns aligned; it
# is not stored.
declare -a M_DOC_FILE=() M_START_LINE=() M_LANG=()
declare -a M_SRC_FILE=() M_KIND=() M_COMBINED_FILE=()

manifest_row=0
while IFS=$'\t' read -r col_doc col_line col_lang col_src col_kind col_raw col_combined; do
    M_DOC_FILE[manifest_row]="$col_doc"
    M_START_LINE[manifest_row]="$col_line"
    M_LANG[manifest_row]="$col_lang"
    M_SRC_FILE[manifest_row]="$col_src"
    M_KIND[manifest_row]="$col_kind"
    M_COMBINED_FILE[manifest_row]="${col_combined:-}"
    manifest_row=$((manifest_row + 1))
done < "$MANIFEST"

N_SNIPPETS=$manifest_row

# ── Batched type-check with iterative elimination ────────────────────────────
# Running one cargo check per snippet is slow, so all pending snippets are
# wrapped in one module each and type-checked together in a single lib.rs.
#
# Inner attributes (#![allow(...)]) are valid at the start of a mod block
# and apply to that module.  Each snippet's fn _synta_doc_sample is scoped
# to its own module, so identical item names across snippets do not clash.
#
# A failed batch says nothing reliable about the snippets it does not
# mention: the compiler may stop before later phases once an earlier one
# reported errors (e.g. a syntax error in one snippet can hide type errors
# in others), and some failures carry no lib.rs location at all.  So:
#   1. snippets named by the batch errors are re-checked on their own
#      (first as-is, then with their combined-context variant) and are
#      removed from the batch;
#   2. the remaining snippets are batched and checked again;
#   3. when a failed batch names no snippet, every remaining snippet is
#      checked individually.
# A snippet therefore only counts as passed if a cargo check that actually
# contained it succeeded.
#
# Results are reported in manifest order after all checks have finished.
# Outputs for the summary: PASS, FAIL, FAIL_MSGS.

PASS=0; FAIL=0
declare -a FAIL_MSGS=()

# Per-snippet outcome, indexed like the M_* arrays.
declare -a R_STATUS=()     # "ok", "ok_ctx" (passed with combined context) or "fail"
declare -a R_ERR_FILE=()   # compiler output to show for a failed snippet
declare -a R_SRC_FILE=()   # source file that produced R_ERR_FILE

# Bookkeeping for the batch currently in lib.rs (rebuilt by write_batch).
declare -a BATCH_IDX=()        # snippet index of each module, in file order
declare -a MOD_START_LINE=()   # 1-based line of each "mod snippet_N {"
declare -a FLAGGED=()          # sparse: FLAGGED[i]=1 if the batch errors name snippet i

# write_batch IDX...
#   Writes the given snippets to BATCHED_LIB, one "mod snippet_NNNNN { … }"
#   per snippet, and records in BATCH_IDX / MOD_START_LINE which snippet
#   starts on which line.
write_batch() {
    local idx src src_lines line_no=1
    BATCH_IDX=()
    MOD_START_LINE=()
    for idx in "$@"; do
        src="${M_SRC_FILE[$idx]}"
        BATCH_IDX+=("$idx")
        MOD_START_LINE+=("$line_no")
        printf 'mod snippet_%05d {\n' "$idx"
        cat "$src"
        src_lines=$(wc -l < "$src")
        # If the snippet does not end with a newline, add one (and count
        # it) so the closing brace cannot end up on the snippet's last
        # line, where a trailing // comment would swallow it.
        if [[ -s "$src" && -n "$(tail -c 1 "$src")" ]]; then
            printf '\n'
            src_lines=$((src_lines + 1))
        fi
        printf '}\n\n'
        # 1 line for "mod … {", the snippet itself, then "}" and a blank line.
        line_no=$((line_no + 1 + src_lines + 2))
    done > "$BATCHED_LIB"
}

# flag_batch_errors
#   Scans BATCH_ERR for " --> …lib.rs:LINE:COL" locations and sets
#   FLAGGED[i]=1 for the snippet whose module block contains LINE.  Module
#   blocks are contiguous and ordered, so the owner is the last module that
#   starts at or before LINE.
flag_batch_errors() {
    local errline err_line pos hit
    while IFS= read -r errline; do
        if [[ "$errline" =~ --\>[[:space:]].*lib\.rs:([0-9]+): ]]; then
            err_line="${BASH_REMATCH[1]}"
            hit=-1
            for pos in "${!MOD_START_LINE[@]}"; do
                if [[ "${MOD_START_LINE[$pos]}" -le "$err_line" ]]; then
                    hit=$pos
                else
                    break
                fi
            done
            if [[ $hit -ge 0 ]]; then
                FLAGGED[${BATCH_IDX[$hit]}]=1
            fi
        fi
    done < "$BATCH_ERR"
    return 0
}

# check_one IDX
#   Checks snippet IDX on its own.  If that fails and a combined-context
#   file exists (all preceding toplevel blocks of the same document), the
#   combined file is tried as well.  The outcome is stored in R_STATUS; for
#   failures R_ERR_FILE / R_SRC_FILE point at the last attempt.
check_one() {
    local idx="$1"
    local src_file="${M_SRC_FILE[$idx]}"
    local err_file="${src_file%.rs}.err"
    local combined_file="${M_COMBINED_FILE[$idx]}"
    local comb_err

    if check_src "$src_file" "$err_file" 2>/dev/null; then
        R_STATUS[$idx]=ok
        return 0
    fi

    if [[ -n "$combined_file" && -f "$combined_file" ]]; then
        comb_err="${combined_file%.rs}.err"
        if check_src "$combined_file" "$comb_err" 2>/dev/null; then
            R_STATUS[$idx]=ok_ctx
            return 0
        fi
        err_file="$comb_err"
        src_file="$combined_file"
    fi

    R_STATUS[$idx]=fail
    R_ERR_FILE[$idx]="$err_file"
    R_SRC_FILE[$idx]="$src_file"
    return 0
}

# check_all_snippets
#   Runs the batch / eliminate / re-batch loop described above until every
#   snippet listed in the M_* arrays has an entry in R_STATUS.  Each round
#   either ends the loop or removes at least one snippet from the batch, so
#   the loop terminates.
check_all_snippets() {
    local -a remaining=() next=()
    local i

    BATCHED_LIB="$CRATE_DIR/src/lib.rs"
    BATCH_ERR="$WORK_DIR/batch.err"
    R_STATUS=(); R_ERR_FILE=(); R_SRC_FILE=()

    for ((i = 0; i < ${#M_DOC_FILE[@]}; i++)); do
        remaining+=("$i")
    done

    while [[ ${#remaining[@]} -gt 0 ]]; do
        write_batch "${remaining[@]}"
        if cargo check --offline --manifest-path "$CRATE_DIR/Cargo.toml" --quiet 2>"$BATCH_ERR"; then
            for i in "${remaining[@]}"; do
                R_STATUS[$i]=ok
            done
            return 0
        fi

        FLAGGED=()
        flag_batch_errors

        next=()
        for i in "${remaining[@]}"; do
            # No attributable location at all → trust nothing in this batch
            # and check every remaining snippet individually.
            if [[ ${#FLAGGED[@]} -eq 0 || -n "${FLAGGED[$i]:-}" ]]; then
                check_one "$i"
            else
                next+=("$i")
            fi
        done

        remaining=()
        if [[ ${#next[@]} -gt 0 ]]; then
            remaining=("${next[@]}")
        fi
    done
    return 0
}

# report_results
#   Walks the snippets in manifest order, updates PASS / FAIL / FAIL_MSGS and
#   prints one OK line per passed snippet (only with --verbose) or a [FAIL]
#   block followed by the compiler errors for each failed one.
report_results() {
    local i rel_doc label
    PASS=0; FAIL=0; FAIL_MSGS=()
    for ((i = 0; i < ${#M_DOC_FILE[@]}; i++)); do
        rel_doc="${M_DOC_FILE[$i]#"$REPO_ROOT"/}"
        label="${rel_doc}:${M_START_LINE[$i]}"
        case "${R_STATUS[$i]:-fail}" in
            ok)
                PASS=$((PASS + 1))
                if [[ $VERBOSE -eq 1 ]]; then
                    printf "  ${GREEN}OK${NC}    %-60s [%s]\n" "$label" "${M_KIND[$i]}"
                fi
                ;;
            ok_ctx)
                PASS=$((PASS + 1))
                if [[ $VERBOSE -eq 1 ]]; then
                    printf "  ${GREEN}OK${NC}    %-60s [%s, with context]\n" "$label" "${M_KIND[$i]}"
                fi
                ;;
            *)
                FAIL=$((FAIL + 1))
                FAIL_MSGS+=("$label")
                printf "\n${RED}[FAIL]${NC} %s\n"    "$label"
                printf   "       Language : %s\n"    "${M_LANG[$i]}"
                printf   "       Kind     : %s\n"    "${M_KIND[$i]}"
                printf   "       Errors:\n"
                show_errors "${R_ERR_FILE[$i]:-}" "${R_SRC_FILE[$i]:-}" "$rel_doc" "${M_START_LINE[$i]}"
                ;;
        esac
    done
    return 0
}

check_all_snippets
report_results

# ── Summary ──────────────────────────────────────────────────────────────────
# Prints the pass / fail / skip counters ($SKIP comes from the Python
# extractor's output) and ends the script: exit 0 when nothing failed,
# otherwise the failed sample labels are listed and the script exits 1.
printf '\n'
printf -- '──────────────────────────────────────────────────────────\n'
printf 'Results: '
printf "${GREEN}%d passed${NC}, " "$PASS"
printf "${RED}%d failed${NC}, "   "$FAIL"
printf "${YELLOW}%d skipped${NC}\n" "$SKIP"

if [[ $FAIL -le 0 ]]; then
    printf "\n${GREEN}${BOLD}All samples compiled successfully.${NC}\n\n"
    exit 0
fi

printf '\nFailed samples:\n'
# printf re-applies its format once per argument, i.e. one line per label.
printf "  ${RED}%s${NC}\n" "${FAIL_MSGS[@]}"
printf '\n'
exit 1
