# anno development tasks
# Run `just` to see available commands
default:
@just --list
# === Quick Commands ===
# Run fast checks (fmt + clippy + quick tests) - matches pre-push hook
check:
    #!/usr/bin/env bash
    # Strict mode: fail on errors, unset variables, and pipeline failures.
    # Matches the other shebang recipes in this justfile (e.g. check-feature-matrix).
    set -euo pipefail
    just docs-audit
    cargo fmt --manifest-path Cargo.toml -p anno -- --check
    cargo clippy --manifest-path Cargo.toml --workspace --all-targets --features "eval discourse" -- -D warnings
    # Prefer nextest for faster, better-reported runs; fall back to cargo test.
    if command -v cargo-nextest >/dev/null 2>&1; then
        cargo nextest run --manifest-path Cargo.toml --profile quick --workspace --features "eval discourse"
    else
        cargo test --manifest-path Cargo.toml --workspace --lib --features "eval discourse"
    fi
# Run fast checks without features (minimal, for quick iteration)
check-minimal:
cargo fmt --manifest-path Cargo.toml -p anno -- --check
cargo clippy --manifest-path Cargo.toml -p anno --all-targets --no-default-features
cargo test --manifest-path Cargo.toml -p anno --no-default-features --lib
# Check compile coverage for feature-gated code paths.
#
# This keeps optional features from silently drifting into non-compiling states.
check-feature-matrix:
#!/usr/bin/env bash
set -euo pipefail
features=(
"semantic-chunking"
"subsume"
"jiff-time"
"llm"
"production"
"bundled-crf-weights"
"bundled-hmm-params"
"burn"
)
for feature in "${features[@]}"; do
echo "==> cargo check -p anno-lib --no-default-features --features ${feature}"
cargo check -p anno-lib --no-default-features --features "${feature}"
done
# Format all code
fmt:
cargo fmt --manifest-path Cargo.toml -p anno
# Check formatting without modifying
fmt-check:
cargo fmt --manifest-path Cargo.toml -p anno -- --check
# Run all unit tests (prefers nextest)
test:
    #!/usr/bin/env bash
    # Strict mode for consistency with the other shebang recipes in this file.
    set -euo pipefail
    if command -v cargo-nextest >/dev/null 2>&1; then
        cargo nextest run --profile quick --lib --features "eval discourse"
    else
        cargo test --lib --features "eval discourse"
    fi
# Run all tests including integration (prefers nextest)
test-all:
    #!/usr/bin/env bash
    # Strict mode for consistency with the other shebang recipes in this file.
    set -euo pipefail
    if command -v cargo-nextest >/dev/null 2>&1; then
        cargo nextest run --profile quick --workspace --features "eval discourse"
    else
        cargo test --features "eval discourse"
    fi
# Quick single-test run with filter (e.g., just t test_name)
# Uses nextest with minimal features to avoid full workspace scan
t FILTER:
#!/usr/bin/env bash
if command -v cargo-nextest >/dev/null 2>&1; then
cargo nextest run --profile quick -p anno -E 'test(/{{FILTER}}/)' --no-default-features
else
cargo test -p anno --no-default-features -- '{{FILTER}}'
fi
# Quick single-test run with full features
tf FILTER:
#!/usr/bin/env bash
if command -v cargo-nextest >/dev/null 2>&1; then
cargo nextest run --profile quick -p anno -E 'test(/{{FILTER}}/)' --features "eval discourse"
else
cargo test -p anno --features "eval discourse" -- '{{FILTER}}'
fi
# === Test Profiling (Nextest + Rust Tooling) ===
# Profile tests with nextest timing (recommended)
profile-tests PROFILE="quick" FILTER="":
@./scripts/profile-tests.sh {{PROFILE}} {{FILTER}}
# Profile with Rust native tools (debug symbols, perf/sample)
profile-tests-rust PROFILE="quick":
@./scripts/profile-tests-rust.sh {{PROFILE}}
# Profile quick tests only
profile-quick:
@just profile-tests quick
# Profile CI tests
profile-ci:
@just profile-tests ci
# Profile ML tests (slow, model loading)
profile-ml:
@just profile-tests ml
# Profile specific test filter
profile-filter FILTER:
@just profile-tests quick "{{FILTER}}"
# Quick timing report (no full run, just analyze existing)
profile-timing:
@NEXTEST_EXPERIMENTAL_LIBTEST_JSON=1 cargo nextest run --profile quick --workspace --features "eval discourse" --message-format libtest-json-plus --status-level all
# Show slowest tests from last profile run
profile-slowest:
    #!/usr/bin/env bash
    set -euo pipefail
    # Bail out early when no timing artifacts exist yet; the glob error from
    # `ls` is suppressed and `|| true` keeps `set -e` from aborting here.
    if [ -z "$(ls -A target/test-profiles/timing_*.json 2>/dev/null || true)" ]; then
        echo "No timing files found. Run 'just profile-tests' first."
        exit 1
    fi
    # Newest timing file wins: `ls -t` sorts by mtime (newest first),
    # `sed -n '1p'` keeps only the first entry.
    LATEST="$(ls -t target/test-profiles/timing_*.json | sed -n '1p')"
    echo "Analyzing: $LATEST"
    if command -v jq >/dev/null 2>&1; then
        echo ""
        echo "=== Slowest Tests ==="
        # Keep tests slower than 100ms; truncate the duration string to 6 chars,
        # then sort numerically descending on the leading duration field.
        jq -r '.test_executions[] | select(.duration_secs > 0.1) | "\(.duration_secs | tostring | .[0:6])s \(.test_name)"' \
            "$LATEST" | sort -rn
    else
        echo "Install jq for analysis: brew install jq"
    fi
# Analyze test profile with detailed breakdown
profile-analyze FILE="":
@if [ -n "{{FILE}}" ]; then \
uv run -- python scripts/analyze_test_profile.py "{{FILE}}"; \
else \
uv run -- python scripts/analyze_test_profile.py; \
fi
# Install profiling tools
profile-install:
@echo "Installing profiling tools..."
@echo ""
@echo "1. cargo-flamegraph (cross-platform flamegraphs):"
@echo " cargo install flamegraph"
@echo ""
@echo "2. cargo-instruments (macOS only, requires Xcode):"
@echo " cargo install cargo-instruments"
@echo ""
@echo "3. hyperfine (benchmarking):"
@echo " brew install hyperfine"
@echo ""
@echo "4. jq (JSON analysis):"
@echo " brew install jq"
# === CI Simulation ===
# Simulate full CI pipeline locally (fast checks only)
ci: fmt
just docs-audit
cargo fmt --all -- --check
cargo check --workspace --all-targets
cargo clippy --workspace --all-targets --features "eval discourse" -- -D warnings
cargo test --package anno --no-default-features --lib
cargo test --package anno --lib
cargo build --workspace --features "eval discourse"
cargo test --workspace --lib --features "eval discourse"
cargo test --package anno --tests --features "eval discourse"
cargo build --workspace --no-default-features
cargo test --workspace --no-default-features --lib
RUSTDOCFLAGS='-D warnings' cargo doc -p anno -p anno-core -p anno-eval --no-deps --features "eval discourse"
@echo "CI simulation passed"
# Simulate CI with sanity evals (includes small random sample evals)
ci-eval: ci
just eval-sanity
# === Evaluation ===
# Run randomized matrix test (backends x datasets x tasks)
# Strategies: random, ml-only, worst-first, ml-all
# Example: just matrix worst-first 42
matrix strategy="random" seed="" perspective="ner":
    #!/usr/bin/env bash
    # Strict mode: the original only relied on the last command's exit status.
    set -euo pipefail
    echo "Running randomized matrix test (strategy: {{strategy}})..."
    # Cache + history locations match the CI actions/cache paths (see ci-matrix-local).
    export ANNO_CACHE_DIR="${HOME}/.anno_cache"
    export ANNO_EVAL_HISTORY="${HOME}/.anno_cache/eval-results.jsonl"
    export ANNO_SAMPLE_STRATEGY={{strategy}}
    export ANNO_MATRIX_PERSPECTIVE={{perspective}}
    # Only pin the RNG seed when one was passed in.
    if [ -n "{{seed}}" ]; then export ANNO_CI_SEED={{seed}}; fi
    cargo test -p anno-eval --lib --features "eval" test_randomized_matrix_sample -- --nocapture
# Run matrix test with ML backends (requires onnx/candle features)
matrix-ml:
@echo "Running ML-focused matrix test..."
@ANNO_SAMPLE_STRATEGY=ml-only ANNO_ML_IN_MATRIX=1 cargo test -p anno-eval --lib --features "eval onnx" test_randomized_matrix_sample -- --nocapture
# Show backend availability matrix
matrix-backends:
@echo "Deprecated: legacy backend availability matrix was removed with archive cleanup."
# Run evaluation on synthetic data (fast, no downloads)
eval-quick:
# Fast, bounded benchmark run that writes an artifact under ./reports/.
# Note: this may still download datasets/models unless caches are already warm.
mkdir -p reports
ANNO_CACHE_DIR="${HOME}/.anno_cache" \
ANNO_EVAL_HISTORY="${HOME}/.anno_cache/eval-results.jsonl" \
cargo run --release -p anno-cli --bin anno --features "eval onnx" -- benchmark \
--tasks ner \
--datasets CoNLL2003Sample,WikiGold,Wnut17,WikiANN,MasakhaNER \
--backends heuristic,stacked,bert_onnx,gliner_onnx \
--max-examples 20 \
--output reports/eval-quick-report.md
# Wider local eval (still bounded, but broader than eval-quick)
eval-wide MAX_EXAMPLES="50":
mkdir -p reports
ANNO_CACHE_DIR="${HOME}/.anno_cache" \
ANNO_EVAL_HISTORY="${HOME}/.anno_cache/eval-results.jsonl" \
cargo run --release -p anno-cli --bin anno --features "eval onnx" -- benchmark \
--tasks ner \
--datasets CoNLL2003Sample,WikiGold,Wnut17,WikiANN,MasakhaNER,MultiCoNERv2,MultiNERD \
--backends heuristic,stacked,bert_onnx,gliner_onnx,nuner \
--max-examples {{MAX_EXAMPLES}} \
--output reports/eval-wide-report.md
# Run sanity check evaluations (small random samples)
# Used in CI on push
eval-sanity:
./scripts/eval-sanity.sh
# Regenerate dataset registry exports and derived tooling files.
regenerate-datasets:
#!/usr/bin/env bash
set -euo pipefail
cargo test -p anno-eval generate_datasets_json -- --ignored
cargo test -p anno-eval generate_datasets_jsonl -- --ignored
cargo test -p anno-eval generate_datasets_markdown -- --ignored
python3 scripts/generate_download_configs.py \
--input generated/datasets_generated.json \
--output generated/download_configs_generated.json \
--stats
python3 scripts/generate_catalog_html.py
# Curated benchmark profiles (official; avoids parsing markdown and avoids “sweep everything”).
# Examples:
# - just eval-profile ner-standard 20
# - just eval-profile ner-zeroshot-multilingual 20
eval-profile PROFILE MAX_EXAMPLES="20":
mkdir -p reports
cargo run --release -p anno-cli --bin anno --features "eval onnx" -- benchmark \
--profile {{PROFILE}} \
--max-examples {{MAX_EXAMPLES}} \
--output reports/eval-{{PROFILE}}.md \
--output-json reports/eval-{{PROFILE}}.json
# Run full evaluations (all task-dataset-backend combinations)
# Heavy operation - only run on eval-* branches or manual trigger
eval-full:
./scripts/eval-full.sh
# Run full evaluations with example limit
eval-full-limit MAX_EXAMPLES:
MAX_EXAMPLES={{MAX_EXAMPLES}} ./scripts/eval-full.sh
# Run evaluation with specific seed
eval-seed SEED MAX_EXAMPLES="20":
cargo run --release -p anno-cli --bin anno --features "eval onnx" -- benchmark \
--max-examples {{MAX_EXAMPLES}} \
--seed {{SEED}} \
--cached-only \
--output eval-seed-{{SEED}}.md
# Run abstract anaphora evaluation
eval-anaphora:
cargo run --example abstract_anaphora_eval --features discourse
# Run comprehensive local evaluation (resumable)
# Example: just eval-comprehensive 50
eval-comprehensive MAX_EXAMPLES="50":
uv run scripts/eval_comprehensive.py --max-examples {{MAX_EXAMPLES}}
# Resume comprehensive evaluation from where it left off
eval-resume MAX_EXAMPLES="50":
uv run scripts/eval_comprehensive.py --resume --max-examples {{MAX_EXAMPLES}}
# View current evaluation results
eval-results:
@cat reports/RESULTS.md 2>/dev/null || echo "No results yet. Run 'just eval-comprehensive' first."
# === Backend Tests ===
# Test ONNX backend (build only, no models)
test-onnx:
cargo build --features onnx
cargo test --lib --features onnx
# Test Candle backend (build only, no models)
test-candle:
cargo build --features candle
cargo test --lib --features candle
# Test with model downloads (slow, requires network)
test-models:
cargo test --features onnx -- --ignored --nocapture
# === Documentation ===
# Build docs
docs:
cargo doc -p anno -p anno-core -p anno-eval --no-deps --features "eval discourse"
# Open docs in browser
docs-open:
cargo doc -p anno -p anno-core -p anno-eval --no-deps --features "eval discourse" --open
# Check internal docs markdown links (fast, no network).
docs-links:
@if command -v uv > /dev/null; then \
uv run -- python scripts/check_docs_links.py; \
else \
python3 scripts/check_docs_links.py; \
fi
# Docs hygiene (links + stale path checks). Fast, offline.
docs-audit:
@if command -v uv > /dev/null; then \
uv run -- python scripts/docs_audit.py; \
else \
python3 scripts/docs_audit.py; \
fi
# Preview README in browser with GitHub-style rendering (auto-reloads)
# Auto-finds free port starting from 8000
#
# NOTE: just passes `$` to the shell verbatim (unlike make), so shell
# substitutions use a single `$`. The previous `$$(...)` / `$$PORT` expanded
# to the shell's PID and produced a garbage URL.
# NOTE(review): `open` is macOS-specific — use xdg-open on Linux.
readme-preview:
    @uv run scripts/serve_readme.py > /tmp/serve_readme.log 2>&1 & \
    sleep 3 && \
    PORT=$(cat /tmp/serve_readme_port.txt 2>/dev/null || echo "8000") && \
    open http://localhost:$PORT/README_github_style.html && \
    echo "ok: Preview at http://localhost:$PORT/README_github_style.html (auto-reloads)"
# Run e2e test with Playwright + Gemini VLM
readme-test:
@uv run scripts/e2e_readme_test.py
# Type-check Python scripts with ty (optional).
# Notes:
# - Uses `uvx` so you don't have to install ty into your repo venv.
# - Runs on `scripts/` only (this repo is not a Python package).
typecheck-python:
@which uvx > /dev/null || (echo "Install uv (provides uvx): https://docs.astral.sh/uv/" && exit 1)
uvx ty check scripts
# === Benchmarks ===
# Run NER benchmark (no execution, just compile)
bench-check:
cargo bench -p anno --no-run
# Run benchmarks
bench:
cargo bench -p anno
# === Utilities ===
# Download evaluation datasets
download-datasets:
@echo "Deprecated: legacy real_datasets test was removed/split. Use the CLI to warm caches instead."
@echo "Example: just eval-quick (writes reports/ and will download datasets/models unless cached)."
# Clean build artifacts
clean:
cargo clean
# Check MSRV 1.85 (matches workspace and CI). Requires locked Cargo.lock (ort pinned to rc.10).
msrv:
cargo +1.85.0 check --workspace --locked
# Run property tests with more cases
proptest:
PROPTEST_CASES=1000 cargo test --lib --features "eval" -- proptest
# Warm local dataset cache (and optionally S3 mirror).
# Example:
# ANNO_WARM_PER_TASK=2 ANNO_WARM_SEED=42 cargo run --example cache_warm --features "eval"
cache-warm:
cargo run -p anno --example cache_warm --features "eval"
# === Release ===
# Build release binary
build-release:
cargo build --release -p anno-cli --bin anno --features "eval discourse onnx"
# Run clippy with stricter lints
clippy-strict:
cargo clippy --all-targets -- -W clippy::pedantic -W clippy::nursery
# === Code Quality ===
# Count lines of code
loc:
@tokei src/ tests/ examples/ benches/ --compact
# Check for TODO/FIXME comments
todos:
@rg -i "(TODO|FIXME|HACK|XXX)" --type rust -c | sort -t: -k2 -rn
# Show test coverage summary
test-count:
@echo "Tests:" && rg "^#\[test\]" --type rust -c | awk -F: '{sum += $2} END {print sum}'
# === Quick Examples ===
# Run quickstart example (no deps)
example-minimal:
cargo run -p anno --example minimal
# Run deterministic offline muxer decision-loop example
example-muxer:
cargo run -p anno-eval --example muxer_decision_loop --features eval
# === Mutation Testing ===
# Run mutation tests on entity.rs (fast, targeted)
mutants-fast:
cargo mutants --file "src/entity.rs" --timeout 120 --minimum-test-timeout 60 --features "eval"
# Run mutation tests on specific file
mutants-file FILE:
cargo mutants --file "{{FILE}}" --timeout 30 --minimum-test-timeout 20
# Run mutation tests on all source files (slow, comprehensive)
mutants-all:
cargo mutants --timeout 60 --minimum-test-timeout 30
# List mutants without running tests (quick check)
mutants-list:
cargo mutants --list
# === Static Analysis Tools ===
# Run cargo-deny (dependency linting)
# Rule validation and reporting
validate-rules:
@echo "Validating OpenGrep rules against known patterns..."
@./scripts/validate-rules.sh
unified-report:
@echo "Generating unified static analysis report..."
@./scripts/generate-unified-report.sh
@echo "Report generated: unified-static-analysis-report.md"
failure-summary:
@echo "Summarizing static analysis failures..."
@./scripts/summarize-failures.sh
@echo "Summary generated: static-analysis-failures-summary.md"
# Static analysis tools
deny:
@which cargo-deny > /dev/null || (echo "Install: cargo install --locked cargo-deny" && exit 1)
cargo deny check
# Run cargo-machete (fast unused dependencies)
machete:
@which cargo-machete > /dev/null || (echo "Install: cargo install cargo-machete" && exit 1)
./scripts/static-analysis-common.sh machete
# Run cargo-geiger (unsafe code statistics)
geiger:
@which cargo-geiger > /dev/null || (echo "Install: cargo install cargo-geiger" && exit 1)
cargo geiger
# Run ast-grep checks (optional, local)
ast-grep-unicode:
@which ast-grep > /dev/null || (echo "Install: https://ast-grep.github.io/ (or: brew install ast-grep)" && exit 1)
ast-grep scan --rule .opengrep/rules/rust-unicode-offsets.yaml --report-style short crates/anno/
ast-grep-unicode-all:
@which ast-grep > /dev/null || (echo "Install: https://ast-grep.github.io/ (or: brew install ast-grep)" && exit 1)
ast-grep scan --rule .opengrep/rules/rust-unicode-offsets.yaml --report-style short crates/anno/ tests/ examples/
ast-grep-metal:
@which ast-grep > /dev/null || (echo "Install: https://ast-grep.github.io/ (or: brew install ast-grep)" && exit 1)
ast-grep scan --rule .opengrep/rules/rust-candle-metal.yaml --report-style short crates/anno/
ast-grep-metal-all:
@which ast-grep > /dev/null || (echo "Install: https://ast-grep.github.io/ (or: brew install ast-grep)" && exit 1)
ast-grep scan --rule .opengrep/rules/rust-candle-metal.yaml --report-style short crates/anno/ tests/ examples/
# Generate unsafe code safety report (creative use of cargo-geiger)
#
# jq reads the report file directly (the former `cat file | jq` pipe was a
# useless use of cat). If jq is missing or the report has no unsafe entries,
# the fallback message is printed instead.
safety-report:
    @which cargo-geiger > /dev/null || (echo "Install: cargo install cargo-geiger" && exit 1)
    @echo "Generating safety report..."
    @cargo geiger --output-format json > .safety-report.json 2>/dev/null || true
    @echo "Unsafe code statistics:"
    @jq -r '.packages[] | select(.geiger.unsafe_used > 0) | "\(.name): \(.geiger.unsafe_used) unsafe uses"' .safety-report.json 2>/dev/null || echo "No unsafe code found or jq not installed"
    @echo ""
    @echo "Full report saved to .safety-report.json"
# Run OpenGrep static analysis
#
# NOTE: just passes `$` to the shell verbatim, so command substitution is
# `$(...)` — the previous `$$(jq ...)` expanded to the shell PID and broke
# the issue count.
opengrep:
    @which opengrep > /dev/null || (echo "Install: curl -fsSL https://raw.githubusercontent.com/opengrep/opengrep/main/install.sh | bash" && exit 1)
    opengrep scan --config auto --json --output opengrep-results.json crates/anno/ tests/ examples/
    @echo "Results saved to opengrep-results.json"
    @if command -v jq > /dev/null; then \
        echo "Found $(jq -r '.results | length' opengrep-results.json) issues"; \
    else \
        echo "Install jq to summarize: opengrep-results.json"; \
    fi
# Run OpenGrep with custom rules
#
# Each rule file scans the crate it targets and writes its own JSON report.
# NOTE: command substitutions use single `$` — just does not require make's
# `$$` escaping (the old `$$(jq ...)` expanded to the shell PID).
opengrep-custom:
    @which opengrep > /dev/null || (echo "Install: curl -fsSL https://raw.githubusercontent.com/opengrep/opengrep/main/install.sh | bash" && exit 1)
    opengrep scan -f .opengrep/rules/rust-security.yaml --json --output opengrep-security-results.json crates/anno/
    opengrep scan -f .opengrep/rules/rust-nlp-ml-patterns.yaml --json --output opengrep-nlp-results.json crates/anno/
    opengrep scan -f .opengrep/rules/rust-evaluation-framework.yaml --json --output opengrep-eval-results.json crates/anno-eval/
    opengrep scan -f .opengrep/rules/rust-anno-specific.yaml --json --output opengrep-anno-results.json crates/anno/
    opengrep scan -f .opengrep/rules/rust-error-handling.yaml --json --output opengrep-error-results.json crates/anno/
    opengrep scan -f .opengrep/rules/rust-memory-patterns.yaml --json --output opengrep-memory-results.json crates/anno/
    @echo "Custom rules results saved to opengrep-*-results.json"
    @if command -v jq > /dev/null; then \
        echo "Counts:"; \
        echo " security: $(jq -r '.results | length' opengrep-security-results.json)"; \
        echo " nlp: $(jq -r '.results | length' opengrep-nlp-results.json)"; \
        echo " eval: $(jq -r '.results | length' opengrep-eval-results.json)"; \
        echo " anno: $(jq -r '.results | length' opengrep-anno-results.json)"; \
        echo " error: $(jq -r '.results | length' opengrep-error-results.json)"; \
        echo " memory: $(jq -r '.results | length' opengrep-memory-results.json)"; \
    fi
# Run Miri on unsafe code files (selective)
miri-unsafe:
@rustup component list | rg -q "miri.*installed" || (echo "Install: rustup component add miri" && exit 1)
@echo "Running Miri on unsafe code files..."
@cargo miri test --lib --features onnx -- --test-threads=1 --nocapture || true
@echo "Miri check complete (see output above)"
# Run all static analysis tools (comprehensive check)
static-analysis:
@echo "=== Running Static Analysis Tools ==="
@echo ""
@echo "1. cargo-deny (dependency linting)..."
@just deny || echo "warning: cargo-deny failed or not installed"
@echo ""
@echo "2. cargo-machete (unused dependencies)..."
@just machete || echo "warning: cargo-machete failed or not installed"
@echo ""
@echo "3. cargo-geiger (unsafe code stats)..."
@just geiger || echo "warning: cargo-geiger failed or not installed"
@echo ""
@echo "4. OpenGrep (security patterns)..."
@just opengrep || echo "warning: OpenGrep failed or not installed"
@echo ""
@echo "=== Static Analysis Complete ==="
# Run tests with cargo-nextest (better output)
test-nextest:
@which cargo-nextest > /dev/null || (echo "Install: cargo install cargo-nextest" && exit 1)
cargo nextest run --all-features
# Generate code coverage report
coverage:
@which cargo-llvm-cov > /dev/null || (echo "Install: cargo install cargo-llvm-cov" && exit 1)
cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
@echo "Coverage report generated: lcov.info"
@echo "View with: genhtml lcov.info -o coverage-html && open coverage-html/index.html"
# Generate comprehensive safety report (creative: combines multiple tools)
safety-report-full:
@./scripts/generate-safety-report.sh
@echo "Full safety report: safety-report.md"
# Benchmark static analysis tools (creative: performance comparison)
benchmark-tools:
@./scripts/benchmark-static-analysis.sh
# Compare tool outputs (creative: identify overlapping findings)
compare-tools:
@./scripts/compare-tool-outputs.sh
# Track unsafe code trends over time (creative: time-series analysis)
track-unsafe-trends:
@./scripts/track-unsafe-code-trends.sh
# Validate static analysis setup
validate-setup:
@./scripts/validate-static-analysis-setup.sh
# === All-in-One Commands ===
# Run everything: static analysis + safety report + trends
analysis-full:
@echo "Running comprehensive static analysis..."
@just static-analysis
@echo ""
@echo "Generating safety report..."
@just safety-report-full
@echo ""
@echo "Tracking unsafe code trends..."
@just track-unsafe-trends
@echo ""
@echo "ok: Comprehensive analysis complete!"
@echo " - Reports: safety-report.md, tool-comparison.md"
@echo " - Trends: .unsafe-code-trends/"
# === Git Hook Checks ===
# Check for invalid filenames (spaces, duplicates)
#
# NOTE: just passes `$` to the shell verbatim; the previous `$$(find ...)` /
# `"$$first"` expanded to the shell PID, making the test string always
# non-empty and the error branch always fire.
check-filenames:
    @echo "Checking for files with spaces in names..."
    @first=$(find . -type f \( -name "*.rs" -o -name "*.toml" \) -name "* *" ! -path "./.git/*" ! -path "./target/*" ! -path "./.cargo/*" -print -quit 2>/dev/null) && \
    if [ -n "$first" ]; then \
        echo "error: Error: Found files with spaces in names (invalid for Rust):"; \
        find . -type f \( -name "*.rs" -o -name "*.toml" \) -name "* *" ! -path "./.git/*" ! -path "./target/*" ! -path "./.cargo/*"; \
        exit 1; \
    fi
    @echo "Checking for duplicate test files..."
    @dup=$(find . -type f -name "* 2.rs" ! -path "./.git/*" ! -path "./target/*" -print -quit 2>/dev/null) && \
    if [ -n "$dup" ]; then \
        echo "error: Error: Found duplicate test files (likely backups):"; \
        find . -type f -name "* 2.rs" ! -path "./.git/*" ! -path "./target/*"; \
        echo "Please remove these files before committing."; \
        exit 1; \
    fi
# Check compilation (fast, catches syntax/type errors)
check-compile:
@echo "Checking compilation..."
@cargo check --workspace --all-targets --message-format=short --quiet
# Check test compilation
check-tests-compile:
@echo "Checking test compilation..."
@cargo check --workspace --tests --message-format=short --quiet
# Check for potential secrets (warn only, non-blocking)
#
# NOTE: single `$` for shell substitutions — just does not use make's `$$`
# escaping (the old `$$(rg ...)` / `$$files` expanded to the shell PID).
check-secrets:
    @echo "Checking for potential secrets..."
    @if command -v rg &> /dev/null; then \
        files=$(rg -i --files-with-matches "api[_-]?key\\s*=|password\\s*=|secret\\s*=|token\\s*=|credential\\s*=" \
            --glob '!*.md' --glob '!*.txt' --glob '!target/**' \
            --glob '!.git/**' --glob '!*.lock' --glob '!justfile' \
            --glob '!*.sh' --glob '!docs/**' 2>/dev/null || true); \
        if [ -n "$files" ]; then \
            echo "warning: Warning: Potential secrets found. Review before committing."; \
            echo "warning: (Listing files only; not printing match contents.)"; \
            echo "$files"; \
        fi; \
    else \
        echo "warning: ripgrep not found, skipping secrets check"; \
    fi
# Check for large files (warn only, non-blocking)
#
# NOTE: single `$` for shell substitutions — the old `$$(find ...)` /
# `"$$first"` expanded to the shell PID, so the warning branch always fired.
check-large-files:
    @echo "Checking for large files..."
    @first=$(find . -type f -size +1M ! -path "./target/*" ! -path "./.git/*" ! -path "./.cargo/*" ! -path "./*.lock" ! -path "./assets/*" ! -path "./.mypy_cache/*" ! -path "./.pytest_cache/*" ! -path "./__pycache__/*" -print -quit 2>/dev/null) && \
    if [ -n "$first" ]; then \
        echo "warning: Warning: Large files detected (>1MB):"; \
        find . -type f -size +1M ! -path "./target/*" ! -path "./.git/*" ! -path "./.cargo/*" ! -path "./*.lock" ! -path "./assets/*" ! -path "./.mypy_cache/*" ! -path "./.pytest_cache/*" ! -path "./__pycache__/*" 2>/dev/null; \
        echo " (This is a warning, not blocking)"; \
    fi
# Run clippy with warnings only (non-blocking)
check-clippy-warn:
@echo "Running clippy (warnings only)..."
@cargo clippy --workspace --all-targets --quiet 2>&1 || echo "warning: Clippy found warnings (not blocking)"
# Full pre-commit checks (all blocking checks)
pre-commit-full:
@just check-filenames
@just check-compile
@just check-tests-compile
@just check-feature-matrix
@just fmt-check
@echo "ok: Pre-commit checks passed!"
# Pre-commit with warnings (blocking + non-blocking)
pre-commit-all: pre-commit-full
@just check-secrets
@just check-large-files
@just check-clippy-warn
# Validate commit message format
#
# Rewritten as a shebang recipe to fix two bugs:
# 1. `$$len` / `$$(printf ...)` expanded to the shell PID (just passes `$`
#    through verbatim; make-style `$$` escaping does not apply).
# 2. Each plain recipe line runs in its own shell, so the `exit 0` early
#    returns never skipped the later warning lines — the "doesn't follow
#    conventional format" warning printed for every commit. One bash script
#    makes the early exits actually short-circuit.
# NOTE(review): a COMMIT_MSG containing double quotes would still break the
# interpolation, same as before — confirm callers pre-sanitize.
validate-commit-msg COMMIT_MSG:
    #!/usr/bin/env bash
    set -euo pipefail
    # Blocking check: enforce a minimum message length.
    len=$(printf "%s" "{{COMMIT_MSG}}" | wc -c | tr -d ' ')
    if [ "$len" -lt 10 ]; then
        echo "error: Error: Commit message too short (minimum 10 characters)"
        echo " Current message: '{{COMMIT_MSG}}'"
        exit 1
    fi
    # Conventional-commit messages pass silently.
    if printf "%s" "{{COMMIT_MSG}}" | rg -q "^[a-z]+(\\(.+\\))?: .{10,}"; then
        exit 0
    fi
    # Merge/revert/release messages are exempt from the convention.
    if printf "%s" "{{COMMIT_MSG}}" | rg -q "^(Merge|Revert|Release|chore\\(release\\))"; then
        exit 0
    fi
    echo "warning: Warning: Commit message doesn't follow conventional format"
    echo " Recommended: type(scope): description"
    echo " Examples:"
    echo " - feat(api): add new endpoint"
    echo " - fix: resolve compilation error"
    echo " - docs: update README"
    echo " Current message: '{{COMMIT_MSG}}'"
    echo ""
    echo " (This is a warning, commit will proceed)"
# Quick validation before commit (legacy, kept for compatibility)
pre-commit-check:
@echo "Running pre-commit checks..."
@cargo fmt --all -- --check
@cargo clippy --workspace --all-targets --features "eval discourse" -- -D warnings
@just machete || echo "warning: cargo-machete not installed, skipping"
@echo "ok: Pre-commit checks passed"
# === Git Hook Setup ===
# Install git hooks (run once after clone)
setup-hooks:
@echo "Installing git hooks from scripts/hooks/..."
@cp scripts/hooks/pre-commit .git/hooks/pre-commit
@cp scripts/hooks/pre-push .git/hooks/pre-push
@cp scripts/hooks/commit-msg .git/hooks/commit-msg
@chmod +x .git/hooks/pre-commit
@chmod +x .git/hooks/pre-push
@chmod +x .git/hooks/commit-msg
@echo ""
@echo "Hooks installed:"
@echo " pre-commit fast checks (format, compile)"
@echo " pre-push full checks (clippy, tests)"
@echo " commit-msg message format hints"
@echo ""
@echo "To bypass: git commit --no-verify"
# Show hook status
hook-status:
@echo "Git hook status:"
@ls -la .git/hooks/pre-commit .git/hooks/pre-push .git/hooks/commit-msg 2>/dev/null || echo "No hooks installed"
# Run what pre-commit hook runs (for debugging)
run-pre-commit-hook:
@echo "Simulating pre-commit hook..."
@cargo fmt --all
@cargo check --workspace --all-targets --quiet
# Run what pre-push hook runs (for debugging)
run-pre-push-hook:
@echo "Simulating pre-push hook..."
@cargo fmt --all -- --check
@cargo clippy --workspace --all-targets --features "eval discourse" -- -D warnings
@cargo test --workspace --lib --features "eval discourse" --quiet
@cargo test --workspace --doc --features "eval discourse" --quiet || echo "warning: Doc test warnings"
# Generate HTML dashboard (creative: visual analysis results)
dashboard:
@./scripts/generate-analysis-dashboard.sh
@echo "Dashboard: static-analysis-dashboard.html"
# === NLP/ML-Specific Analysis ===
# Check NLP/ML-specific patterns
check-nlp-patterns:
@./scripts/check-nlp-patterns.sh
# Analyze evaluation framework patterns
analyze-eval-patterns:
@./scripts/analyze-evaluation-patterns.sh
@echo "Analysis: evaluation-pattern-analysis.md"
# Check ML backend patterns
check-ml-backends:
@./scripts/check-ml-backend-patterns.sh
# Check evaluation framework invariants
check-eval-invariants:
@./scripts/check-evaluation-invariants.sh
# Comprehensive NLP/ML analysis (combines all checks)
analysis-nlp-ml:
@echo "=== NLP/ML Pattern Analysis ==="
@just check-nlp-patterns || echo "warning: Some NLP pattern issues found"
@echo ""
@echo "=== Evaluation Framework Analysis ==="
@just analyze-eval-patterns
@echo ""
@echo "=== ML Backend Analysis ==="
@just check-ml-backends || echo "warning: Some ML backend issues found"
@echo ""
@echo "=== Evaluation Invariants ==="
@just check-eval-invariants || echo "warning: Some invariant issues found"
@echo ""
@echo "=== OpenGrep Custom Rules ==="
@just opengrep-custom || echo "warning: OpenGrep not installed"
@echo ""
@echo "ok: NLP/ML analysis complete"
# Generate repo-specific analysis report
repo-analysis:
@./scripts/generate-repo-specific-report.sh
@echo "Report: repo-specific-analysis.md"
# Integrate static analysis with evaluation framework
integrate-analysis-eval:
@./scripts/integrate-with-evaluation.sh
@echo "Integration guide: static-analysis-eval-integration.md"
# Check for historical bug patterns (regression prevention)
check-historical-bugs:
@./scripts/check-historical-bugs.sh
# === Publish Validation ===
# Validate publish readiness for all crates
validate-publish:
@./scripts/validate-publish.sh
# === AWS Spot Instance Evaluation ===
# One-time spot infrastructure setup (IAM, SQS, EBS, launch template)
spot-setup:
@chmod +x scripts/spot/setup.sh
@./scripts/spot/setup.sh
# Pre-download datasets to S3 (avoids HuggingFace API rate limits on spot instances)
# Run this before spot-eval to ensure datasets are available in S3
spot-prepare-datasets:
@uv run scripts/prepare_datasets_s3.py
# Run comprehensive evaluation on spot instances (full pipeline)
# Generates tasks, launches fleet, waits for completion, aggregates results
spot-eval:
@uv run scripts/spot/orchestrate.py full
# Run spot eval with custom parameters
# Example: just spot-eval-custom "gliner,nuner" "WikiGold,Wnut17" 2
spot-eval-custom BACKENDS DATASETS FLEET_SIZE="4":
@uv run scripts/spot/orchestrate.py full \
--backends "{{BACKENDS}}" \
--datasets "{{DATASETS}}" \
--fleet-size "{{FLEET_SIZE}}"
# Generate evaluation tasks and enqueue (without launching fleet)
spot-generate:
@uv run scripts/spot/orchestrate.py generate
# Generate tasks for specific backends/datasets
spot-generate-custom BACKENDS="" DATASETS="" SEEDS="42,123,456":
@uv run scripts/spot/orchestrate.py generate \
--backends "{{BACKENDS}}" \
--datasets "{{DATASETS}}" \
--seeds "{{SEEDS}}"
# Preview tasks without enqueueing
spot-generate-dry:
@uv run scripts/spot/orchestrate.py generate --dry-run
# Launch spot fleet (requires tasks in queue)
spot-launch FLEET_SIZE="4":
@uv run scripts/spot/orchestrate.py launch --fleet-size "{{FLEET_SIZE}}"
# Check evaluation progress (queue depth, fleet status, results count)
spot-status:
@uv run scripts/spot/orchestrate.py status
# Monitor workers via SSM (no SSH required)
spot-monitor:
@uv run scripts/spot/monitor.py
# Monitor workers with live updates
spot-monitor-watch:
@uv run scripts/spot/monitor.py --watch
# Tail logs from a specific worker
spot-logs INSTANCE:
@uv run scripts/spot/monitor.py --logs "{{INSTANCE}}" --follow
# Execute command on a worker via SSM
spot-exec INSTANCE CMD:
@uv run scripts/spot/monitor.py --exec "{{INSTANCE}}" "{{CMD}}"
# Aggregate and display results from S3
spot-results OUTPUT="reports/spot-eval-results.json":
@uv run scripts/spot/orchestrate.py results --output "{{OUTPUT}}"
# Download results from S3 and aggregate
spot-aggregate:
@uv run scripts/spot/aggregate.py --download
# Regenerate summary and open in browser
spot-summary:
@uv run scripts/spot/aggregate.py --open
# Show LLM-generated summary of results
spot-summarize:
@uv run scripts/spot/aggregate.py --llm
# Cancel fleet and clean up
spot-teardown:
@uv run scripts/spot/orchestrate.py teardown
# Cancel fleet and purge task queue
spot-teardown-full:
@uv run scripts/spot/orchestrate.py teardown --purge-queue
## Note: runctl integration removed.
##
## This repo previously supported runctl-managed spot evals. That path is intentionally
## deleted now; keep orchestration under `scripts/spot/orchestrate.py` (SQS/SSM-based).
# Quick spot eval (3 fast backends, 2 datasets, 1 seed) - good for testing.
# Runs the orchestrator's "full" pipeline with a single-instance fleet.
spot-eval-quick:
    @uv run scripts/spot/orchestrate.py full \
        --backends "pattern,heuristic,stacked" \
        --datasets "WikiGold,Wnut17" \
        --seeds "42" \
        --fleet-size 1
# Local evaluation (no AWS, runs on this machine).
# First builds a release anno-cli with "eval onnx" features (build output silenced),
# then hands off to the orchestrator's "local" mode. MAX caps examples per dataset.
eval-local BACKENDS="heuristic,stacked" DATASETS="WikiGold" MAX="50":
    @cargo build --release -p anno-cli --features "eval onnx" > /dev/null
    @uv run scripts/spot/orchestrate.py local \
        --backends "{{BACKENDS}}" \
        --datasets "{{DATASETS}}" \
        --max-examples "{{MAX}}"

# Local quick eval (zero-dep backends only, fast).
# Uses the orchestrator's "quick" profile with a hard cap of 30 examples.
eval-local-quick:
    @cargo build --release -p anno-cli --features "eval onnx" > /dev/null
    @uv run scripts/spot/orchestrate.py local \
        --profile quick \
        --datasets "WikiGold,CoNLL2003Sample" \
        --max-examples 30
# ML-focused spot eval (ONNX/Candle backends only).
# No --datasets flag: dataset selection is left to the orchestrator's defaults.
spot-eval-ml:
    @uv run scripts/spot/orchestrate.py full \
        --backends "gliner,nuner,w2ner,gliner2,bert_onnx,gliner_candle" \
        --fleet-size 4
# Sync local dataset/model cache to S3 (for spot instances)
spot-cache-upload:
    @./scripts/sync_datasets_s3.sh upload

# Download cached datasets/models from S3
spot-cache-download:
    @./scripts/sync_datasets_s3.sh download

# Show S3 cache status
spot-cache-status:
    @./scripts/sync_datasets_s3.sh status

# Upload current source code to S3 (required before launching spot instances).
# NOTE: `git archive HEAD` packages committed state only — uncommitted edits
# are NOT included in the uploaded tarball.
spot-upload-src:
    @git archive --format=tar.gz HEAD -o /tmp/anno-src.tar.gz
    @aws s3 cp /tmp/anno-src.tar.gz s3://arc-anno-data/src/anno-src.tar.gz
    @echo "Source uploaded to s3://arc-anno-data/src/anno-src.tar.gz"
# Run CI-style muxer-backed sampler locally.
# Uses ~/.anno_cache to match the CI actions/cache path for both muxer state and eval history.
# Override with ANNO_HISTORY_FILE, ANNO_EVAL_HISTORY, or ANNO_LINUCB_STATE_FILE for isolation.
# SEED seeds the sampler run; PERSPECTIVE selects the matrix perspective (default "ner").
# The backslash continuations keep the env prefix and the cargo invocation on ONE
# shell command line, so the variables apply only to this test run.
ci-matrix-local SEED="42" PERSPECTIVE="ner":
    ANNO_CACHE_DIR="${HOME}/.anno_cache" \
    ANNO_EVAL_HISTORY="${HOME}/.anno_cache/eval-results.jsonl" \
    ANNO_CI_SEED="{{SEED}}" \
    ANNO_MATRIX_PERSPECTIVE="{{PERSPECTIVE}}" \
    ANNO_SAMPLE_STRATEGY=worst-first \
    ANNO_MUXER_PROFILE=fast \
    cargo test -p anno-eval --lib --features "eval" matrix_muxer_ci::test_randomized_matrix_sample -- --nocapture
# Legacy: the spot "badness history" export is not wired into the muxer JSON format by default.
# Detect regressions in the quality matrix.
# Runs a quick eval then checks the full SQLite history for F1 drops.
# Env knobs (single shell command via line continuations, as in ci-matrix-local):
#   ANNO_ML_IN_MATRIX=1           include ML backends in the matrix
#   ANNO_CHECK_REGRESSIONS=1      enable the regression check itself
#   ANNO_MUXER_BACKENDS_PER_RUN=4 cap backends sampled per run
#   ANNO_MUXER_FIXED_DATASETS     pin the dataset set for comparability
check-regressions SEED="42":
    ANNO_CACHE_DIR="${HOME}/.anno_cache" \
    ANNO_EVAL_HISTORY="${HOME}/.anno_cache/eval-results.jsonl" \
    ANNO_SAMPLE_STRATEGY=estimate \
    ANNO_ML_IN_MATRIX=1 \
    ANNO_CI_SEED="{{SEED}}" \
    ANNO_MUXER_VERBOSE=1 \
    ANNO_CHECK_REGRESSIONS=1 \
    ANNO_MATRIX_PERSPECTIVE=ner \
    ANNO_MUXER_BACKENDS_PER_RUN=4 \
    ANNO_MUXER_FIXED_DATASETS=WikiGold,Wnut17,MasakhaNER \
    cargo test --release -p anno-eval --lib --features "eval onnx" \
    matrix_muxer_ci::test_randomized_matrix_sample -- --nocapture
# Placeholder: spot -> muxer history export is not implemented; prints guidance only.
spot-export-badness:
    @echo "Not implemented: spot -> muxer history export. Run ci-matrix-local to generate muxer_history.json from local runs."

# Compare spot results against a baseline (BASELINE is required)
spot-compare BASELINE:
    @uv run scripts/spot/orchestrate.py results --compare "{{BASELINE}}"
# =============================================================================
# Spot + trainctl Integration (Enhanced Features)
# =============================================================================
# All recipes in this section shell out to scripts/spot/trainctl-bridge.sh.

# Launch trainctl interactive dashboard for spot monitoring
spot-dash:
    @scripts/spot/trainctl-bridge.sh dashboard

# Sync datasets from S3 using trainctl (faster parallel downloads)
spot-sync-datasets:
    @scripts/spot/trainctl-bridge.sh sync-datasets

# Download spot results using trainctl; DIR is the local destination directory
spot-sync-results DIR="reports/spot-results":
    @scripts/spot/trainctl-bridge.sh sync-results "{{DIR}}"

# Upload source using trainctl (faster upload)
spot-upload-src-fast:
    @scripts/spot/trainctl-bridge.sh upload-src

# Show processes on a spot worker (empty INSTANCE is passed through to the bridge)
spot-ps INSTANCE="":
    @scripts/spot/trainctl-bridge.sh processes "{{INSTANCE}}"

# Interactive top for a spot worker (empty INSTANCE is passed through to the bridge)
spot-top INSTANCE="":
    @scripts/spot/trainctl-bridge.sh top "{{INSTANCE}}"

# Show fleet costs (trainctl)
spot-cost:
    @scripts/spot/trainctl-bridge.sh cost
# === trainctl Integration (Alternative) ===
# Launch workers via trainctl (better SSM, monitoring, dashboard).
# Requires: cd ../trainctl && cargo build --release
# WORKERS is the worker count; PROFILE is the eval profile name (default "full").
spot-trainctl WORKERS="1" PROFILE="full":
    @./scripts/spot/launch-trainctl.sh "{{WORKERS}}" "{{PROFILE}}"

# Quick trainctl launch (1 worker, quick profile)
spot-trainctl-quick:
    @./scripts/spot/launch-trainctl.sh 1 quick

# ML trainctl launch (4 workers, ml profile)
spot-trainctl-ml:
    @./scripts/spot/launch-trainctl.sh 4 ml
# Check if trainctl is available: PATH first, then a sibling checkout's release build.
# Prints the resolved binary and its version, or build instructions when missing.
#
# Fix: the probe previously used `&>/dev/null`, a bashism. just runs recipes with
# `sh -cu` by default, and POSIX sh parses `&>` as "background the command, then
# an empty redirection" — the `if` then tests the redirection's exit status (0),
# so the first branch was taken even when trainctl was absent (and the probe's
# output leaked). `>/dev/null 2>&1` is the portable form. `which` (non-POSIX) is
# also replaced with `command -v`.
spot-trainctl-check:
    @if command -v trainctl >/dev/null 2>&1; then \
        echo "trainctl: $(command -v trainctl)"; \
        trainctl --version; \
    elif [ -f ../trainctl/target/release/trainctl ]; then \
        echo "trainctl: ../trainctl/target/release/trainctl (local build)"; \
        ../trainctl/target/release/trainctl --version; \
    else \
        echo "trainctl not found. Build it:"; \
        echo "  cd ../trainctl && cargo build --release"; \
    fi