# End-to-end smoke workflow: workflow name, triggers, token permissions and
# workflow-wide environment.
name: E2E Smoke

# Runs on pushes and pull requests against main that touch the sources, the
# e2e tests or the Cargo manifests, and can also be started manually.
on:
  push:
    branches:
      - main
    paths:
      - 'src/**'
      - 'tests/e2e/**'
      - 'Cargo.toml'
      - 'Cargo.lock'
  pull_request:
    branches:
      - main
    paths:
      - 'src/**'
      - 'tests/e2e/**'
      - 'Cargo.toml'
      - 'Cargo.lock'
  workflow_dispatch:

# The workflow token gets read access only.
permissions: read-all

env:
  # Keep cargo's coloured output in the job logs.
  CARGO_TERM_COLOR: always
jobs:
  # Builds the release binary once and publishes it as the
  # `thoughtjack-binary` artifact.
  build-tj:
    name: Build ThoughtJack
    runs-on: ubuntu-latest
    steps:
      # All third-party actions in this job are pinned to full commit SHAs.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          submodules: true

      # NOTE(review): no `toolchain` input is passed, so the installed
      # toolchain is whatever this pinned revision defaults to - confirm.
      - uses: dtolnay/rust-toolchain@e814c742d4444ce2f3f6abddea7faf00161ed941

      - uses: Swatinem/rust-cache@23869a5bd66c73db3c0ac40331f3206eb23791dc

      - name: Build release binary
        run: cargo build --release

      # Short retention: the binary is only needed for the duration of a run.
      - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
        with:
          name: thoughtjack-binary
          path: target/release/thoughtjack
          retention-days: 1
# ---------------------------------------------------------------------------
# e2e: one job per (scenario, framework) matrix entry, run against the binary
# produced by build-tj.
# ---------------------------------------------------------------------------
  e2e:
    name: "${{ matrix.scenario }} × ${{ matrix.framework }}"
    runs-on: ubuntu-latest
    needs: [build-tj]
    strategy:
      # Let every matrix entry finish even when another entry fails.
      fail-fast: false
      matrix:
        include:
          # Framework-backed scenarios (self_test: false). These additionally
          # install Node, mock-llm and the matching reference agent.
          - scenario: mcp-tool-invocation
            framework: langgraph
            self_test: false
          - scenario: mcp-tool-invocation
            framework: crewai
            self_test: false
          - scenario: a2a-task-delegation
            framework: crewai
            self_test: false
          # Self-test scenarios (self_test: true). The orchestrator is run
          # with --self-test and no framework or mock-llm is set up.
          - scenario: mcp-client-basic
            framework: self-test
            self_test: true
          - scenario: a2a-client-basic
            framework: self-test
            self_test: true
          - scenario: mcp-resources
            framework: self-test
            self_test: true
          - scenario: mcp-prompts
            framework: self-test
            self_test: true
          - scenario: multi-phase-rug-pull
            framework: self-test
            self_test: true
          - scenario: extractor-cross-actor
            framework: self-test
            self_test: true
          - scenario: cel-indicators
            framework: self-test
            self_test: true
          - scenario: verdict-not-exploited
            framework: self-test
            self_test: true
          - scenario: verdict-all-correlation
            framework: self-test
            self_test: true
          - scenario: a2a-streaming
            framework: self-test
            self_test: true
          - scenario: mcp-instructions
            framework: self-test
            self_test: true
          - scenario: error-graceful-degradation
            framework: self-test
            self_test: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd

      # Fetch the binary uploaded by build-tj into the path the orchestrator
      # is pointed at below.
      - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093
        with:
          name: thoughtjack-binary
          path: target/release

      # The downloaded file has to be marked executable again before use.
      - name: Make binary executable
        run: chmod +x target/release/thoughtjack

      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405
        with:
          python-version: "3.12"

      - uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f
        if: ${{ !matrix.self_test }}
        with:
          node-version: "22"

      - name: Install mock-llm
        if: ${{ !matrix.self_test }}
        run: npm install -g @dwmkerr/mock-llm@0.1.28

      # Start mock-llm in the background and wait until it accepts TCP
      # connections. Polling replaces a fixed `sleep 2`, which could let the
      # scenario start before the server was listening and could not detect
      # a server that crashed on startup. The port must match the
      # --mock-llm-url passed to the orchestrator below.
      - name: Start mock-llm
        if: ${{ !matrix.self_test }}
        run: |
          mock-llm &
          mock_pid=$!
          # Exported so the "Stop mock-llm" step can terminate the process.
          echo "MOCK_LLM_PID=$mock_pid" >> "$GITHUB_ENV"
          for _ in $(seq 1 30); do
            if ! kill -0 "$mock_pid" 2>/dev/null; then
              echo "mock-llm exited before it started listening" >&2
              exit 1
            fi
            # Succeeds as soon as something accepts connections on the port.
            if (exec 3<>/dev/tcp/localhost/6556) 2>/dev/null; then
              exit 0
            fi
            sleep 1
          done
          echo "mock-llm did not open port 6556 within 30s" >&2
          exit 1

      - name: Install agent dependencies
        if: ${{ !matrix.self_test }}
        run: pip install -e tests/e2e/reference-agents/${{ matrix.framework }}/

      - name: Install orchestrator dependencies
        run: pip install "pyyaml==6.0.3"

      # Common arguments first, then either --self-test or the framework and
      # mock-llm endpoint, depending on the matrix entry.
      - name: Run e2e scenario
        run: |
          args=(
            --scenario "tests/e2e/fixtures/${{ matrix.scenario }}"
            --tj-binary ./target/release/thoughtjack
            --base-port 19000
            --timeout 30
            --output-dir tests/e2e/results
          )
          if [ "${{ matrix.self_test }}" = "true" ]; then
            args+=(--self-test)
          else
            args+=(
              --framework "${{ matrix.framework }}"
              --mock-llm-url http://localhost:6556
            )
          fi
          python tests/e2e/run_conformance.py "${args[@]}"

      # always(): keep the results directory even when the scenario failed.
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
        with:
          name: "e2e-${{ matrix.scenario }}-${{ matrix.framework }}"
          path: tests/e2e/results/
          retention-days: 7

      # Best-effort cleanup: errors from kill are deliberately ignored.
      - name: Stop mock-llm
        if: always() && !matrix.self_test
        run: kill "$MOCK_LLM_PID" 2>/dev/null || true
# ---------------------------------------------------------------------------
# e2e-summary: single gate job that reflects the outcome of the e2e matrix.
# ---------------------------------------------------------------------------
  e2e-summary:
    name: E2E Summary
    runs-on: ubuntu-latest
    needs: [e2e]
    # Run even when the e2e job failed, was cancelled or was skipped, so this
    # job always reports a definite status.
    if: always()
    steps:
      - name: Check self-test results
        env:
          # Aggregate result of the e2e job: success, failure, cancelled or
          # skipped.
          E2E_RESULT: ${{ needs.e2e.result }}
        run: |
          # Anything other than "success" fails the summary. Checking only
          # for "failure" would report green when e2e was skipped (for
          # example because the build job failed) or cancelled.
          #
          # NOTE(review): matrix entries that set continue-on-error would
          # not count as failures here, but no continue-on-error is set on
          # the e2e job in this workflow - confirm whether framework tests
          # are meant to be non-blocking.
          echo "e2e job result: $E2E_RESULT"
          if [ "$E2E_RESULT" != "success" ]; then
            echo "FAIL: e2e finished with result '$E2E_RESULT' (expected 'success')"
            exit 1
          fi
          echo "All e2e jobs passed (or were allowed to fail via continue-on-error)"