trueno_rag/eval/mod.rs
//! Evaluation framework for RAG retrieval quality (PMAT-015)
//!
//! World-class RAG evaluation using LLM-as-judge on actual chunk content
//! and synthetic ground truth generated from the corpus itself.
//!
//! # Architecture
//!
//! Split pipeline — trueno-rag handles data, Claude Code handles LLM work:
//! - `eval sample` — Sample chunks from index (no API needed)
//! - `eval retrieve` — Run queries against index (no API needed)
//! - `eval metrics` — Compute IR metrics from judgments (no API needed)
//! - Claude Code `/eval-generate` skill — Generate questions from sampled chunks
//! - Claude Code `/eval-judge` skill — Judge relevance of retrieved chunks
//!
//! Optional direct API mode (requires `ANTHROPIC_API_KEY`):
//! - `eval generate` — Sample + generate questions via Claude API
//! - `eval judge` — Judge + compute metrics via Claude API

// Submodules — one per pipeline stage described above.
pub mod client; // Anthropic API client (direct API mode; see `AnthropicClient`)
pub mod domain; // domain classification for chunks/queries — NOTE(review): exact semantics live in `classify_domain`; confirm there
pub mod generate; // synthetic ground-truth question generation
pub mod judge; // LLM-as-judge relevance judgments
pub mod metrics; // IR metrics computed from judgments
pub mod types; // shared data types exchanged between stages

// Flat re-exports so callers can use `eval::AnthropicClient` etc.
// without naming the submodule.
pub use client::AnthropicClient;
pub use domain::classify_domain;
pub use generate::GroundTruthGenerator;
pub use judge::RelevanceJudge;
pub use metrics::compute_metrics_from_judgments;
pub use types::{
    EvalConfig, GroundTruthEntry, JudgeCache, JudgeCacheEntry, JudgeVerdict, JudgmentEntry,
    RetrievalResultEntry,
};