use std::path::PathBuf;
use std::process::Command;
use assert_cmd::cargo::CommandCargoExt;
use serde::Deserialize;
/// Difficulty tier of an evaluation query.
///
/// Going by the descriptions in `EVAL_QUERIES`, the tiers run from direct
/// keyword matches (`Easy`) through conceptual/synonym queries (`Medium`) to
/// partial phrases and specific details (`Hard`).
#[derive(PartialEq, Eq, Clone, Copy)]
enum Difficulty {
    /// Query shares keywords with the target document.
    Easy,
    /// Query is conceptual or uses synonyms.
    Medium,
    /// Query recalls a partial phrase, indirect reference or specific detail.
    Hard,
}
use Difficulty::{Easy, Hard, Medium};
/// One evaluation case: a search query and the document it should surface.
struct EvalQuery {
    /// Query text passed to the binary under test.
    query: &'static str,
    /// Lowercase slug that must appear in a hit's `file` value for the hit to
    /// count as correct (see `matches_expected`).
    expected: &'static str,
    /// Tier used to group results in the summary.
    difficulty: Difficulty,
    /// Short human-readable note printed next to the per-query result.
    description: &'static str,
}
static EVAL_QUERIES: &[EvalQuery] = &[
EvalQuery {
query: "API versioning",
expected: "api-design",
difficulty: Easy,
description: "Direct keyword match",
},
EvalQuery {
query: "Series A fundraising",
expected: "fundraising",
difficulty: Easy,
description: "Direct keyword match",
},
EvalQuery {
query: "CAP theorem",
expected: "distributed-systems",
difficulty: Easy,
description: "Direct keyword match",
},
EvalQuery {
query: "overfitting machine learning",
expected: "machine-learning",
difficulty: Easy,
description: "Direct keyword match",
},
EvalQuery {
query: "remote work VPN",
expected: "remote-work",
difficulty: Easy,
description: "Direct keyword match",
},
EvalQuery {
query: "Project Phoenix retrospective",
expected: "product-launch",
difficulty: Easy,
description: "Direct keyword match",
},
EvalQuery {
query: "how to structure REST endpoints",
expected: "api-design",
difficulty: Medium,
description: "Conceptual - no exact match",
},
EvalQuery {
query: "raising money for startup",
expected: "fundraising",
difficulty: Medium,
description: "Conceptual - synonyms",
},
EvalQuery {
query: "consistency vs availability tradeoffs",
expected: "distributed-systems",
difficulty: Medium,
description: "Conceptual understanding",
},
EvalQuery {
query: "how to prevent models from memorizing data",
expected: "machine-learning",
difficulty: Medium,
description: "Conceptual - overfitting",
},
EvalQuery {
query: "working from home guidelines",
expected: "remote-work",
difficulty: Medium,
description: "Synonym match",
},
EvalQuery {
query: "what went wrong with the launch",
expected: "product-launch",
difficulty: Medium,
description: "Conceptual query",
},
EvalQuery {
query: "nouns not verbs",
expected: "api-design",
difficulty: Hard,
description: "Partial phrase recall",
},
EvalQuery {
query: "Sequoia investor pitch",
expected: "fundraising",
difficulty: Hard,
description: "Indirect reference",
},
EvalQuery {
query: "Raft algorithm leader election",
expected: "distributed-systems",
difficulty: Hard,
description: "Specific detail in long doc",
},
EvalQuery {
query: "F1 score precision recall",
expected: "machine-learning",
difficulty: Hard,
description: "Technical detail",
},
EvalQuery {
query: "quarterly team gathering travel",
expected: "remote-work",
difficulty: Hard,
description: "Specific policy detail",
},
EvalQuery {
query: "beta program 47 bugs",
expected: "product-launch",
difficulty: Hard,
description: "Specific number recall",
},
];
fn difficulty_label(d: Difficulty) -> &'static str {
match d {
Easy => "easy",
Medium => "medium",
Hard => "hard",
}
}
/// Reports whether a hit's `file` value refers to the `expected` document.
///
/// The check is a case-insensitive substring test: both sides are lowercased
/// before comparing, so the result does not depend on the casing of either
/// the file name or the expected slug.
///
/// An empty `expected` matches every `file`, as any string contains the empty
/// string.
fn matches_expected(file: &str, expected: &str) -> bool {
    file.to_lowercase().contains(&expected.to_lowercase())
}
/// One element of the JSON array that `Harness::search` parses from the
/// binary's `--json` output.
///
/// Only `file` is declared, so any other keys in the JSON objects are skipped
/// by serde's default handling of unknown fields.
#[derive(Deserialize)]
struct EvalHit {
/// Identifier of the matched document; `matches_expected` looks for the
/// expected slug inside it. Presumably a file path — confirm against the
/// binary's JSON schema.
file: String,
}
/// Captured result of one invocation of the binary under test.
struct Out {
    /// Standard output, decoded lossily as UTF-8.
    stdout: String,
    /// Standard error, decoded lossily as UTF-8.
    stderr: String,
    /// Process exit code; `Harness::run` stores `-1` when the process ended
    /// without reporting one.
    code: i32,
}
/// Sandbox locations for one evaluation run.
///
/// Each path is handed to the binary by `Harness::run`, either as the working
/// directory or through an environment variable.
struct Harness {
    /// Working directory for every invocation (also exported as `PWD`).
    root: PathBuf,
    /// Exported as `RQMD_INDEX_PATH`.
    db: PathBuf,
    /// Exported as `RQMD_CONFIG_DIR`.
    cfg: PathBuf,
}
impl Harness {
fn run(&self, args: &[&str]) -> Out {
let mut full: Vec<&str> = vec!["--index", "index"];
full.extend_from_slice(args);
let mut cmd = Command::cargo_bin("rqmd").expect("rqmd binary is built by cargo test");
cmd.current_dir(&self.root)
.env_remove("CI")
.env_remove("XDG_CONFIG_HOME")
.env("NO_COLOR", "1")
.env("PWD", &self.root)
.env("RQMD_INDEX_PATH", &self.db)
.env("RQMD_CONFIG_DIR", &self.cfg)
.args(&full);
let out = cmd.output().expect("spawn rqmd");
Out {
stdout: String::from_utf8_lossy(&out.stdout).into_owned(),
stderr: String::from_utf8_lossy(&out.stderr).into_owned(),
code: out.status.code().unwrap_or(-1),
}
}
fn search(&self, mode: &str, query: &str) -> Vec<String> {
let out = self.run(&[mode, query, "--json", "-n", "5"]);
if out.code != 0 {
return Vec::new();
}
let hits: Vec<EvalHit> = serde_json::from_str(&out.stdout).unwrap_or_default();
hits.into_iter().map(|h| h.file).collect()
}
fn evaluate(&self, mode: &str) {
eprintln!("\n=== Evaluating {} mode ===\n", mode.to_uppercase());
let mut buckets = [Bucket::default(); 3]; for q in EVAL_QUERIES {
let hits = self.search(mode, q.query);
let first_hit = hits
.iter()
.position(|f| matches_expected(f, q.expected))
.map(|i| i + 1);
let b = &mut buckets[bucket_index(q.difficulty)];
b.total += 1;
if first_hit == Some(1) {
b.hit1 += 1;
}
if matches!(first_hit, Some(n) if (1..=3).contains(&n)) {
b.hit3 += 1;
}
if matches!(first_hit, Some(n) if (1..=5).contains(&n)) {
b.hit5 += 1;
}
let status = match first_hit {
Some(1) => "✓".to_string(),
Some(n) => format!("@{n}"),
None => "✗".to_string(),
};
eprintln!(
"[{:<6}] {:<3} \"{}\" → {}",
difficulty_label(q.difficulty),
status,
q.query,
q.description
);
}
eprintln!("\n--- Summary ---");
for (i, label) in ["easy", "medium", "hard"].iter().enumerate() {
let b = buckets[i];
eprintln!(
"{:<8}: Hit@1={}% Hit@3={}% Hit@5={}% (n={})",
label,
pct(b.hit1, b.total),
pct(b.hit3, b.total),
pct(b.hit5, b.total),
b.total
);
}
let total = EVAL_QUERIES.len();
let total_hit1: usize = buckets.iter().map(|b| b.hit1).sum();
let total_hit3: usize = buckets.iter().map(|b| b.hit3).sum();
eprintln!(
"\nOverall: Hit@1={}% Hit@3={}%",
pct(total_hit1, total),
pct(total_hit3, total)
);
}
}
/// Running tallies for one difficulty tier; all counters start at zero.
#[derive(Clone, Copy, Default)]
struct Bucket {
    /// Queries scored in this tier.
    total: usize,
    /// Queries whose expected document was ranked first.
    hit1: usize,
    /// Queries whose expected document was ranked in the top three.
    hit3: usize,
    /// Queries whose expected document was ranked in the top five.
    hit5: usize,
}
fn bucket_index(d: Difficulty) -> usize {
match d {
Easy => 0,
Medium => 1,
Hard => 2,
}
}
/// Returns `hit / total` as a percentage rounded to the nearest integer.
///
/// A `total` of zero yields `0` instead of dividing by zero.
fn pct(hit: usize, total: usize) -> i64 {
    match total {
        0 => 0,
        _ => {
            let ratio = hit as f64 / total as f64;
            (ratio * 100.0).round() as i64
        }
    }
}
/// Reports whether the model-dependent tests should be skipped.
///
/// True whenever `RQMD_SKIP_LLM_TESTS` is present in the environment. The
/// value is not inspected, so an empty or non-Unicode value also counts.
fn skip_llm() -> bool {
    // `var_os` tests presence only; `var(..).is_ok()` would read a non-Unicode
    // value as "unset".
    std::env::var_os("RQMD_SKIP_LLM_TESTS").is_some()
}
/// End-to-end retrieval evaluation: indexes the fixture documents in a
/// temporary sandbox, embeds them, then prints hit-rate reports for the
/// `search` and `query` modes.
///
/// Returns early when `skip_llm()` is true. Set-up failures (no fixtures, a
/// failing `collection add` or `embed`) fail the test; retrieval quality is
/// only reported, never asserted.
#[test]
fn eval_harness() {
    if skip_llm() {
        eprintln!("RQMD_SKIP_LLM_TESTS set — skipping eval-harness suite (needs models)");
        return;
    }

    // Everything lives under one temp dir, removed when `tmp` is dropped.
    let tmp = tempfile::tempdir().expect("mkdtemp");
    let h = Harness {
        root: tmp.path().to_path_buf(),
        db: tmp.path().join("index.sqlite"),
        cfg: tmp.path().join("config"),
    };
    let docs_dir = h.root.join("eval-docs");
    std::fs::create_dir_all(&h.cfg).unwrap();
    std::fs::create_dir_all(&docs_dir).unwrap();
    std::fs::write(h.cfg.join("index.yml"), "collections: {}\n").unwrap();

    // Copy the Markdown fixtures into the sandbox, counting them for the banner.
    let src = concat!(env!("CARGO_MANIFEST_DIR"), "/../rqmd-core/tests/eval-docs");
    let mut doc_count = 0usize;
    for entry in std::fs::read_dir(src).expect("read eval-docs source dir") {
        let path = entry.unwrap().path();
        if path.extension().and_then(|s| s.to_str()) == Some("md") {
            let name = path.file_name().unwrap();
            std::fs::copy(&path, docs_dir.join(name)).unwrap();
            doc_count += 1;
        }
    }
    // An empty corpus would make every query a miss; fail before the costly embed.
    assert!(doc_count > 0, "no .md fixtures found in {src}");

    let docs_str = docs_dir
        .to_str()
        .expect("temporary directory path is valid UTF-8");
    let add = h.run(&["collection", "add", docs_str, "--name", "eval-docs"]);
    assert_eq!(
        add.code, 0,
        "collection add failed (exit {})\n--- stdout ---\n{}\n--- stderr ---\n{}",
        add.code, add.stdout, add.stderr
    );
    let emb = h.run(&["embed"]);
    assert_eq!(
        emb.code, 0,
        "embed failed (exit {})\n--- stdout ---\n{}\n--- stderr ---\n{}",
        emb.code, emb.stdout, emb.stderr
    );

    eprintln!("rqmd Evaluation Harness");
    eprintln!("{}", "=".repeat(50));
    eprintln!(
        "Testing {} queries across {} documents",
        EVAL_QUERIES.len(),
        doc_count
    );
    h.evaluate("search");
    h.evaluate("query");
}