use std::collections::{BTreeMap, BTreeSet};
use std::error::Error;
use std::fmt;
use std::fmt::Write as _;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::Instant;
use serde_json::Value;
use crate::{
emit_gbnf, evaluate, infer_cfg, Grammar, GrammarExpr, GrammarFormat, GrammarOracle,
GrammarRule, InferenceOptions, MetricScores, PositiveOnlyOracle, SampleConfig, ScoringMode,
};
/// Manifest path that `run_competitor_suite` loads by default.
pub const DEFAULT_CORPUS_MANIFEST: &str = "benches/corpus-manifest.json";
/// Corpora root directory that `run_competitor_suite` uses by default.
pub const DEFAULT_CORPORA_ROOT: &str = "benches/corpora";
/// TreeVada average F1 printed on the "Published bars" line of the rendered report.
pub const PUBLISHED_TREEVADA_AVG_F1: f64 = 0.32;
/// NatGI average F1 printed on the "Published bars" line of the rendered report.
pub const PUBLISHED_NATGI_AVG_F1: f64 = 0.57;
/// F1 bar stored as `required_f1` on every competitor run; defined as the NatGI figure.
pub const D5_REQUIRED_AVG_F1: f64 = PUBLISHED_NATGI_AVG_F1;
/// In-memory form of the corpus manifest JSON document.
///
/// Built by `parse_manifest` from the root object's `schema` and `corpus` fields.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CompetitorManifest {
/// Value of the root `schema` field (read as a `u64`; not otherwise checked in this file).
pub schema: u64,
/// One entry per element of the root `corpus` array, in manifest order.
pub entries: Vec<CorpusManifestEntry>,
}
/// A single element of the manifest's `corpus` array.
///
/// Every field is required by `parse_manifest`; a missing or mistyped field is a manifest error.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CorpusManifestEntry {
/// First path component of the vendored subject directory and of the entry id.
pub tool: String,
/// Second path component of the vendored subject directory and of the entry id.
pub subject: String,
/// Manifest `source` string (presumably the upstream origin; not interpreted here).
pub source: String,
/// Manifest `commit` string (presumably the upstream revision; not interpreted here).
pub commit: String,
/// Manifest `license` string (not interpreted here).
pub license: String,
/// Expected number of files under the subject directory; checked by `validate_manifest`.
pub files: usize,
/// Expected total size in bytes of those files; checked by `validate_manifest`.
pub bytes: u64,
/// Whether the suite runs this entry; entries with `false` are reported as skipped.
pub included: bool,
/// Why the entry is excluded; must be non-blank when `included` is `false`.
pub exclude_reason: String,
/// Paths relative to the subject directory; each may name a file or a directory of examples.
pub example_paths: Vec<String>,
/// Golden oracle mode; `run_manifest_entry` accepts only `"exact_examples"`.
pub golden: String,
}
impl CorpusManifestEntry {
    /// Returns the entry identifier, formatted as `tool/subject`.
    #[must_use]
    pub fn id(&self) -> String {
        let Self { tool, subject, .. } = self;
        format!("{tool}/{subject}")
    }
}
/// A manifest entry that is not included in the run, together with its stated reason.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SkippedCorpus {
/// Entry identifier in `tool/subject` form.
pub id: String,
/// The entry's `exclude_reason`, copied verbatim.
pub reason: String,
}
/// Outcome of inferring and scoring a grammar for one included corpus.
#[derive(Clone, Debug, PartialEq)]
pub struct CompetitorRun {
/// Entry identifier in `tool/subject` form.
pub id: String,
/// Manifest `tool` of the entry.
pub tool: String,
/// Manifest `subject` of the entry.
pub subject: String,
/// Number of example files loaded for the corpus.
pub examples: usize,
/// Scores returned by `evaluate` for the inferred grammar.
pub scores: MetricScores,
/// Sum of the sample counts drawn from the inferred and the golden grammar.
pub samples_drawn: usize,
/// Seed copied from the `SampleConfig` used for the run.
pub seed: u64,
/// Scoring mode recorded for the run (`run_manifest_entry` always stores `GoldenGrammar`).
pub scoring_mode: ScoringMode,
/// Milliseconds elapsed from just before inference until the run record is built.
pub wall_clock_ms: u128,
/// Rule count taken from the inference report.
pub inferred_rules: usize,
/// F1 bar this run is compared against.
pub required_f1: f64,
/// Whether GBNF emission succeeded and produced non-blank text.
pub gbnf_emitted: bool,
}
/// One row of the "secondary metric" table in the rendered report.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SecondaryMetricRow {
/// Metric label shown in the first column.
pub metric: &'static str,
/// Already-formatted value shown in the second column.
pub value: String,
}
/// A problem recorded in the suite report without aborting the run.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BenchmarkFailure {
/// Identifier of the affected corpus, or `None` for a suite-wide failure.
pub corpus: Option<String>,
/// Human-readable description of the failure.
pub message: String,
}
impl BenchmarkFailure {
    /// Builds a failure record for `corpus` (or a suite-wide one when `None`),
    /// converting `message` into an owned `String`.
    fn new(corpus: Option<String>, message: impl Into<String>) -> Self {
        let message = message.into();
        Self { corpus, message }
    }
}
/// Aggregate result of a competitor suite run.
#[derive(Clone, Debug, PartialEq)]
pub struct CompetitorSuiteReport {
/// Seed taken from the `SampleConfig` (left at 0 by `validate_manifest` until the suite sets it).
pub seed: u64,
/// One record per included corpus whose run completed.
pub runs: Vec<CompetitorRun>,
/// Excluded manifest entries that supplied a reason.
pub skipped: Vec<SkippedCorpus>,
/// Rows of the secondary metric table.
pub secondary: Vec<SecondaryMetricRow>,
/// Validation problems, failed runs and runs below the required F1 bar.
pub failures: Vec<BenchmarkFailure>,
}
/// Errors that abort the suite before a report can be produced.
#[derive(Debug)]
pub enum BenchmarkError {
/// A filesystem operation on `path` failed.
Io {
path: PathBuf,
source: std::io::Error,
},
/// The file at `path` could not be parsed as JSON.
Json {
path: PathBuf,
source: serde_json::Error,
},
/// The manifest JSON is structurally invalid; the string describes the problem.
Manifest(String),
}
impl fmt::Display for BenchmarkError {
    /// Renders I/O and JSON errors as `<path>: <cause>` and manifest errors as their bare message.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Manifest(message) => f.write_str(message),
            Self::Io { path, source } => write!(f, "{}: {}", path.display(), source),
            Self::Json { path, source } => write!(f, "{}: {}", path.display(), source),
        }
    }
}
impl Error for BenchmarkError {
    /// Exposes the wrapped I/O or JSON error as the cause; manifest errors have none.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        let cause: &(dyn Error + 'static) = match self {
            Self::Manifest(_) => return None,
            Self::Io { source, .. } => source,
            Self::Json { source, .. } => source,
        };
        Some(cause)
    }
}
/// Runs the competitor suite against the default manifest and corpora locations.
///
/// # Errors
///
/// Returns whatever [`run_competitor_suite_from_paths`] returns for
/// [`DEFAULT_CORPUS_MANIFEST`] and [`DEFAULT_CORPORA_ROOT`].
pub fn run_competitor_suite(
    config: &SampleConfig,
) -> Result<CompetitorSuiteReport, BenchmarkError> {
    let manifest_path = Path::new(DEFAULT_CORPUS_MANIFEST);
    let corpora_root = Path::new(DEFAULT_CORPORA_ROOT);
    run_competitor_suite_from_paths(manifest_path, corpora_root, config)
}
/// Loads the manifest, validates it against `corpora_root`, and runs every included entry.
///
/// Per-corpus problems (failed runs, F1 below the required bar, validation mismatches) are
/// collected in the report's `failures` rather than aborting the suite. A failure with no
/// corpus id is added when no run completed.
///
/// # Errors
///
/// Returns a [`BenchmarkError`] when the manifest cannot be read or parsed, or when
/// validation hits a filesystem error.
pub fn run_competitor_suite_from_paths(
    manifest_path: &Path,
    corpora_root: &Path,
    config: &SampleConfig,
) -> Result<CompetitorSuiteReport, BenchmarkError> {
    let manifest = load_manifest(manifest_path)?;
    let mut report = validate_manifest(&manifest, corpora_root)?;
    report.seed = config.seed;
    for entry in manifest.entries.iter().filter(|entry| entry.included) {
        match run_manifest_entry(entry, corpora_root, config) {
            Ok(run) => {
                // Every ordered comparison with NaN is false, so a plain `<` would let a
                // NaN F1 slip past the gate. Treat NaN as failing so the bar fails closed.
                let below_bar = run.scores.f1.is_nan() || run.scores.f1 < run.required_f1;
                if below_bar {
                    report.failures.push(BenchmarkFailure::new(
                        Some(run.id.clone()),
                        format!(
                            "F1 {:.3} is below required NatGI bar {:.3}",
                            run.scores.f1, run.required_f1
                        ),
                    ));
                }
                report.runs.push(run);
            }
            Err(message) => {
                report
                    .failures
                    .push(BenchmarkFailure::new(Some(entry.id()), message));
            }
        }
    }
    if report.runs.is_empty() {
        report.failures.push(BenchmarkFailure::new(
            None,
            "manifest includes no always-on competitor corpus subjects",
        ));
    }
    // Secondary rows are derived from the completed runs only.
    report.secondary = secondary_rows(&report.runs);
    Ok(report)
}
/// Reads the file at `path` and parses it into a [`CompetitorManifest`].
///
/// # Errors
///
/// * [`BenchmarkError::Io`] when the file cannot be read.
/// * [`BenchmarkError::Json`] when the contents are not valid JSON.
/// * [`BenchmarkError::Manifest`] when the JSON does not have the expected structure.
pub fn load_manifest(path: &Path) -> Result<CompetitorManifest, BenchmarkError> {
    let raw = match fs::read_to_string(path) {
        Ok(raw) => raw,
        Err(source) => {
            return Err(BenchmarkError::Io {
                path: path.to_path_buf(),
                source,
            })
        }
    };
    let document: Value = match serde_json::from_str(&raw) {
        Ok(document) => document,
        Err(source) => {
            return Err(BenchmarkError::Json {
                path: path.to_path_buf(),
                source,
            })
        }
    };
    parse_manifest(&document)
}
#[must_use]
pub fn render_competitor_report(report: &CompetitorSuiteReport) -> String {
let mut output = String::new();
let _ = writeln!(
output,
"D1/D5 competitor benchmark suite (seed {})",
report.seed
);
let _ = writeln!(
output,
"Published bars: TreeVada avg F1 ~= {PUBLISHED_TREEVADA_AVG_F1:.2}; NatGI avg F1 ~= {PUBLISHED_NATGI_AVG_F1:.2}"
);
if !report.skipped.is_empty() {
let _ = writeln!(output);
for skipped in &report.skipped {
let _ = writeln!(output, "SKIPPED {}: {}", skipped.id, skipped.reason);
}
}
let _ = writeln!(output);
let _ = writeln!(
output,
"| corpus | examples | precision | recall | F1 | required F1 | wall-clock ms | rules | samples |"
);
let _ = writeln!(output, "|---|---:|---:|---:|---:|---:|---:|---:|---:|");
for run in &report.runs {
let _ = writeln!(
output,
"| {} | {} | {:.3} | {:.3} | {:.3} | {:.3} | {} | {} | {} |",
run.id,
run.examples,
run.scores.precision,
run.scores.recall,
run.scores.f1,
run.required_f1,
run.wall_clock_ms,
run.inferred_rules,
run.samples_drawn
);
}
let _ = writeln!(output);
let _ = writeln!(output, "| secondary metric | value |");
let _ = writeln!(output, "|---|---|");
for row in &report.secondary {
let _ = writeln!(output, "| {} | {} |", row.metric, row.value);
}
if !report.failures.is_empty() {
let _ = writeln!(output);
let _ = writeln!(output, "Failures:");
for failure in &report.failures {
match &failure.corpus {
Some(corpus) => {
let _ = writeln!(output, "- {corpus}: {}", failure.message);
}
None => {
let _ = writeln!(output, "- {}", failure.message);
}
}
}
}
output
}
fn parse_manifest(value: &Value) -> Result<CompetitorManifest, BenchmarkError> {
let object = value
.as_object()
.ok_or_else(|| BenchmarkError::Manifest("manifest root must be a JSON object".into()))?;
let schema = required_u64(object, "schema")?;
let corpus = object
.get("corpus")
.and_then(Value::as_array)
.ok_or_else(|| BenchmarkError::Manifest("manifest corpus must be an array".into()))?;
let mut entries = Vec::with_capacity(corpus.len());
for (index, entry) in corpus.iter().enumerate() {
let object = entry.as_object().ok_or_else(|| {
BenchmarkError::Manifest(format!("manifest corpus[{index}] must be an object"))
})?;
entries.push(CorpusManifestEntry {
tool: required_string(object, "tool")?,
subject: required_string(object, "subject")?,
source: required_string(object, "source")?,
commit: required_string(object, "commit")?,
license: required_string(object, "license")?,
files: usize::try_from(required_u64(object, "files")?).map_err(|_| {
BenchmarkError::Manifest(format!("manifest corpus[{index}].files overflows usize"))
})?,
bytes: required_u64(object, "bytes")?,
included: required_bool(object, "included")?,
exclude_reason: required_string(object, "exclude_reason")?,
example_paths: required_string_array(object, "example_paths")?,
golden: required_string(object, "golden")?,
});
}
Ok(CompetitorManifest { schema, entries })
}
/// Returns an owned copy of the string stored under `key`.
///
/// Fails with [`BenchmarkError::Manifest`] when the key is absent or not a JSON string.
fn required_string(
    object: &serde_json::Map<String, Value>,
    key: &str,
) -> Result<String, BenchmarkError> {
    match object.get(key) {
        Some(Value::String(text)) => Ok(text.clone()),
        _ => Err(BenchmarkError::Manifest(format!(
            "manifest field `{key}` must be a string"
        ))),
    }
}
/// Returns the value stored under `key` as a `u64`.
///
/// Fails with [`BenchmarkError::Manifest`] when the key is absent or `Value::as_u64`
/// does not yield a value for it.
fn required_u64(object: &serde_json::Map<String, Value>, key: &str) -> Result<u64, BenchmarkError> {
    match object.get(key).and_then(Value::as_u64) {
        Some(number) => Ok(number),
        None => Err(BenchmarkError::Manifest(format!(
            "manifest field `{key}` must be a u64"
        ))),
    }
}
/// Returns the boolean stored under `key`.
///
/// Fails with [`BenchmarkError::Manifest`] when the key is absent or not a JSON boolean.
fn required_bool(
    object: &serde_json::Map<String, Value>,
    key: &str,
) -> Result<bool, BenchmarkError> {
    match object.get(key) {
        Some(Value::Bool(flag)) => Ok(*flag),
        _ => Err(BenchmarkError::Manifest(format!(
            "manifest field `{key}` must be a bool"
        ))),
    }
}
/// Returns the array of strings stored under `key`, in order.
///
/// Fails with [`BenchmarkError::Manifest`] when the key is absent, is not an array,
/// or contains an element that is not a string.
fn required_string_array(
    object: &serde_json::Map<String, Value>,
    key: &str,
) -> Result<Vec<String>, BenchmarkError> {
    let Some(items) = object.get(key).and_then(Value::as_array) else {
        return Err(BenchmarkError::Manifest(format!(
            "manifest field `{key}` must be an array"
        )));
    };
    let mut strings = Vec::with_capacity(items.len());
    for item in items {
        match item.as_str() {
            Some(text) => strings.push(text.to_owned()),
            None => {
                return Err(BenchmarkError::Manifest(format!(
                    "manifest field `{key}` must contain strings"
                )))
            }
        }
    }
    Ok(strings)
}
/// Cross-checks the manifest against the vendored corpora and seeds a report with the findings.
///
/// For every entry this records, in order: duplicate ids, excluded entries (as skipped, or
/// as a failure when the reason is blank), a missing subject directory, file-count and
/// byte-count mismatches, and included entries without example paths. Afterwards every
/// vendored `tool/subject` directory that the manifest does not mention is recorded as a
/// failure. The returned report has `seed` 0 and no runs or secondary rows.
///
/// # Errors
///
/// Returns [`BenchmarkError::Io`] when walking a subject directory or the corpora root fails.
fn validate_manifest(
    manifest: &CompetitorManifest,
    corpora_root: &Path,
) -> Result<CompetitorSuiteReport, BenchmarkError> {
    let mut failures = Vec::new();
    let mut skipped = Vec::new();
    let mut seen = BTreeSet::<String>::new();

    for entry in &manifest.entries {
        let id = entry.id();
        let flag = |message: String| BenchmarkFailure::new(Some(id.clone()), message);

        if !seen.insert(id.clone()) {
            failures.push(flag("duplicate manifest entry".to_owned()));
        }

        if !entry.included {
            let reason = entry.exclude_reason.as_str();
            if reason.trim().is_empty() {
                failures.push(flag(
                    "excluded corpus must provide a non-empty exclude_reason".to_owned(),
                ));
            } else {
                skipped.push(SkippedCorpus {
                    id: id.clone(),
                    reason: reason.to_owned(),
                });
            }
        }

        // Excluded entries are still expected to be vendored, so the directory checks
        // apply to every entry.
        let subject_dir = corpora_root.join(&entry.tool).join(&entry.subject);
        if !subject_dir.is_dir() {
            failures.push(flag(format!(
                "vendored subject directory {} is missing",
                subject_dir.display()
            )));
            continue;
        }

        let (files, bytes) = match count_subject_files(&subject_dir) {
            Ok(totals) => totals,
            Err(source) => {
                return Err(BenchmarkError::Io {
                    path: subject_dir,
                    source,
                })
            }
        };
        if files != entry.files {
            failures.push(flag(format!(
                "manifest files={} but vendored files={files}",
                entry.files
            )));
        }
        if bytes != entry.bytes {
            failures.push(flag(format!(
                "manifest bytes={} but vendored bytes={bytes}",
                entry.bytes
            )));
        }
        if entry.included && entry.example_paths.is_empty() {
            failures.push(flag(
                "included corpus must list at least one example path".to_owned(),
            ));
        }
    }

    let unlisted = vendored_subjects(corpora_root)?
        .into_iter()
        .filter(|subject| !seen.contains(subject));
    failures.extend(unlisted.map(|subject| {
        BenchmarkFailure::new(Some(subject), "vendored subject is missing from manifest")
    }));

    Ok(CompetitorSuiteReport {
        seed: 0,
        runs: Vec::new(),
        skipped,
        secondary: Vec::new(),
        failures,
    })
}
/// Infers a grammar from one included corpus and scores it against an exact-examples golden grammar.
///
/// Only the `"exact_examples"` golden mode is supported. The wall-clock timer starts just
/// before inference and is read when the run record is built, so it also covers
/// evaluation, sample accounting and GBNF emission.
///
/// # Errors
///
/// Returns a message when the golden mode is unsupported, the examples cannot be loaded
/// or are empty, or evaluation or sample accounting fails.
fn run_manifest_entry(
    entry: &CorpusManifestEntry,
    corpora_root: &Path,
    config: &SampleConfig,
) -> Result<CompetitorRun, String> {
    let golden_mode = entry.golden.as_str();
    if golden_mode != "exact_examples" {
        return Err(format!(
            "unsupported included golden oracle mode `{golden_mode}`"
        ));
    }

    let subject_dir = corpora_root.join(&entry.tool).join(&entry.subject);
    let examples = match load_examples(&subject_dir, &entry.example_paths) {
        Ok(examples) => examples,
        Err(error) => return Err(format!("failed to load examples: {error}")),
    };
    if examples.is_empty() {
        return Err(String::from("included corpus loaded no positive examples"));
    }

    let start = Instant::now();
    let inferred = infer_cfg(&examples, &PositiveOnlyOracle, InferenceOptions::default());
    let golden = exact_examples_grammar(&examples);
    let oracle = GrammarOracle::new(&golden);
    let positive_refs: Vec<&str> = examples.iter().map(String::as_str).collect();

    let scores = match evaluate(
        &inferred.grammar,
        &oracle,
        Some(&golden),
        &positive_refs,
        config,
    ) {
        Ok(scores) => scores,
        Err(error) => return Err(format!("D1 evaluation failed: {error}")),
    };
    let samples_drawn = match sample_count(&inferred.grammar, &golden, config) {
        Ok(count) => count,
        Err(error) => return Err(format!("D1 sample accounting failed: {error}")),
    };
    // Emission counts as successful only when it yields non-blank text.
    let gbnf_emitted = match emit_gbnf(&inferred.grammar) {
        Ok((text, _)) => !text.trim().is_empty(),
        Err(_) => false,
    };

    Ok(CompetitorRun {
        id: entry.id(),
        tool: entry.tool.clone(),
        subject: entry.subject.clone(),
        examples: examples.len(),
        scores,
        samples_drawn,
        seed: config.seed,
        scoring_mode: ScoringMode::GoldenGrammar,
        wall_clock_ms: start.elapsed().as_millis(),
        inferred_rules: inferred.report.rules,
        required_f1: D5_REQUIRED_AVG_F1,
        gbnf_emitted,
    })
}
/// Samples both grammars with `config` and returns the saturating sum of the two sample counts.
///
/// # Errors
///
/// Propagates the first error returned by `crate::sample`; the inferred grammar is sampled first.
fn sample_count(
    inferred: &Grammar,
    golden: &Grammar,
    config: &SampleConfig,
) -> Result<usize, crate::EvalError> {
    let from_inferred = crate::sample(inferred, config)?;
    let from_golden = crate::sample(golden, config)?;
    Ok(from_inferred.len().saturating_add(from_golden.len()))
}
/// Builds a grammar with a single `Root` rule that accepts exactly the given examples.
///
/// Each non-empty example becomes a terminal and an empty example becomes
/// `GrammarExpr::Empty`; `finish_choice` then combines and deduplicates them.
fn exact_examples_grammar(examples: &[String]) -> Grammar {
    let mut alternatives = Vec::with_capacity(examples.len());
    for example in examples {
        alternatives.push(if example.is_empty() {
            GrammarExpr::Empty
        } else {
            GrammarExpr::Terminal(example.clone())
        });
    }
    let root = GrammarRule::new("Root", finish_choice(alternatives));
    Grammar::new()
        .with_source_format(GrammarFormat::Inferred)
        .with_rule(root)
        .with_start("Root")
}
/// Collapses a list of alternatives into one expression.
///
/// Alternatives are deduplicated by their `Debug` rendering (the first occurrence wins)
/// and ordered by that rendering. No alternatives yields `GrammarExpr::Empty`, one
/// yields that alternative itself, and more yield an unordered `Choice`.
fn finish_choice(alternatives: Vec<GrammarExpr>) -> GrammarExpr {
    let mut by_rendering = BTreeMap::<String, GrammarExpr>::new();
    for alternative in alternatives {
        let key = format!("{alternative:?}");
        by_rendering.entry(key).or_insert(alternative);
    }

    let mut distinct = by_rendering.into_values();
    match (distinct.next(), distinct.next()) {
        (None, _) => GrammarExpr::Empty,
        (Some(only), None) => only,
        (Some(first), Some(second)) => GrammarExpr::Choice {
            ordered: false,
            alternatives: std::iter::once(first)
                .chain(std::iter::once(second))
                .chain(distinct)
                .collect(),
        },
    }
}
fn load_examples(subject_dir: &Path, example_paths: &[String]) -> Result<Vec<String>, String> {
let mut files = Vec::new();
for relative in example_paths {
let path = subject_dir.join(relative);
if path.is_file() {
files.push(path);
} else if path.is_dir() {
collect_files(&path, &mut files)
.map_err(|error| format!("{}: {error}", path.display()))?;
} else {
return Err(format!("example path {} is missing", path.display()));
}
}
files.sort();
files.dedup();
files
.iter()
.map(|path| {
fs::read_to_string(path).map_err(|error| format!("{}: {error}", path.display()))
})
.collect()
}
/// Returns the number of files under `path` and the saturating sum of their sizes in bytes.
///
/// # Errors
///
/// Propagates any I/O error from walking the directory or reading file metadata.
fn count_subject_files(path: &Path) -> Result<(usize, u64), std::io::Error> {
    let mut files = Vec::new();
    collect_files(path, &mut files)?;
    let bytes = files.iter().try_fold(0u64, |total, file| {
        fs::metadata(file).map(|metadata| total.saturating_add(metadata.len()))
    })?;
    Ok((files.len(), bytes))
}
/// Recursively appends every file found under `path` to `files`.
///
/// Directories are descended into; entries that are neither a directory nor a file
/// are ignored. Entries are appended in the order `read_dir` yields them.
///
/// # Errors
///
/// Propagates the first I/O error from reading a directory or one of its entries.
fn collect_files(path: &Path, files: &mut Vec<PathBuf>) -> Result<(), std::io::Error> {
    fs::read_dir(path)?.try_for_each(|entry| -> Result<(), std::io::Error> {
        let child = entry?.path();
        if child.is_dir() {
            return collect_files(&child, files);
        }
        if child.is_file() {
            files.push(child);
        }
        Ok(())
    })
}
fn vendored_subjects(corpora_root: &Path) -> Result<BTreeSet<String>, BenchmarkError> {
let mut subjects = BTreeSet::new();
let tools = fs::read_dir(corpora_root).map_err(|source| BenchmarkError::Io {
path: corpora_root.to_path_buf(),
source,
})?;
for tool in tools {
let tool = tool.map_err(|source| BenchmarkError::Io {
path: corpora_root.to_path_buf(),
source,
})?;
let tool_path = tool.path();
if !tool_path.is_dir() {
continue;
}
let tool_name = tool.file_name().to_string_lossy().into_owned();
for subject in fs::read_dir(&tool_path).map_err(|source| BenchmarkError::Io {
path: tool_path.clone(),
source,
})? {
let subject = subject.map_err(|source| BenchmarkError::Io {
path: tool_path.clone(),
source,
})?;
let subject_path = subject.path();
if subject_path.is_dir() {
let subject_name = subject.file_name().to_string_lossy().into_owned();
subjects.insert(format!("{tool_name}/{subject_name}"));
}
}
}
Ok(subjects)
}
/// Builds the four secondary metric rows shown beneath the score table.
///
/// Only the "GBNF emit" row is computed, as the count of runs with `gbnf_emitted` set
/// over the number of runs. The other three rows carry fixed "n/a (pending …)" text.
fn secondary_rows(runs: &[CompetitorRun]) -> Vec<SecondaryMetricRow> {
    let emitted = runs.iter().filter(|run| run.gbnf_emitted).count();
    let total = runs.len();
    let rows: Vec<(&'static str, String)> = vec![
        (
            "format coverage",
            String::from("n/a (pending B*/C* cross-format coverage aggregation)"),
        ),
        (
            "round-trip fidelity",
            String::from("n/a (pending F2 fidelity matrix)"),
        ),
        (
            "GBNF emit",
            format!("{emitted}/{total} included grammars emitted non-empty GBNF"),
        ),
        (
            "cross-language translation",
            String::from("n/a (pending C6 full metric wiring)"),
        ),
    ];
    rows.into_iter()
        .map(|(metric, value)| SecondaryMetricRow { metric, value })
        .collect()
}