use crate::config::Config;
use crate::error::MinutesError;
use crate::pipeline::{
build_decode_hints, clean_transcript_line, normalize_space,
normalize_transcript_for_self_name_participant,
};
use crate::transcribe::{self, DecodeHints};
use crate::{ContentType, Result};
use chrono::Utc;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use std::process::Command;
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct DecodeHintEvalCase {
pub id: String,
pub audio_path: PathBuf,
#[serde(default)]
pub audio_start_secs: Option<f64>,
#[serde(default)]
pub audio_duration_secs: Option<f64>,
#[serde(default = "default_eval_content_type")]
pub content_type: ContentType,
#[serde(default)]
pub reference_text: String,
#[serde(default)]
pub reference_path: Option<PathBuf>,
#[serde(default)]
pub title: Option<String>,
#[serde(default)]
pub calendar_event_title: Option<String>,
#[serde(default)]
pub pre_context: Option<String>,
#[serde(default)]
pub extra_priority_hints: Vec<String>,
#[serde(default)]
pub extra_context_hints: Vec<String>,
#[serde(default)]
pub vocabulary_entries: Vec<crate::vocabulary::VocabularyEntry>,
#[serde(default)]
pub attendees: Vec<String>,
#[serde(default)]
pub identity_name: Option<String>,
#[serde(default)]
pub identity_aliases: Vec<String>,
#[serde(default)]
pub language: Option<String>,
#[serde(default)]
pub engine: Option<String>,
#[serde(default)]
pub parakeet_boost_score_override: Option<f32>,
#[serde(default)]
pub max_wer_regression: Option<f64>,
#[serde(default)]
pub require_hinted_terms: Vec<String>,
#[serde(default)]
pub forbid_hinted_terms: Vec<String>,
#[serde(default)]
pub allowed_failure_substrings: Vec<String>,
#[serde(default)]
pub disable_identity_hints: bool,
#[serde(default)]
pub disable_attendee_hints: bool,
#[serde(default)]
pub disable_context_hints: bool,
#[serde(default)]
pub disable_extra_priority_hints: bool,
#[serde(default)]
pub disable_extra_context_hints: bool,
#[serde(default)]
pub force_extra_context_hints_for_decode: bool,
}
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalOptions {
#[serde(default)]
pub engine_override: Option<String>,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalTranscriptMetrics {
pub wer: f64,
pub focus_hits: Vec<String>,
pub forbidden_hits: Vec<String>,
#[serde(default, skip_serializing_if = "String::is_empty")]
pub text: String,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalCaseResult {
pub id: String,
pub engine: String,
#[serde(default)]
pub hint_debug: DecodeHintEvalHintDebug,
pub baseline: DecodeHintEvalTranscriptMetrics,
pub candidate: DecodeHintEvalTranscriptMetrics,
pub delta_wer: f64,
pub max_wer_regression: Option<f64>,
pub required_terms: Vec<String>,
pub forbidden_terms: Vec<String>,
pub passed: bool,
#[serde(default = "default_case_status")]
pub status: String,
#[serde(default)]
pub failure_reasons: Vec<String>,
#[serde(default)]
pub allowed_failure_reasons: Vec<String>,
}
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalHintDebug {
#[serde(default)]
pub priority_phrases: Vec<String>,
#[serde(default)]
pub contextual_phrases: Vec<String>,
#[serde(default)]
pub whisper_prompt_phrases: Vec<String>,
#[serde(default)]
pub parakeet_boost_phrases: Vec<String>,
}
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalTotals {
pub cases_total: usize,
pub cases_passed: usize,
pub cases_failed: usize,
#[serde(default)]
pub cases_allowed_failures: usize,
pub improved_cases: usize,
pub regressed_cases: usize,
pub average_delta_wer: f64,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalReport {
pub generated_at: String,
pub corpus_path: PathBuf,
pub options: DecodeHintEvalOptions,
pub totals: DecodeHintEvalTotals,
pub cases: Vec<DecodeHintEvalCaseResult>,
pub failure_messages: Vec<String>,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalRequest {
pub command: String,
pub generated_at: String,
pub corpus_path: PathBuf,
pub output_root: PathBuf,
pub git_commit: Option<String>,
pub options: DecodeHintEvalOptions,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalArtifactPaths {
pub run_dir: PathBuf,
pub request_json: PathBuf,
pub results_json: PathBuf,
pub baseline_json: PathBuf,
pub candidate_json: PathBuf,
pub summary_md: PathBuf,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
struct DecodeHintEvalSidecarReport<'a> {
generated_at: &'a str,
corpus_path: &'a Path,
cases: Vec<DecodeHintEvalSidecarCase>,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
struct DecodeHintEvalSidecarCase {
id: String,
engine: String,
wer: f64,
focus_hits: Vec<String>,
forbidden_hits: Vec<String>,
#[serde(skip_serializing_if = "String::is_empty")]
text: String,
}
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalComparisonTotals {
pub shared_cases: usize,
pub added_cases: usize,
pub removed_cases: usize,
pub improved_cases: usize,
pub regressed_cases: usize,
pub newly_passing_cases: usize,
pub newly_failing_cases: usize,
pub unchanged_cases: usize,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalComparisonCase {
pub id: String,
pub status: String,
pub left_candidate_wer: Option<f64>,
pub right_candidate_wer: Option<f64>,
pub candidate_wer_delta: Option<f64>,
pub left_passed: Option<bool>,
pub right_passed: Option<bool>,
pub gained_focus_hits: Vec<String>,
pub lost_focus_hits: Vec<String>,
pub newly_missing_terms: Vec<String>,
pub resolved_failures: Vec<String>,
#[serde(default)]
pub newly_allowed_failures: Vec<String>,
#[serde(default)]
pub resolved_allowed_failures: Vec<String>,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalComparisonReport {
pub generated_at: String,
pub left_path: PathBuf,
pub right_path: PathBuf,
pub totals: DecodeHintEvalComparisonTotals,
pub cases: Vec<DecodeHintEvalComparisonCase>,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalComparisonRequest {
pub command: String,
pub generated_at: String,
pub left_path: PathBuf,
pub right_path: PathBuf,
pub output_root: PathBuf,
}
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintEvalComparisonArtifactPaths {
pub run_dir: PathBuf,
pub request_json: PathBuf,
pub comparison_json: PathBuf,
pub summary_md: PathBuf,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct DecodeHintRunIndexEntry {
pub kind: String,
pub run_dir: PathBuf,
pub generated_at: String,
pub status: String,
pub source_path: PathBuf,
pub cases_total: usize,
pub cases_failed: usize,
pub improved_cases: usize,
pub regressed_cases: usize,
pub newly_passing_cases: usize,
pub newly_failing_cases: usize,
pub summary_path: PathBuf,
}
pub fn default_research_root() -> PathBuf {
dirs::home_dir()
.unwrap_or_else(|| PathBuf::from("."))
.join(".minutes")
.join("research")
.join("decode-hints")
}
pub fn default_comparison_root() -> PathBuf {
dirs::home_dir()
.unwrap_or_else(|| PathBuf::from("."))
.join(".minutes")
.join("research")
.join("decode-hints-comparisons")
}
pub fn list_decode_hint_runs(limit: usize) -> Result<Vec<DecodeHintRunIndexEntry>> {
let entries = collect_decode_hint_runs(&default_research_root(), &default_comparison_root())?;
Ok(entries.into_iter().take(limit).collect())
}
pub fn run_decode_hint_eval_corpus(
corpus_path: &Path,
options: &DecodeHintEvalOptions,
) -> Result<DecodeHintEvalReport> {
let raw = fs::read_to_string(corpus_path)?;
let cases: Vec<DecodeHintEvalCase> = serde_json::from_str(&raw).map_err(invalid_data_error)?;
if cases.is_empty() {
return Err(invalid_input("decode-hint eval corpus is empty"));
}
let mut results = Vec::new();
let mut failure_messages = Vec::new();
let mut delta_sum = 0.0f64;
let mut improved_cases = 0usize;
let mut regressed_cases = 0usize;
let mut allowed_failure_cases = 0usize;
for case in cases {
let mut config = Config::default();
if let Some(language) = &case.language {
config.transcription.language = Some(language.clone());
}
if let Some(engine) = options
.engine_override
.as_ref()
.or(case.engine.as_ref())
.cloned()
{
config.transcription.engine = engine;
}
if let Some(boost_score) = case.parakeet_boost_score_override {
config.transcription.parakeet_boost_score = boost_score;
}
config.identity.name = case.identity_name.clone();
config.identity.aliases = case.identity_aliases.clone();
let reference = eval_text_for_compare(&load_reference_text(&case)?);
let hints = build_eval_case_hints(&case, &config)?;
let hint_debug = DecodeHintEvalHintDebug {
priority_phrases: hints.debug_priority_phrases(),
contextual_phrases: hints.debug_contextual_phrases(),
whisper_prompt_phrases: hints
.debug_priority_phrases()
.into_iter()
.chain(hints.debug_contextual_phrases())
.take(12)
.collect(),
parakeet_boost_phrases: effective_parakeet_boost_phrases(&config, &hints),
};
let baseline = transcribe_case(&case, &config, &DecodeHints::default())?;
let candidate = transcribe_case(&case, &config, &hints)?;
let baseline_text = eval_text_for_compare(&baseline.text);
let candidate_text = eval_text_for_compare(&candidate.text);
let baseline_wer = word_error_rate(&reference, &baseline_text);
let candidate_wer = word_error_rate(&reference, &candidate_text);
let delta_wer = candidate_wer - baseline_wer;
let baseline_focus_hits = present_terms(&baseline_text, &case.require_hinted_terms);
let candidate_focus_hits = present_terms(&candidate_text, &case.require_hinted_terms);
let baseline_forbidden_hits = present_terms(&baseline_text, &case.forbid_hinted_terms);
let candidate_forbidden_hits = present_terms(&candidate_text, &case.forbid_hinted_terms);
delta_sum += delta_wer;
if delta_wer < 0.0 {
improved_cases += 1;
} else if delta_wer > 0.0 {
regressed_cases += 1;
}
let mut case_failures = Vec::new();
if let Some(max_regression) = case.max_wer_regression {
if delta_wer > max_regression {
case_failures.push(format!(
"hinted WER regressed by {:.4} (> {:.4})",
delta_wer, max_regression
));
}
}
for term in &case.require_hinted_terms {
if !candidate_focus_hits
.iter()
.any(|hit| hit.eq_ignore_ascii_case(term))
{
case_failures.push(format!("missing required hinted term '{term}'"));
}
}
for term in &case.forbid_hinted_terms {
if candidate_forbidden_hits
.iter()
.any(|hit| hit.eq_ignore_ascii_case(term))
{
case_failures.push(format!("contains forbidden hinted term '{term}'"));
}
}
let (allowed_failure_reasons, blocking_failure_reasons): (Vec<_>, Vec<_>) =
case_failures.into_iter().partition(|reason| {
case.allowed_failure_substrings
.iter()
.any(|allowed| reason.contains(allowed))
});
for reason in &blocking_failure_reasons {
failure_messages.push(format!("{} {reason}", case.id));
}
let passed = blocking_failure_reasons.is_empty();
let status = if passed {
if allowed_failure_reasons.is_empty() {
"pass"
} else {
allowed_failure_cases += 1;
"allowed-failure"
}
} else {
"fail"
};
results.push(DecodeHintEvalCaseResult {
id: case.id,
engine: config.transcription.engine.clone(),
hint_debug,
baseline: DecodeHintEvalTranscriptMetrics {
wer: baseline_wer,
focus_hits: baseline_focus_hits,
forbidden_hits: baseline_forbidden_hits,
text: baseline_text,
},
candidate: DecodeHintEvalTranscriptMetrics {
wer: candidate_wer,
focus_hits: candidate_focus_hits,
forbidden_hits: candidate_forbidden_hits,
text: candidate_text,
},
delta_wer,
max_wer_regression: case.max_wer_regression,
required_terms: case.require_hinted_terms,
forbidden_terms: case.forbid_hinted_terms,
passed,
status: status.into(),
failure_reasons: blocking_failure_reasons,
allowed_failure_reasons,
});
}
let totals = DecodeHintEvalTotals {
cases_total: results.len(),
cases_passed: results.iter().filter(|case| case.passed).count(),
cases_failed: results.iter().filter(|case| !case.passed).count(),
cases_allowed_failures: allowed_failure_cases,
improved_cases,
regressed_cases,
average_delta_wer: if results.is_empty() {
0.0
} else {
delta_sum / results.len() as f64
},
};
Ok(DecodeHintEvalReport {
generated_at: Utc::now().to_rfc3339(),
corpus_path: corpus_path.to_path_buf(),
options: options.clone(),
totals,
cases: results,
failure_messages,
})
}
pub fn write_decode_hint_eval_artifacts(
request: &DecodeHintEvalRequest,
report: &DecodeHintEvalReport,
) -> Result<DecodeHintEvalArtifactPaths> {
let run_dir = request
.output_root
.join(Utc::now().format("%Y-%m-%dT%H-%M-%SZ").to_string());
fs::create_dir_all(&run_dir)?;
let request_json = run_dir.join("request.json");
let results_json = run_dir.join("results.json");
let baseline_json = run_dir.join("baseline.json");
let candidate_json = run_dir.join("candidate.json");
let summary_md = run_dir.join("summary.md");
fs::write(
&request_json,
serde_json::to_string_pretty(request).map_err(invalid_data_error)?,
)?;
fs::write(
&results_json,
serde_json::to_string_pretty(report).map_err(invalid_data_error)?,
)?;
let baseline = DecodeHintEvalSidecarReport {
generated_at: &report.generated_at,
corpus_path: &report.corpus_path,
cases: report
.cases
.iter()
.map(|case| DecodeHintEvalSidecarCase {
id: case.id.clone(),
engine: case.engine.clone(),
wer: case.baseline.wer,
focus_hits: case.baseline.focus_hits.clone(),
forbidden_hits: case.baseline.forbidden_hits.clone(),
text: case.baseline.text.clone(),
})
.collect(),
};
fs::write(
&baseline_json,
serde_json::to_string_pretty(&baseline).map_err(invalid_data_error)?,
)?;
let candidate = DecodeHintEvalSidecarReport {
generated_at: &report.generated_at,
corpus_path: &report.corpus_path,
cases: report
.cases
.iter()
.map(|case| DecodeHintEvalSidecarCase {
id: case.id.clone(),
engine: case.engine.clone(),
wer: case.candidate.wer,
focus_hits: case.candidate.focus_hits.clone(),
forbidden_hits: case.candidate.forbidden_hits.clone(),
text: case.candidate.text.clone(),
})
.collect(),
};
fs::write(
&candidate_json,
serde_json::to_string_pretty(&candidate).map_err(invalid_data_error)?,
)?;
fs::write(&summary_md, render_decode_hint_eval_summary(report))?;
Ok(DecodeHintEvalArtifactPaths {
run_dir,
request_json,
results_json,
baseline_json,
candidate_json,
summary_md,
})
}
pub fn load_decode_hint_eval_report(path: &Path) -> Result<DecodeHintEvalReport> {
let resolved = resolve_report_path(path);
let raw = fs::read_to_string(&resolved)?;
serde_json::from_str(&raw).map_err(invalid_data_error)
}
pub fn compare_decode_hint_eval_reports(
left_path: &Path,
right_path: &Path,
) -> Result<DecodeHintEvalComparisonReport> {
let left_resolved = resolve_report_path(left_path);
let right_resolved = resolve_report_path(right_path);
let left = load_decode_hint_eval_report(&left_resolved)?;
let right = load_decode_hint_eval_report(&right_resolved)?;
let left_cases: std::collections::BTreeMap<_, _> = left
.cases
.iter()
.map(|case| (case.id.clone(), case))
.collect();
let right_cases: std::collections::BTreeMap<_, _> = right
.cases
.iter()
.map(|case| (case.id.clone(), case))
.collect();
let mut ids = std::collections::BTreeSet::new();
ids.extend(left_cases.keys().cloned());
ids.extend(right_cases.keys().cloned());
let mut totals = DecodeHintEvalComparisonTotals::default();
let mut cases = Vec::new();
for id in ids {
let left_case = left_cases.get(&id).copied();
let right_case = right_cases.get(&id).copied();
let comparison = match (left_case, right_case) {
(Some(left_case), Some(right_case)) => {
totals.shared_cases += 1;
let delta = right_case.candidate.wer - left_case.candidate.wer;
if delta < 0.0 {
totals.improved_cases += 1;
} else if delta > 0.0 {
totals.regressed_cases += 1;
} else {
totals.unchanged_cases += 1;
}
if !left_case.passed && right_case.passed {
totals.newly_passing_cases += 1;
}
if left_case.passed && !right_case.passed {
totals.newly_failing_cases += 1;
}
let left_hits: std::collections::BTreeSet<_> =
left_case.candidate.focus_hits.iter().cloned().collect();
let right_hits: std::collections::BTreeSet<_> =
right_case.candidate.focus_hits.iter().cloned().collect();
let left_failures: std::collections::BTreeSet<_> =
left_case.failure_reasons.iter().cloned().collect();
let right_failures: std::collections::BTreeSet<_> =
right_case.failure_reasons.iter().cloned().collect();
let left_allowed_failures: std::collections::BTreeSet<_> =
left_case.allowed_failure_reasons.iter().cloned().collect();
let right_allowed_failures: std::collections::BTreeSet<_> =
right_case.allowed_failure_reasons.iter().cloned().collect();
DecodeHintEvalComparisonCase {
id,
status: "shared".into(),
left_candidate_wer: Some(left_case.candidate.wer),
right_candidate_wer: Some(right_case.candidate.wer),
candidate_wer_delta: Some(delta),
left_passed: Some(left_case.passed),
right_passed: Some(right_case.passed),
gained_focus_hits: right_hits.difference(&left_hits).cloned().collect(),
lost_focus_hits: left_hits.difference(&right_hits).cloned().collect(),
newly_missing_terms: right_failures
.difference(&left_failures)
.cloned()
.collect(),
resolved_failures: left_failures.difference(&right_failures).cloned().collect(),
newly_allowed_failures: right_allowed_failures
.difference(&left_allowed_failures)
.cloned()
.collect(),
resolved_allowed_failures: left_allowed_failures
.difference(&right_allowed_failures)
.cloned()
.collect(),
}
}
(None, Some(right_case)) => {
totals.added_cases += 1;
DecodeHintEvalComparisonCase {
id,
status: "added".into(),
left_candidate_wer: None,
right_candidate_wer: Some(right_case.candidate.wer),
candidate_wer_delta: None,
left_passed: None,
right_passed: Some(right_case.passed),
gained_focus_hits: right_case.candidate.focus_hits.clone(),
lost_focus_hits: Vec::new(),
newly_missing_terms: right_case.failure_reasons.clone(),
resolved_failures: Vec::new(),
newly_allowed_failures: right_case.allowed_failure_reasons.clone(),
resolved_allowed_failures: Vec::new(),
}
}
(Some(left_case), None) => {
totals.removed_cases += 1;
DecodeHintEvalComparisonCase {
id,
status: "removed".into(),
left_candidate_wer: Some(left_case.candidate.wer),
right_candidate_wer: None,
candidate_wer_delta: None,
left_passed: Some(left_case.passed),
right_passed: None,
gained_focus_hits: Vec::new(),
lost_focus_hits: left_case.candidate.focus_hits.clone(),
newly_missing_terms: Vec::new(),
resolved_failures: left_case.failure_reasons.clone(),
newly_allowed_failures: Vec::new(),
resolved_allowed_failures: left_case.allowed_failure_reasons.clone(),
}
}
(None, None) => continue,
};
cases.push(comparison);
}
Ok(DecodeHintEvalComparisonReport {
generated_at: Utc::now().to_rfc3339(),
left_path: left_resolved,
right_path: right_resolved,
totals,
cases,
})
}
pub fn write_decode_hint_eval_comparison_artifacts(
request: &DecodeHintEvalComparisonRequest,
report: &DecodeHintEvalComparisonReport,
) -> Result<DecodeHintEvalComparisonArtifactPaths> {
let run_dir = request
.output_root
.join(Utc::now().format("%Y-%m-%dT%H-%M-%SZ").to_string());
fs::create_dir_all(&run_dir)?;
let request_json = run_dir.join("request.json");
let comparison_json = run_dir.join("comparison.json");
let summary_md = run_dir.join("summary.md");
fs::write(
&request_json,
serde_json::to_string_pretty(request).map_err(invalid_data_error)?,
)?;
fs::write(
&comparison_json,
serde_json::to_string_pretty(report).map_err(invalid_data_error)?,
)?;
fs::write(
&summary_md,
render_decode_hint_eval_comparison_summary(report),
)?;
Ok(DecodeHintEvalComparisonArtifactPaths {
run_dir,
request_json,
comparison_json,
summary_md,
})
}
pub fn render_decode_hint_eval_summary(report: &DecodeHintEvalReport) -> String {
let verdict = if !report.failure_messages.is_empty() {
"FAIL"
} else if report.totals.cases_allowed_failures > 0 {
"PASS WITH ALLOWED FAILURES"
} else {
"PASS"
};
let mut lines = vec![
"# Decode Hint Eval Summary".to_string(),
String::new(),
format!("- Verdict: **{verdict}**"),
format!("- Corpus: `{}`", report.corpus_path.display()),
format!("- Generated at: `{}`", report.generated_at),
format!("- Cases: {}", report.totals.cases_total),
format!("- Passed: {}", report.totals.cases_passed),
format!("- Failed: {}", report.totals.cases_failed),
format!(
"- Allowed failures: {}",
report.totals.cases_allowed_failures
),
format!("- Improved cases: {}", report.totals.improved_cases),
format!("- Regressed cases: {}", report.totals.regressed_cases),
format!(
"- Average candidate-minus-baseline WER delta: `{:.4}`",
report.totals.average_delta_wer
),
String::new(),
"## Case results".to_string(),
String::new(),
];
for case in &report.cases {
lines.push(format!(
"- `{}`: {} [{}] (`{:.4}` -> `{:.4}`, delta `{:.4}`)",
case.id,
if case.passed { "pass" } else { "fail" },
case.status,
case.baseline.wer,
case.candidate.wer,
case.delta_wer
));
if !case.failure_reasons.is_empty() {
lines.push(format!(" reasons: {}", case.failure_reasons.join("; ")));
}
if !case.allowed_failure_reasons.is_empty() {
lines.push(format!(
" allowed failures: {}",
case.allowed_failure_reasons.join("; ")
));
}
}
if !report.failure_messages.is_empty() {
lines.push(String::new());
lines.push("## Failure messages".to_string());
lines.push(String::new());
for failure in &report.failure_messages {
lines.push(format!("- {failure}"));
}
}
lines.join("\n")
}
pub fn render_decode_hint_eval_comparison_summary(
report: &DecodeHintEvalComparisonReport,
) -> String {
let mut lines = vec![
"# Decode Hint Eval Comparison".to_string(),
String::new(),
format!("- Left: `{}`", report.left_path.display()),
format!("- Right: `{}`", report.right_path.display()),
format!("- Generated at: `{}`", report.generated_at),
format!("- Shared cases: {}", report.totals.shared_cases),
format!("- Added cases: {}", report.totals.added_cases),
format!("- Removed cases: {}", report.totals.removed_cases),
format!("- Improved cases: {}", report.totals.improved_cases),
format!("- Regressed cases: {}", report.totals.regressed_cases),
format!("- Newly passing: {}", report.totals.newly_passing_cases),
format!("- Newly failing: {}", report.totals.newly_failing_cases),
String::new(),
"## Case deltas".to_string(),
String::new(),
];
for case in &report.cases {
let headline = match (
case.left_candidate_wer,
case.right_candidate_wer,
case.candidate_wer_delta,
) {
(Some(left), Some(right), Some(delta)) => format!(
"- `{}` [{}]: candidate WER `{:.4}` -> `{:.4}` (delta `{:.4}`)",
case.id, case.status, left, right, delta
),
(None, Some(right), _) => format!(
"- `{}` [{}]: new case on right with candidate WER `{:.4}`",
case.id, case.status, right
),
(Some(left), None, _) => format!(
"- `{}` [{}]: removed case (left candidate WER `{:.4}`)",
case.id, case.status, left
),
_ => format!("- `{}` [{}]", case.id, case.status),
};
lines.push(headline);
if !case.gained_focus_hits.is_empty() {
lines.push(format!(
" gained hits: {}",
case.gained_focus_hits.join(", ")
));
}
if !case.lost_focus_hits.is_empty() {
lines.push(format!(" lost hits: {}", case.lost_focus_hits.join(", ")));
}
if !case.resolved_failures.is_empty() {
lines.push(format!(
" resolved failures: {}",
case.resolved_failures.join("; ")
));
}
if !case.newly_missing_terms.is_empty() {
lines.push(format!(
" new failures: {}",
case.newly_missing_terms.join("; ")
));
}
if !case.newly_allowed_failures.is_empty() {
lines.push(format!(
" new allowed failures: {}",
case.newly_allowed_failures.join("; ")
));
}
if !case.resolved_allowed_failures.is_empty() {
lines.push(format!(
" resolved allowed failures: {}",
case.resolved_allowed_failures.join("; ")
));
}
}
lines.join("\n")
}
fn default_eval_content_type() -> ContentType {
ContentType::Meeting
}
fn default_case_status() -> String {
"pass".into()
}
fn resolve_report_path(path: &Path) -> PathBuf {
if path.is_dir() {
path.join("results.json")
} else {
path.to_path_buf()
}
}
fn collect_decode_hint_runs(
eval_root: &Path,
comparison_root: &Path,
) -> Result<Vec<DecodeHintRunIndexEntry>> {
let mut runs = Vec::new();
runs.extend(collect_eval_runs(eval_root)?);
runs.extend(collect_comparison_runs(comparison_root)?);
runs.sort_by(|left, right| right.generated_at.cmp(&left.generated_at));
Ok(runs)
}
fn eval_run_status(report: &DecodeHintEvalReport) -> &'static str {
if !report.failure_messages.is_empty() {
"fail"
} else if report.totals.cases_allowed_failures > 0 {
"allowed-failure"
} else {
"pass"
}
}
fn comparison_run_status(report: &DecodeHintEvalComparisonReport) -> &'static str {
if report.totals.newly_failing_cases > 0 || report.totals.regressed_cases > 0 {
"mixed"
} else if report.cases.iter().any(|case| {
!case.newly_allowed_failures.is_empty() || !case.resolved_allowed_failures.is_empty()
}) {
"allowed-failure-changed"
} else {
"improved-or-stable"
}
}
fn collect_eval_runs(root: &Path) -> Result<Vec<DecodeHintRunIndexEntry>> {
let mut runs = Vec::new();
if !root.exists() {
return Ok(runs);
}
for entry in fs::read_dir(root)? {
let entry = entry?;
let path = entry.path();
if !entry.file_type()?.is_dir() {
continue;
}
if entry.file_name() == "clips" {
continue;
}
let results_path = path.join("results.json");
let summary_path = path.join("summary.md");
if !results_path.exists() {
continue;
}
let report = load_decode_hint_eval_report(&results_path)?;
let status = eval_run_status(&report).to_string();
runs.push(DecodeHintRunIndexEntry {
kind: "decode-hints".into(),
run_dir: path,
generated_at: report.generated_at,
status,
source_path: report.corpus_path,
cases_total: report.totals.cases_total,
cases_failed: report.totals.cases_failed,
improved_cases: report.totals.improved_cases,
regressed_cases: report.totals.regressed_cases,
newly_passing_cases: 0,
newly_failing_cases: 0,
summary_path,
});
}
Ok(runs)
}
fn collect_comparison_runs(root: &Path) -> Result<Vec<DecodeHintRunIndexEntry>> {
let mut runs = Vec::new();
if !root.exists() {
return Ok(runs);
}
for entry in fs::read_dir(root)? {
let entry = entry?;
let path = entry.path();
if !entry.file_type()?.is_dir() {
continue;
}
let comparison_path = path.join("comparison.json");
let summary_path = path.join("summary.md");
if !comparison_path.exists() {
continue;
}
let raw = fs::read_to_string(&comparison_path)?;
let report: DecodeHintEvalComparisonReport =
serde_json::from_str(&raw).map_err(invalid_data_error)?;
let status = comparison_run_status(&report).to_string();
runs.push(DecodeHintRunIndexEntry {
kind: "decode-hints-comparison".into(),
run_dir: path,
generated_at: report.generated_at,
status,
source_path: report.right_path,
cases_total: report.totals.shared_cases + report.totals.added_cases,
cases_failed: report.totals.newly_failing_cases + report.totals.regressed_cases,
improved_cases: report.totals.improved_cases,
regressed_cases: report.totals.regressed_cases,
newly_passing_cases: report.totals.newly_passing_cases,
newly_failing_cases: report.totals.newly_failing_cases,
summary_path,
});
}
Ok(runs)
}
fn invalid_input(message: &str) -> MinutesError {
MinutesError::Io(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
message.to_string(),
))
}
fn invalid_data_error(error: impl std::fmt::Display) -> MinutesError {
MinutesError::Io(std::io::Error::new(
std::io::ErrorKind::InvalidData,
error.to_string(),
))
}
fn build_eval_case_hints(case: &DecodeHintEvalCase, config: &Config) -> Result<DecodeHints> {
let identity = &config.identity;
let identity_for_hints = (!case.disable_identity_hints).then_some(identity);
let attendees = if case.disable_attendee_hints {
&[][..]
} else {
case.attendees.as_slice()
};
let title = (!case.disable_context_hints)
.then_some(case.title.as_deref())
.flatten();
let calendar_event_title = (!case.disable_context_hints)
.then_some(case.calendar_event_title.as_deref())
.flatten();
let pre_context = (!case.disable_context_hints)
.then_some(case.pre_context.as_deref())
.flatten();
let extra_priority_hints = if case.disable_extra_priority_hints {
&[][..]
} else {
case.extra_priority_hints.as_slice()
};
let allow_extra_context_hints = !case.disable_extra_context_hints
&& (config.transcription.engine != "parakeet" || case.force_extra_context_hints_for_decode);
let extra_context_hints = if !allow_extra_context_hints {
&[][..]
} else {
case.extra_context_hints.as_slice()
};
let vocabulary_store = if case.vocabulary_entries.is_empty() {
None
} else {
Some(
crate::vocabulary::VocabularyStore {
entries: case.vocabulary_entries.clone(),
}
.normalized()
.map_err(|error| {
invalid_input(&format!("{} invalid vocabulary entries: {error}", case.id))
})?,
)
};
Ok(build_decode_hints(
title,
calendar_event_title,
pre_context,
attendees,
identity_for_hints,
vocabulary_store.as_ref(),
)
.with_additional_candidates(extra_priority_hints, extra_context_hints))
}
fn transcribe_case(
case: &DecodeHintEvalCase,
config: &Config,
hints: &DecodeHints,
) -> Result<transcribe::TranscribeResult> {
let (_clip_dir, audio_path) = materialize_eval_audio_path(case)?;
let mut result = match case.content_type {
ContentType::Meeting => {
transcribe::transcribe_meeting_with_hints(&audio_path, config, hints)
.map_err(MinutesError::from)
}
_ => transcribe::transcribe_with_hints(&audio_path, config, hints)
.map_err(MinutesError::from),
}?;
if case.content_type == ContentType::Meeting && !hints.is_empty() {
result.text = normalize_transcript_for_self_name_participant(
&result.text,
&case.attendees,
&config.identity,
);
}
Ok(result)
}
fn materialize_eval_audio_path(
case: &DecodeHintEvalCase,
) -> Result<(Option<tempfile::TempDir>, PathBuf)> {
if case.audio_start_secs.is_none() && case.audio_duration_secs.is_none() {
return Ok((None, case.audio_path.clone()));
}
let start_secs = case.audio_start_secs.unwrap_or(0.0);
if !start_secs.is_finite() || start_secs < 0.0 {
return Err(invalid_input(&format!(
"{} audio_start_secs must be finite and non-negative",
case.id
)));
}
let Some(duration_secs) = case.audio_duration_secs else {
return Err(invalid_input(&format!(
"{} audio_duration_secs is required when clipping eval audio",
case.id
)));
};
if !duration_secs.is_finite() || duration_secs <= 0.0 {
return Err(invalid_input(&format!(
"{} audio_duration_secs must be finite and positive",
case.id
)));
}
let dir = tempfile::Builder::new()
.prefix("minutes-decode-hint-eval-")
.tempdir()?;
let output = dir.path().join("clip.wav");
let ffmpeg = Command::new("ffmpeg")
.arg("-hide_banner")
.arg("-loglevel")
.arg("error")
.arg("-y")
.arg("-ss")
.arg(format!("{start_secs:.3}"))
.arg("-t")
.arg(format!("{duration_secs:.3}"))
.arg("-i")
.arg(&case.audio_path)
.arg("-ac")
.arg("1")
.arg("-ar")
.arg("16000")
.arg(&output)
.output();
match ffmpeg {
Ok(result) if result.status.success() => Ok((Some(dir), output)),
Ok(result) => Err(MinutesError::Io(std::io::Error::other(format!(
"{} ffmpeg clip extraction failed: {}",
case.id,
String::from_utf8_lossy(&result.stderr).trim()
)))),
Err(error) => Err(MinutesError::Io(std::io::Error::new(
error.kind(),
format!("{} could not run ffmpeg for eval clip: {}", case.id, error),
))),
}
}
fn eval_text_for_compare(text: &str) -> String {
text.lines()
.filter_map(clean_transcript_line)
.map(|line| normalize_space(&line).to_lowercase())
.filter(|line| !line.is_empty())
.collect::<Vec<_>>()
.join(" ")
}
fn word_error_rate(reference: &str, hypothesis: &str) -> f64 {
let reference_words: Vec<&str> = reference.split_whitespace().collect();
let hypothesis_words: Vec<&str> = hypothesis.split_whitespace().collect();
if reference_words.is_empty() {
return if hypothesis_words.is_empty() {
0.0
} else {
1.0
};
}
let mut dp = vec![vec![0usize; hypothesis_words.len() + 1]; reference_words.len() + 1];
for (i, row) in dp.iter_mut().enumerate().take(reference_words.len() + 1) {
row[0] = i;
}
for (j, cell) in dp[0]
.iter_mut()
.enumerate()
.take(hypothesis_words.len() + 1)
{
*cell = j;
}
for i in 1..=reference_words.len() {
for j in 1..=hypothesis_words.len() {
let cost = usize::from(reference_words[i - 1] != hypothesis_words[j - 1]);
dp[i][j] = (dp[i - 1][j] + 1)
.min(dp[i][j - 1] + 1)
.min(dp[i - 1][j - 1] + cost);
}
}
dp[reference_words.len()][hypothesis_words.len()] as f64 / reference_words.len() as f64
}
fn present_terms(text: &str, terms: &[String]) -> Vec<String> {
let lower = text.to_lowercase();
terms
.iter()
.filter(|term| lower.contains(&term.to_lowercase()))
.cloned()
.collect()
}
fn load_reference_text(case: &DecodeHintEvalCase) -> Result<String> {
if !case.reference_text.trim().is_empty() {
return Ok(case.reference_text.clone());
}
let Some(path) = &case.reference_path else {
return Err(invalid_input(&format!(
"{} missing reference_text/reference_path",
case.id
)));
};
Ok(fs::read_to_string(path)?)
}
#[cfg(feature = "parakeet")]
fn effective_parakeet_boost_phrases(config: &Config, hints: &DecodeHints) -> Vec<String> {
transcribe::combined_parakeet_boost_phrases(config, hints)
}
#[cfg(not(feature = "parakeet"))]
fn effective_parakeet_boost_phrases(_config: &Config, _hints: &DecodeHints) -> Vec<String> {
Vec::new()
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
fn sample_report() -> DecodeHintEvalReport {
DecodeHintEvalReport {
generated_at: "2026-04-15T12:00:00Z".into(),
corpus_path: PathBuf::from("/tmp/corpus.json"),
options: DecodeHintEvalOptions::default(),
totals: DecodeHintEvalTotals {
cases_total: 1,
cases_passed: 0,
cases_failed: 1,
cases_allowed_failures: 0,
improved_cases: 0,
regressed_cases: 1,
average_delta_wer: 0.031,
},
cases: vec![DecodeHintEvalCaseResult {
id: "case-1".into(),
engine: "parakeet".into(),
hint_debug: DecodeHintEvalHintDebug {
priority_phrases: vec!["Alex Chen".into()],
contextual_phrases: vec!["X1 Planning".into()],
whisper_prompt_phrases: vec!["Alex Chen".into(), "X1 Planning".into()],
parakeet_boost_phrases: vec!["Alex Chen".into(), "X1 Planning".into()],
},
baseline: DecodeHintEvalTranscriptMetrics {
wer: 0.12,
focus_hits: vec!["alex chen".into()],
forbidden_hits: vec![],
text: "alex chen baseline".into(),
},
candidate: DecodeHintEvalTranscriptMetrics {
wer: 0.151,
focus_hits: vec!["alex chen".into()],
forbidden_hits: vec!["matt mullenweg".into()],
text: "alex chen matt mullenweg candidate".into(),
},
delta_wer: 0.031,
max_wer_regression: Some(0.02),
required_terms: vec!["alex chen".into()],
forbidden_terms: vec!["matt mullenweg".into()],
passed: false,
status: "fail".into(),
failure_reasons: vec![
"hinted WER regressed by 0.0310 (> 0.0200)".into(),
"contains forbidden hinted term 'matt mullenweg'".into(),
],
allowed_failure_reasons: vec![],
}],
failure_messages: vec![
"case-1 hinted WER regressed by 0.0310 (> 0.0200)".into(),
"case-1 contains forbidden hinted term 'matt mullenweg'".into(),
],
}
}
fn sample_allowed_failure_report() -> DecodeHintEvalReport {
DecodeHintEvalReport {
generated_at: "2026-04-15T12:00:00Z".into(),
corpus_path: PathBuf::from("/tmp/corpus.json"),
options: DecodeHintEvalOptions::default(),
totals: DecodeHintEvalTotals {
cases_total: 1,
cases_passed: 1,
cases_failed: 0,
cases_allowed_failures: 1,
improved_cases: 0,
regressed_cases: 0,
average_delta_wer: 0.0,
},
cases: vec![DecodeHintEvalCaseResult {
id: "research-case".into(),
engine: "parakeet".into(),
hint_debug: DecodeHintEvalHintDebug::default(),
baseline: DecodeHintEvalTranscriptMetrics {
wer: 0.12,
focus_hits: vec![],
forbidden_hits: vec![],
text: "baseline".into(),
},
candidate: DecodeHintEvalTranscriptMetrics {
wer: 0.12,
focus_hits: vec!["pdf toolkit".into()],
forbidden_hits: vec![],
text: "pdf toolkit candidate".into(),
},
delta_wer: 0.0,
max_wer_regression: Some(0.02),
required_terms: vec!["casey rowan".into()],
forbidden_terms: vec![],
passed: true,
status: "allowed-failure".into(),
failure_reasons: vec![],
allowed_failure_reasons: vec!["missing required hinted term 'casey rowan'".into()],
}],
failure_messages: vec![],
}
}
fn sample_eval_case() -> DecodeHintEvalCase {
DecodeHintEvalCase {
id: "case-1".into(),
audio_path: PathBuf::from("/tmp/audio.wav"),
audio_start_secs: None,
audio_duration_secs: None,
content_type: ContentType::Meeting,
reference_text: "reference".into(),
reference_path: None,
title: Some("X1 / Planning Review".into()),
calendar_event_title: Some("Mat with Alex Chen".into()),
pre_context: Some("Asana migration with Box".into()),
extra_priority_hints: vec!["Casey Rowan".into()],
extra_context_hints: vec!["Northstar Studio".into()],
vocabulary_entries: vec![],
attendees: vec!["mat@example.com".into(), "alex.chen@example.com".into()],
identity_name: Some("Mat".into()),
identity_aliases: vec!["Mathieu".into(), "Matthew".into()],
language: Some("en".into()),
engine: Some("whisper".into()),
parakeet_boost_score_override: None,
max_wer_regression: Some(0.02),
require_hinted_terms: vec!["mat".into()],
forbid_hinted_terms: vec!["matt mullenweg".into()],
allowed_failure_substrings: vec![],
disable_identity_hints: false,
disable_attendee_hints: false,
disable_context_hints: false,
disable_extra_priority_hints: false,
disable_extra_context_hints: false,
force_extra_context_hints_for_decode: false,
}
}
#[test]
fn render_summary_surfaces_failures() {
let summary = render_decode_hint_eval_summary(&sample_report());
assert!(summary.contains("Verdict: **FAIL**"));
assert!(summary.contains("case-1"));
assert!(summary.contains("matt mullenweg"));
}
#[test]
fn render_summary_surfaces_allowed_failure_verdict() {
let summary = render_decode_hint_eval_summary(&sample_allowed_failure_report());
assert!(summary.contains("Verdict: **PASS WITH ALLOWED FAILURES**"));
assert!(summary.contains("Allowed failures: 1"));
assert!(summary.contains("allowed failures: missing required hinted term 'casey rowan'"));
}
#[test]
fn build_eval_case_hints_respects_ablation_flags() {
let mut whisper_config = Config::default();
whisper_config.transcription.engine = "whisper".into();
whisper_config.identity.name = Some("Mat".into());
whisper_config.identity.aliases = vec!["Mathieu".into(), "Matthew".into()];
let full = build_eval_case_hints(&sample_eval_case(), &whisper_config).unwrap();
assert!(full.debug_priority_phrases().iter().any(|v| v == "Mat"));
assert!(full
.debug_priority_phrases()
.iter()
.any(|v| v == "Alex Chen"));
assert!(full
.debug_contextual_phrases()
.iter()
.any(|v| v == "Asana migration"));
assert!(full
.debug_priority_phrases()
.iter()
.any(|v| v == "Casey Rowan"));
assert!(full
.debug_contextual_phrases()
.iter()
.any(|v| v == "Northstar Studio"));
let mut ablated = sample_eval_case();
ablated.disable_identity_hints = true;
ablated.disable_attendee_hints = true;
ablated.disable_context_hints = true;
ablated.disable_extra_priority_hints = true;
ablated.disable_extra_context_hints = true;
let suppressed = build_eval_case_hints(&ablated, &whisper_config).unwrap();
assert!(suppressed.debug_priority_phrases().is_empty());
assert!(suppressed.debug_contextual_phrases().is_empty());
let mut parakeet_config = whisper_config.clone();
parakeet_config.transcription.engine = "parakeet".into();
let parakeet_default =
build_eval_case_hints(&sample_eval_case(), ¶keet_config).unwrap();
assert!(!parakeet_default
.debug_priority_phrases()
.iter()
.any(|v| v == "Northstar Studio"));
assert!(!parakeet_default
.debug_contextual_phrases()
.iter()
.any(|v| v == "Northstar Studio"));
let mut forced = sample_eval_case();
forced.force_extra_context_hints_for_decode = true;
let forced_hints = build_eval_case_hints(&forced, ¶keet_config).unwrap();
assert!(forced_hints
.debug_contextual_phrases()
.iter()
.any(|v| v == "Northstar Studio"));
let mut vocabulary_case = sample_eval_case();
vocabulary_case
.vocabulary_entries
.push(crate::vocabulary::VocabularyEntry {
kind: crate::vocabulary::VocabularyKind::Organization,
canonical: "Automattic".into(),
aliases: vec!["Automatic".into()],
priority: crate::vocabulary::VocabularyPriority::High,
..crate::vocabulary::VocabularyEntry::default()
});
let vocabulary_hints = build_eval_case_hints(&vocabulary_case, &whisper_config).unwrap();
assert!(vocabulary_hints
.debug_priority_phrases()
.iter()
.any(|v| v == "Automattic"));
}
#[test]
fn write_artifacts_creates_expected_files() {
let tmp = TempDir::new().unwrap();
let request = DecodeHintEvalRequest {
command: "minutes autoresearch decode-hints".into(),
generated_at: "2026-04-15T12:00:00Z".into(),
corpus_path: PathBuf::from("/tmp/corpus.json"),
output_root: tmp.path().to_path_buf(),
git_commit: Some("abc123".into()),
options: DecodeHintEvalOptions::default(),
};
let paths = write_decode_hint_eval_artifacts(&request, &sample_report()).unwrap();
assert!(paths.run_dir.exists());
assert!(paths.request_json.exists());
assert!(paths.results_json.exists());
assert!(paths.baseline_json.exists());
assert!(paths.candidate_json.exists());
assert!(paths.summary_md.exists());
}
#[test]
fn compare_reports_surfaces_improvements_and_new_failures() {
let left = sample_report();
let mut right = sample_report();
right.generated_at = "2026-04-16T12:00:00Z".into();
right.cases[0].candidate.wer = 0.10;
right.cases[0].candidate.focus_hits.push("mat".into());
right.cases[0].failure_reasons =
vec!["missing required hinted term 'pdf extension'".into()];
right.cases[0].allowed_failure_reasons =
vec!["missing required hinted term 'casey rowan'".into()];
right.failure_messages = right.cases[0]
.failure_reasons
.iter()
.map(|reason| format!("case-1 {reason}"))
.collect();
let tmp = TempDir::new().unwrap();
let left_path = tmp.path().join("left.json");
let right_path = tmp.path().join("right.json");
fs::write(&left_path, serde_json::to_string_pretty(&left).unwrap()).unwrap();
fs::write(&right_path, serde_json::to_string_pretty(&right).unwrap()).unwrap();
let comparison = compare_decode_hint_eval_reports(&left_path, &right_path).unwrap();
assert_eq!(comparison.totals.shared_cases, 1);
assert_eq!(comparison.totals.improved_cases, 1);
assert_eq!(
comparison.cases[0].gained_focus_hits,
vec!["mat".to_string()]
);
assert!(comparison.cases[0]
.newly_missing_terms
.iter()
.any(|reason| reason.contains("pdf extension")));
assert!(comparison.cases[0]
.resolved_failures
.iter()
.any(|reason| reason.contains("matt mullenweg")));
assert!(comparison.cases[0]
.newly_allowed_failures
.iter()
.any(|reason| reason.contains("casey rowan")));
}
#[test]
fn comparison_summary_surfaces_allowed_failure_transitions() {
let report = DecodeHintEvalComparisonReport {
generated_at: "2026-04-16T13:00:00Z".into(),
left_path: PathBuf::from("/tmp/left.json"),
right_path: PathBuf::from("/tmp/right.json"),
totals: DecodeHintEvalComparisonTotals {
shared_cases: 1,
added_cases: 0,
removed_cases: 0,
improved_cases: 0,
regressed_cases: 0,
newly_passing_cases: 0,
newly_failing_cases: 0,
unchanged_cases: 1,
},
cases: vec![DecodeHintEvalComparisonCase {
id: "external-proper-noun-research".into(),
status: "shared".into(),
left_candidate_wer: Some(0.10),
right_candidate_wer: Some(0.10),
candidate_wer_delta: Some(0.0),
left_passed: Some(true),
right_passed: Some(true),
gained_focus_hits: vec![],
lost_focus_hits: vec![],
newly_missing_terms: vec![],
resolved_failures: vec![],
newly_allowed_failures: vec!["missing required hinted term 'casey rowan'".into()],
resolved_allowed_failures: vec![
"missing required hinted term 'northstar studio'".into()
],
}],
};
let summary = render_decode_hint_eval_comparison_summary(&report);
assert!(
summary.contains("new allowed failures: missing required hinted term 'casey rowan'")
);
assert!(summary.contains(
"resolved allowed failures: missing required hinted term 'northstar studio'"
));
}
#[test]
fn collect_decode_hint_runs_lists_eval_and_comparison_runs() {
let tmp = TempDir::new().unwrap();
let eval_root = tmp.path().join("decode-hints");
let comparison_root = tmp.path().join("decode-hints-comparisons");
fs::create_dir_all(&eval_root).unwrap();
fs::create_dir_all(&comparison_root).unwrap();
let eval_dir = eval_root.join("2026-04-16T00-00-00Z");
fs::create_dir_all(&eval_dir).unwrap();
fs::write(
eval_dir.join("results.json"),
serde_json::to_string_pretty(&sample_report()).unwrap(),
)
.unwrap();
fs::write(eval_dir.join("summary.md"), "# eval").unwrap();
let comparison = DecodeHintEvalComparisonReport {
generated_at: "2026-04-16T13:00:00Z".into(),
left_path: PathBuf::from("/tmp/left.json"),
right_path: PathBuf::from("/tmp/right.json"),
totals: DecodeHintEvalComparisonTotals {
shared_cases: 1,
added_cases: 0,
removed_cases: 0,
improved_cases: 1,
regressed_cases: 0,
newly_passing_cases: 1,
newly_failing_cases: 0,
unchanged_cases: 0,
},
cases: vec![DecodeHintEvalComparisonCase {
id: "external-proper-noun-research".into(),
status: "shared".into(),
left_candidate_wer: Some(0.10),
right_candidate_wer: Some(0.10),
candidate_wer_delta: Some(0.0),
left_passed: Some(true),
right_passed: Some(true),
gained_focus_hits: vec![],
lost_focus_hits: vec![],
newly_missing_terms: vec![],
resolved_failures: vec![],
newly_allowed_failures: vec!["missing required hinted term 'casey rowan'".into()],
resolved_allowed_failures: vec![],
}],
};
let comparison_dir = comparison_root.join("2026-04-16T13-00-00Z");
fs::create_dir_all(&comparison_dir).unwrap();
fs::write(
comparison_dir.join("comparison.json"),
serde_json::to_string_pretty(&comparison).unwrap(),
)
.unwrap();
fs::write(comparison_dir.join("summary.md"), "# comparison").unwrap();
let runs = collect_decode_hint_runs(&eval_root, &comparison_root).unwrap();
assert_eq!(runs.len(), 2);
assert_eq!(runs[0].kind, "decode-hints-comparison");
assert_eq!(runs[0].status, "allowed-failure-changed");
assert_eq!(runs[0].source_path, PathBuf::from("/tmp/right.json"));
assert_eq!(runs[1].kind, "decode-hints");
assert_eq!(runs[1].status, "fail");
assert_eq!(runs[1].source_path, PathBuf::from("/tmp/corpus.json"));
}
#[test]
fn eval_run_status_distinguishes_allowed_failures_from_clean_passes() {
assert_eq!(eval_run_status(&sample_report()), "fail");
assert_eq!(
eval_run_status(&sample_allowed_failure_report()),
"allowed-failure"
);
let clean_pass = DecodeHintEvalReport {
generated_at: "2026-04-15T12:00:00Z".into(),
corpus_path: PathBuf::from("/tmp/corpus.json"),
options: DecodeHintEvalOptions::default(),
totals: DecodeHintEvalTotals {
cases_total: 1,
cases_passed: 1,
cases_failed: 0,
cases_allowed_failures: 0,
improved_cases: 1,
regressed_cases: 0,
average_delta_wer: -0.02,
},
cases: vec![],
failure_messages: vec![],
};
assert_eq!(eval_run_status(&clean_pass), "pass");
}
#[test]
fn checked_in_example_corpus_matches_supported_gate_shape() {
let fixture = include_str!("../../../tests/fixtures/proper-name-eval.example.json");
let cases: Vec<DecodeHintEvalCase> =
serde_json::from_str(fixture).expect("example corpus should parse");
assert_eq!(cases.len(), 3, "keep the public starter corpus intentional");
assert!(
cases.iter().any(|case| case.id == "self-intro-parakeet"),
"starter corpus should include a parakeet self-name case"
);
assert!(
cases.iter().any(|case| case.id == "self-intro-whisper"),
"starter corpus should include a whisper self-name case"
);
let research_case = cases
.iter()
.find(|case| case.id == "external-proper-noun-research")
.expect("starter corpus should include the external proper-noun research case");
assert!(
!research_case.allowed_failure_substrings.is_empty(),
"external proper-noun example should stay explicitly scoped as research"
);
}
}